Skip to content

Bug: Grouping by index and column fails on DataFrame with single index (GH14327) #14333

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,4 @@ Performance Improvements

Bug Fixes
~~~~~~~~~
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index (:issue:`14327`)
28 changes: 2 additions & 26 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2201,36 +2201,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
raise AssertionError('Level %s not in index' % str(level))
level = index.names.index(level)

inds = index.labels[level]
level_index = index.levels[level]

if self.name is None:
self.name = index.names[level]

# XXX complete hack

if grouper is not None:
level_values = index.levels[level].take(inds)
self.grouper = level_values.map(self.grouper)
else:
# all levels may not be observed
labels, uniques = algos.factorize(inds, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# handle NAs
mask = inds != -1
ok_labels, uniques = algos.factorize(inds[mask], sort=True)

labels = np.empty(len(inds), dtype=inds.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
level_index = level_index.take(uniques)
self.grouper, self._labels, self._group_index = \
index._get_grouper_for_level(self.grouper, level)

self._labels = labels
self._group_index = level_index
self.grouper = level_index.take(labels)
else:
if isinstance(self.grouper, (list, tuple)):
self.grouper = com._asarray_tuplesafe(self.grouper)
Expand Down
29 changes: 29 additions & 0 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,35 @@ def _update_inplace(self, result, **kwargs):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

def _get_grouper_for_level(self, group_mapper, level):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call this mapper

"""
Get index grouper corresponding to an index level

Parameters
----------
group_mapper : Group mapping function or None
Function mapping index values to groups
level : int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just make this ``level=None`` by default and assert that it is None for Index. Don't include the "(Only used by MultiIndex override)" phrase.

Index level (Only used by MultiIndex override)

Returns
-------
grouper : Index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it needs to always return a tuple (doesn't matter if they are used or not)

Index of values to group on
labels : None
Array of locations in level_index
(Only returned by MultiIndex override)
level_index : None
Index of unique values for level
(Only returned by MultiIndex override)
"""
if group_mapper is None:
grouper = self
else:
grouper = self.map(group_mapper)

return grouper, None, None

def is_(self, other):
"""
More flexible, faster check like ``is`` but that works through views
Expand Down
49 changes: 49 additions & 0 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,55 @@ def _format_native_types(self, na_rep='nan', **kwargs):

return mi.values

def _get_grouper_for_level(self, group_mapper, level):
"""
Get index grouper corresponding to an index level

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, same as above for styling.

Parameters
----------
group_mapper : Group mapping function or None
Function mapping index values to groups
level : int
Index level

Returns
-------
grouper : Index
Index of values to group on
labels : ndarray of int or None
Array of locations in level_index
level_index : Index or None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call this uniques (these are not actually used anywhere, but are descriptive). We use certain terms in the codebase. This will make it consistent.

Index of unique values for level
"""
inds = self.labels[level]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call this indexer

level_index = self.levels[level]

if group_mapper is not None:
# Handle group mapping function and return
level_values = self.levels[level].take(inds)
grouper = level_values.map(group_mapper)
return grouper, None, None

labels, uniques = algos.factorize(inds, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# Handle NAs
mask = inds != -1
ok_labels, uniques = algos.factorize(inds[mask],
sort=True)

labels = np.empty(len(inds), dtype=inds.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
# Remove unobserved levels from level_index
level_index = level_index.take(uniques)

grouper = level_index.take(labels)

return grouper, labels, level_index

@property
def _constructor(self):
return MultiIndex.from_tuples
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,39 @@ def test_grouper_creation_bug(self):
expected = s.groupby(level='one').sum()
assert_series_equal(result, expected)

def test_grouper_column_and_index(self):
# GH 14327

# Grouping a multi-index frame by a column and an index level should
# be equivalent to resetting the index and grouping by two columns
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one']},
index=idx)
result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you try with these reversed as well, e.g. ``[pd.Grouper(...), 'B']``?

expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
assert_frame_equal(result, expected)

# Test the reverse grouping order
result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
assert_frame_equal(result, expected)

# Grouping a single-index frame by a column and the index should
# be equivalent to resetting the index and grouping by two columns
df_single = df_multi.reset_index('outer')
result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
expected = df_single.reset_index().groupby(['B', 'inner']).mean()
assert_frame_equal(result, expected)

# Test the reverse grouping order
result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
expected = df_single.reset_index().groupby(['inner', 'B']).mean()
assert_frame_equal(result, expected)

def test_grouper_getting_correct_binner(self):

# GH 10063
Expand Down