Skip to content

Bug: Grouping by index and column fails on DataFrame with single index (GH14327) #14333

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,4 @@ Performance Improvements

Bug Fixes
~~~~~~~~~
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single-index frame by a column and the index (:issue:`14327`)
28 changes: 2 additions & 26 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2201,36 +2201,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
raise AssertionError('Level %s not in index' % str(level))
level = index.names.index(level)

inds = index.labels[level]
level_index = index.levels[level]

if self.name is None:
self.name = index.names[level]

# XXX complete hack

if grouper is not None:
level_values = index.levels[level].take(inds)
self.grouper = level_values.map(self.grouper)
else:
# all levels may not be observed
labels, uniques = algos.factorize(inds, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# handle NAs
mask = inds != -1
ok_labels, uniques = algos.factorize(inds[mask], sort=True)

labels = np.empty(len(inds), dtype=inds.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
level_index = level_index.take(uniques)
self.grouper, self._labels, self._group_index = \
index._get_grouper_for_level(self.grouper, level)

self._labels = labels
self._group_index = level_index
self.grouper = level_index.take(labels)
else:
if isinstance(self.grouper, (list, tuple)):
self.grouper = com._asarray_tuplesafe(self.grouper)
Expand Down
11 changes: 11 additions & 0 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,17 @@ def _update_inplace(self, result, **kwargs):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

def _get_grouper_for_level(self, grouper, level):
# Use self (Index) as grouper if None was passed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a Parameters section to the docstring, and move the inline comment into the Returns section?

if grouper is None:
grouper = self

# Return tuple of (grouper, labels, level_index)
# where labels and level_index are None for the Index
# implementation. The labels and level_index values
# are only calculated in the MultiIndex implementation
return grouper, None, None

def is_(self, other):
"""
More flexible, faster check like ``is`` but that works through views
Expand Down
33 changes: 33 additions & 0 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,39 @@ def _format_native_types(self, na_rep='nan', **kwargs):

return mi.values

def _get_grouper_for_level(self, grouper, level):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a docstring here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, same as above for styling.

inds = self.labels[level]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Call this `indexer` instead of `inds`.

level_index = self.levels[level]

# XXX complete hack

if grouper is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you could put a comment here explaining what is going on, that would be great (for future readers).

Further, you can just return early when grouper is not None (and then drop the else); I think that makes the code read slightly better.

level_values = self.levels[level].take(inds)
grouper = level_values.map(grouper)
labels = None
level_index = None
else:
# all levels may not be observed
labels, uniques = algos.factorize(inds, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# handle NAs
mask = inds != -1
ok_labels, uniques = algos.factorize(inds[mask],
sort=True)

labels = np.empty(len(inds), dtype=inds.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
level_index = level_index.take(uniques)

grouper = level_index.take(labels)

return grouper, labels, level_index

@property
def _constructor(self):
return MultiIndex.from_tuples
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,28 @@ def test_grouper_creation_bug(self):
expected = s.groupby(level='one').sum()
assert_series_equal(result, expected)

def test_grouper_column_and_index(self):
# GH 14327

# Grouping a multi-index frame by a column and an index level should
# be equivalent to resetting the index and grouping by two columns
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one']},
index=idx)
result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you try with these reversed as well, e.g. [pd.Grouper(...), 'B']?

expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
assert_frame_equal(result, expected)

# Grouping a single-index frame by a column and the index should
# be equivalent to resetting the index and grouping by two columns
df_single = df_multi.reset_index('outer')
result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

def test_grouper_getting_correct_binner(self):

# GH 10063
Expand Down