Skip to content

Bug: Grouping by index and column fails on DataFrame with single index (GH14327) #14428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 15, 2016
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ Bug Fixes
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
- Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`)
- Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
28 changes: 2 additions & 26 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2201,36 +2201,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
raise AssertionError('Level %s not in index' % str(level))
level = index.names.index(level)

inds = index.labels[level]
level_index = index.levels[level]

if self.name is None:
self.name = index.names[level]

# XXX complete hack

if grouper is not None:
level_values = index.levels[level].take(inds)
self.grouper = level_values.map(self.grouper)
else:
# all levels may not be observed
labels, uniques = algos.factorize(inds, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# handle NAs
mask = inds != -1
ok_labels, uniques = algos.factorize(inds[mask], sort=True)

labels = np.empty(len(inds), dtype=inds.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
level_index = level_index.take(uniques)
self.grouper, self._labels, self._group_index = \
index._get_grouper_for_level(self.grouper, level)

self._labels = labels
self._group_index = level_index
self.grouper = level_index.take(labels)
else:
if isinstance(self.grouper, (list, tuple)):
self.grouper = com._asarray_tuplesafe(self.grouper)
Expand Down
30 changes: 30 additions & 0 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,36 @@ def _update_inplace(self, result, **kwargs):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

# Shared docstring text for ``_get_grouper_for_level``. It is stored under
# its own key in ``_index_shared_docs`` so that implementations of the
# method can attach it with ``@Appender(...)`` instead of repeating it.
# The documented contract is a 3-tuple ``(grouper, labels, uniques)`` where
# ``labels`` and ``uniques`` may be ``None``.
_index_shared_docs['_get_grouper_for_level'] = """
Get index grouper corresponding to an index level

Parameters
----------
mapper: Group mapping function or None
Function mapping index values to groups
level : int or None
Index level

Returns
-------
grouper : Index
Index of values to group on
labels : ndarray of int or None
Array of locations in level_index
uniques : Index or None
Index of unique values for level
"""

@Appender(_index_shared_docs['_get_grouper_for_level'])
def _get_grouper_for_level(self, mapper, level=None):
assert level is None or level == 0
if mapper is None:
grouper = self
else:
grouper = self.map(mapper)

return grouper, None, None

def is_(self, other):
"""
More flexible, faster check like ``is`` but that works through views
Expand Down
31 changes: 31 additions & 0 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,37 @@ def _format_native_types(self, na_rep='nan', **kwargs):

return mi.values

@Appender(_index_shared_docs['_get_grouper_for_level'])
def _get_grouper_for_level(self, mapper, level):
indexer = self.labels[level]
level_index = self.levels[level]

if mapper is not None:
# Handle group mapping function and return
level_values = self.levels[level].take(indexer)
grouper = level_values.map(mapper)
return grouper, None, None

labels, uniques = algos.factorize(indexer, sort=True)

if len(uniques) > 0 and uniques[0] == -1:
# Handle NAs
mask = indexer != -1
ok_labels, uniques = algos.factorize(indexer[mask],
sort=True)

labels = np.empty(len(indexer), dtype=indexer.dtype)
labels[mask] = ok_labels
labels[~mask] = -1

if len(uniques) < len(level_index):
# Remove unobserved levels from level_index
level_index = level_index.take(uniques)

grouper = level_index.take(labels)

return grouper, labels, level_index

@property
def _constructor(self):
return MultiIndex.from_tuples
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,39 @@ def test_grouper_creation_bug(self):
expected = s.groupby(level='one').sum()
assert_series_equal(result, expected)

def test_grouper_column_and_index(self):
    # GH 14327

    # Grouping by a column together with an index level should match
    # resetting the index and grouping by the two resulting columns. This
    # is checked for a multi-index frame and for a single-index frame, in
    # both key orders.
    idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
                                     ('b', 1), ('b', 2), ('b', 3)])
    idx.names = ['outer', 'inner']
    df_multi = pd.DataFrame({"A": np.arange(6),
                             'B': ['one', 'one', 'two',
                                   'two', 'one', 'one']},
                            index=idx)

    # Moving 'outer' into the columns leaves 'inner' as the only index
    # level.
    df_single = df_multi.reset_index('outer')

    for frame in (df_multi, df_single):
        # Column first, then the reverse grouping order.
        for index_level_first in (False, True):
            level_key = pd.Grouper(level='inner')
            if index_level_first:
                mixed_keys = [level_key, 'B']
                column_keys = ['inner', 'B']
            else:
                mixed_keys = ['B', level_key]
                column_keys = ['B', 'inner']

            result = frame.groupby(mixed_keys).mean()
            expected = frame.reset_index().groupby(column_keys).mean()
            assert_frame_equal(result, expected)

def test_grouper_getting_correct_binner(self):

# GH 10063
Expand Down