diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0354a8046e873..a556d8707a21d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -79,3 +79,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index (:issue:`14327`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3c376e3188eac..5223c0ac270f3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2201,36 +2201,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, raise AssertionError('Level %s not in index' % str(level)) level = index.names.index(level) - inds = index.labels[level] - level_index = index.levels[level] - if self.name is None: self.name = index.names[level] - # XXX complete hack - - if grouper is not None: - level_values = index.levels[level].take(inds) - self.grouper = level_values.map(self.grouper) - else: - # all levels may not be observed - labels, uniques = algos.factorize(inds, sort=True) - - if len(uniques) > 0 and uniques[0] == -1: - # handle NAs - mask = inds != -1 - ok_labels, uniques = algos.factorize(inds[mask], sort=True) - - labels = np.empty(len(inds), dtype=inds.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 - - if len(uniques) < len(level_index): - level_index = level_index.take(uniques) + self.grouper, self._labels, self._group_index = \ + index._get_grouper_for_level(self.grouper, level) - self._labels = labels - self._group_index = level_index - self.grouper = level_index.take(labels) else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 557b9b2b17e95..aea50d490a9c9 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -432,6 +432,35 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin 
raise TypeError("Index can't be updated inplace") + def _get_grouper_for_level(self, group_mapper, level): + """ + Get index grouper corresponding to an index level + + Parameters + ---------- + group_mapper: Group mapping function or None + Function mapping index values to groups + level : int + Index level (Only used by MultiIndex override) + + Returns + ------- + grouper : Index + Index of values to group on + labels : None + Array of locations in level_index + (Only returned by MultiIndex override) + level_index : None + Index of unique values for level + (Only returned by MultiIndex override) + """ + if group_mapper is None: + grouper = self + else: + grouper = self.map(group_mapper) + + return grouper, None, None + def is_(self, other): """ More flexible, faster check like ``is`` but that works through views diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 1ab5dbb737739..d43034b0ebdd6 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -524,6 +524,55 @@ def _format_native_types(self, na_rep='nan', **kwargs): return mi.values + def _get_grouper_for_level(self, group_mapper, level): + """ + Get index grouper corresponding to an index level + + Parameters + ---------- + group_mapper: Group mapping function or None + Function mapping index values to groups + level : int + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + level_index : Index or None + Index of unique values for level + """ + inds = self.labels[level] + level_index = self.levels[level] + + if group_mapper is not None: + # Handle group mapping function and return + level_values = self.levels[level].take(inds) + grouper = level_values.map(group_mapper) + return grouper, None, None + + labels, uniques = algos.factorize(inds, sort=True) + + if len(uniques) > 0 and uniques[0] == -1: + # Handle NAs + mask = inds != -1 + ok_labels, uniques = 
algos.factorize(inds[mask], + sort=True) + + labels = np.empty(len(inds), dtype=inds.dtype) + labels[mask] = ok_labels + labels[~mask] = -1 + + if len(uniques) < len(level_index): + # Remove unobserved levels from level_index + level_index = level_index.take(uniques) + + grouper = level_index.take(labels) + + return grouper, labels, level_index + @property def _constructor(self): return MultiIndex.from_tuples diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 01c1d48c6d5c0..e787af5b7c322 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -458,6 +458,39 @@ def test_grouper_creation_bug(self): expected = s.groupby(level='one').sum() assert_series_equal(result, expected) + def test_grouper_column_and_index(self): + # GH 14327 + + # Grouping a multi-index frame by a column and an index level should + # be equivalent to resetting the index and grouping by two columns + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + + # Grouping a single-index frame by a column and the index should + # be equivalent to resetting the index and grouping by two columns + df_single = df_multi.reset_index('outer') + result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_single.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = 
df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_single.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063