From a421a52d5eac98343b4f94925f0fdf36cf9ad17f Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 1 Oct 2016 20:59:37 -0400 Subject: [PATCH 1/9] Added failing test case of GH 14327 --- pandas/tests/test_groupby.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 01c1d48c6d5c0..b3b381b064e34 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -458,6 +458,28 @@ def test_grouper_creation_bug(self): expected = s.groupby(level='one').sum() assert_series_equal(result, expected) + def test_grouper_column_and_index(self): + # GH 14327 + + # Grouping a multi-index frame by a column and an index level should + # be equivalent to resetting the index and grouping by two columns + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Grouping a single-index frame by a column and the index should + # be equivalent to resetting the index and grouping by two columns + df_single = df_multi.reset_index('outer') + result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063 From ec9340f79d5f13982a50f72a5fab44b9839c5faf Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 1 Oct 2016 21:05:41 -0400 Subject: [PATCH 2/9] Handle specification of level for non-MultiIndex in Grouping constructor Existing logic under "if level is not None:" assumed that index was a MultiIndex. Now we check and also handle the case where an Index is passed in with a None grouper. This resolves GH 14327 --- pandas/core/groupby.py | 53 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 3c376e3188eac..feee0d73e05df 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2201,36 +2201,45 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, raise AssertionError('Level %s not in index' % str(level)) level = index.names.index(level) - inds = index.labels[level] - level_index = index.levels[level] - if self.name is None: self.name = index.names[level] - # XXX complete hack + if isinstance(index, MultiIndex): + inds = index.labels[level] + level_index = index.levels[level] - if grouper is not None: - level_values = index.levels[level].take(inds) - self.grouper = level_values.map(self.grouper) - else: - # all levels may not be observed - labels, uniques = algos.factorize(inds, sort=True) + # XXX complete hack + + if grouper is not None: + level_values = index.levels[level].take(inds) + self.grouper = level_values.map(self.grouper) + else: + # all levels may not be observed + labels, uniques = algos.factorize(inds, sort=True) - if len(uniques) > 0 and uniques[0] == -1: - # handle NAs - mask = inds != -1 - ok_labels, uniques = algos.factorize(inds[mask], sort=True) + if len(uniques) > 0 and uniques[0] == -1: + # handle NAs + mask = inds != -1 + ok_labels, uniques = algos.factorize(inds[mask], + sort=True) - labels = np.empty(len(inds), dtype=inds.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 + labels = np.empty(len(inds), dtype=inds.dtype) + labels[mask] = ok_labels + labels[~mask] = -1 - if len(uniques) < len(level_index): - level_index = level_index.take(uniques) + if len(uniques) < len(level_index): + level_index = level_index.take(uniques) + + self._labels = labels + self._group_index = level_index + self.grouper = level_index.take(labels) + + # Single level index passed + else: + # Use single level index as grouper if none passed + if grouper is None: + self.grouper = index - self._labels = labels - self._group_index = level_index - self.grouper = level_index.take(labels) else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) From 848c9bb9015b880656c885f0d320c0d05291d73c Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 1 Oct 2016 21:26:06 -0400 Subject: [PATCH 3/9] Release notes for fix to GH 14327 --- doc/source/whatsnew/v0.19.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index a066641b36746..f2bdf10bf2360 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1584,3 +1584,4 @@ Bug Fixes - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if grouped dataframe contains empty bins (:issue:`13629`) - Bug in ``Index.copy()`` where ``name`` parameter was ignored (:issue:`14302`) +- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index (:issue`14327`) From 0f95bca34747105b41bd495a0ecca7ec63ac9ac5 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Mon, 3 Oct 2016 15:29:03 -0400 Subject: [PATCH 4/9] Moved whatsnew to 0.20.0 --- doc/source/whatsnew/v0.19.0.txt | 1 - doc/source/whatsnew/v0.20.0.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index f2bdf10bf2360..a066641b36746 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1584,4 +1584,3 @@ Bug Fixes - ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) - Bug in ``df.groupby`` where ``.median()`` returns arbitrary values if grouped dataframe contains empty bins (:issue:`13629`) - Bug in ``Index.copy()`` where ``name`` parameter was ignored (:issue:`14302`) -- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index (:issue`14327`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0354a8046e873..a556d8707a21d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -79,3 +79,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index (:issue`14327`) From 75a0390fbd11ac8ddc5de676a1a3151086544a8f Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 5 Oct 2016 07:55:11 -0400 Subject: [PATCH 5/9] Moved grouper level handling logic to methods on Index/MultiIndex (GH14327) --- pandas/core/groupby.py | 37 +------------------------------------ pandas/indexes/base.py | 7 +++++++ pandas/indexes/multi.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index feee0d73e05df..cc9c89702a0f1 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2204,42 +2204,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if self.name is None: self.name = index.names[level] - if isinstance(index, MultiIndex): - inds = index.labels[level] - level_index = index.levels[level] - - # XXX complete hack - - if grouper is not None: - level_values = index.levels[level].take(inds) - self.grouper = level_values.map(self.grouper) - else: - # all levels may not be observed - labels, uniques = algos.factorize(inds, sort=True) - - if len(uniques) > 0 and uniques[0] == -1: - # handle NAs - mask = inds != -1 - ok_labels, uniques = algos.factorize(inds[mask], - sort=True) - - labels = np.empty(len(inds), dtype=inds.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 - - if len(uniques) < len(level_index): - level_index = level_index.take(uniques) - - self._labels = labels - self._group_index = level_index - self.grouper = level_index.take(labels) - - # Single level index passed - else: - # Use single level index as grouper if none passed - if grouper is None: - self.grouper = index - + self.grouper, self._labels, self._group_index = index._get_grouper_for_level(self.grouper, level) else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 557b9b2b17e95..e8452fc9cabfd 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -432,6 +432,13 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") + def _get_grouper_for_level(self, grouper, level): + # return grouper if grouper is not None else self + if grouper is None: + grouper = self + + return grouper, None, None + def is_(self, other): """ More flexible, faster check like ``is`` but that works through views diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 1ab5dbb737739..4619a6193daa4 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -524,6 +524,39 @@ def _format_native_types(self, na_rep='nan', **kwargs): return mi.values + def _get_grouper_for_level(self, grouper, level): + + inds = self.labels[level] + level_index = self.levels[level] + + # XXX complete hack + + if grouper is not None: + level_values = self.levels[level].take(inds) + grouper = level_values.map(grouper) + labels = None + level_index = None + else: + # all levels may not be observed + labels, uniques = algos.factorize(inds, sort=True) + + if len(uniques) > 0 and uniques[0] == -1: + # handle NAs + mask = inds != -1 + ok_labels, uniques = algos.factorize(inds[mask], + sort=True) + + labels = np.empty(len(inds), dtype=inds.dtype) + labels[mask] = ok_labels + labels[~mask] = -1 + + if len(uniques) < len(level_index): + level_index = level_index.take(uniques) + + grouper = level_index.take(labels) + + return grouper, labels, level_index + @property def _constructor(self): return MultiIndex.from_tuples From 6b37bd4aea68816fe720eac2d116df9dd6e07220 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 5 Oct 2016 19:38:55 -0400 Subject: [PATCH 6/9] Improve comments for Index._get_grouper_for_level --- pandas/indexes/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index e8452fc9cabfd..0e624b4c700b8 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -433,10 +433,14 @@ def _update_inplace(self, result, **kwargs): raise TypeError("Index can't be updated inplace") def _get_grouper_for_level(self, grouper, level): - # return grouper if grouper is not None else self + # Use self (Index) as grouper if None was passed if grouper is None: grouper = self + # Return tuple of (grouper, labels, level_index) + # where labels and level_index are None for the Index + # implementation. The labels and level_index values + # are only calculated in the MultiIndex implementation return grouper, None, None def is_(self, other): From 897ec1c5d33b933e063b3aace33654b4f0e68bf4 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Wed, 5 Oct 2016 19:43:23 -0400 Subject: [PATCH 7/9] Wrap line violating PEP8 --- pandas/core/groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index cc9c89702a0f1..5223c0ac270f3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2204,7 +2204,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = index._get_grouper_for_level(self.grouper, level) + self.grouper, self._labels, self._group_index = \ + index._get_grouper_for_level(self.grouper, level) + else: if isinstance(self.grouper, (list, tuple)): self.grouper = com._asarray_tuplesafe(self.grouper) From 05e655746ffd31317931fe5174feac9a2e4c95f8 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Thu, 6 Oct 2016 07:44:57 -0400 Subject: [PATCH 8/9] Added test cases that group by column then index --- pandas/tests/test_groupby.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index b3b381b064e34..e787af5b7c322 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -474,10 +474,21 @@ def test_grouper_column_and_index(self): expected = df_multi.reset_index().groupby(['B', 'inner']).mean() assert_frame_equal(result, expected) + # Test the reverse grouping order + result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + assert_frame_equal(result, expected) + # Grouping a single-index frame by a column and the index should # be equivalent to resetting the index and grouping by two columns df_single = df_multi.reset_index('outer') result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() + expected = df_single.reset_index().groupby(['B', 'inner']).mean() + assert_frame_equal(result, expected) + + # Test the reverse grouping order + result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() + expected = df_single.reset_index().groupby(['inner', 'B']).mean() assert_frame_equal(result, expected) def test_grouper_getting_correct_binner(self): From 33eb725c39e38b89c46042848dce703de78a6b52 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Thu, 6 Oct 2016 14:37:35 -0400 Subject: [PATCH 9/9] Cleaned up _get_grouper_for_level implementations and added docstrings --- pandas/indexes/base.py | 32 ++++++++++++++++++----- pandas/indexes/multi.py | 58 ++++++++++++++++++++++++++--------------- 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 0e624b4c700b8..aea50d490a9c9 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -432,15 +432,33 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def _get_grouper_for_level(self, grouper, level): - # Use self (Index) as grouper if None was passed - if grouper is None: + def _get_grouper_for_level(self, group_mapper, level): + """ + Get index grouper corresponding to an index level + + Parameters + ---------- + group_mapper: Group mapping function or None + Function mapping index values to groups + level : int + Index level (Only used by MultiIndex override) + + Returns + ------- + grouper : Index + Index of values to group on + labels : None + Array of locations in level_index + (Only returned by MultiIndex override) + level_index : None + Index of unique values for level + (Only returned by MultiIndex override) + """ + if group_mapper is None: grouper = self + else: + grouper = self.map(group_mapper) - # Return tuple of (grouper, labels, level_index) - # where labels and level_index are None for the Index - # implementation. The labels and level_index values - # are only calculated in the MultiIndex implementation return grouper, None, None def is_(self, other): diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 4619a6193daa4..d43034b0ebdd6 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -524,36 +524,52 @@ def _format_native_types(self, na_rep='nan', **kwargs): return mi.values - def _get_grouper_for_level(self, grouper, level): + def _get_grouper_for_level(self, group_mapper, level): + """ + Get index grouper corresponding to an index level + Parameters + ---------- + group_mapper: Group mapping function or None + Function mapping index values to groups + level : int + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + level_index : Index or None + Index of unique values for level + """ inds = self.labels[level] level_index = self.levels[level] - # XXX complete hack - - if grouper is not None: + if group_mapper is not None: + # Handle group mapping function and return level_values = self.levels[level].take(inds) - grouper = level_values.map(grouper) - labels = None - level_index = None - else: - # all levels may not be observed - labels, uniques = algos.factorize(inds, sort=True) + grouper = level_values.map(group_mapper) + return grouper, None, None + + labels, uniques = algos.factorize(inds, sort=True) - if len(uniques) > 0 and uniques[0] == -1: - # handle NAs - mask = inds != -1 - ok_labels, uniques = algos.factorize(inds[mask], - sort=True) + if len(uniques) > 0 and uniques[0] == -1: + # Handle NAs + mask = inds != -1 + ok_labels, uniques = algos.factorize(inds[mask], + sort=True) - labels = np.empty(len(inds), dtype=inds.dtype) - labels[mask] = ok_labels - labels[~mask] = -1 + labels = np.empty(len(inds), dtype=inds.dtype) + labels[mask] = ok_labels + labels[~mask] = -1 - if len(uniques) < len(level_index): - level_index = level_index.take(uniques) + if len(uniques) < len(level_index): + # Remove unobserved levels from level_index + level_index = level_index.take(uniques) - grouper = level_index.take(labels) + grouper = level_index.take(labels) return grouper, labels, level_index