From 47a10571858816356f217d29b9e78b18a29f6f59 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 13 May 2019 22:14:29 +0530 Subject: [PATCH 01/12] BUG: ngroups and len(groups) do not equal when grouping with a list of Grouper and column label (GH26326) --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/groupby/ops.py | 5 ++++- pandas/tests/groupby/test_groupby.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index dacd433f112a5..de2de7e2e8985 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -406,7 +406,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - +- Bug in :meth:`pandas.core.groupby.ops.BaseGrouper.groups` in which a :class:`BaseGrouper` object with another :class:`BaseGrouper` as part of its :class:`Groupings` would return incorrect set of groups (:issue:`26326`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 235975a9b68a0..c7f735a7d007a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -258,7 +258,10 @@ def groups(self): if len(self.groupings) == 1: return self.groupings[0].groups else: - to_groupby = zip(*(ping.grouper for ping in self.groupings)) + to_groupby = zip(*(ping.grouper if not isinstance(ping.grouper, + self.__class__) + else ping.grouper.groupings[0].grouper for ping + in self.groupings)) to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 354242995d3c3..a2041f93a0ee1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1737,3 +1737,15 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): expected = pd.Series([3], index=ei) assert_series_equal(result, expected) + + +def test_groupby_groups_in_BaseGrouper(): + # https://github.com/pandas-dev/pandas/issues/26326 + m_index = pd.MultiIndex.from_product([['A', 'B'], + ['C', 'D']], names=['alpha', 'beta']) + df_sample = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, + index=m_index) + dfGBY_BaseGrouper = df_sample.groupby([pd.Grouper(level='alpha'), 'beta']) + dfGBY_noBaseGrouper = df_sample.groupby(['alpha', 'beta']) + + assert(dfGBY_BaseGrouper.groups == dfGBY_noBaseGrouper.groups) From cad3d6bb627b4c165e3c4a7e42f63e3fabb65185 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 13 May 2019 22:47:56 +0530 Subject: [PATCH 02/12] Sorted Imports in test_groupby.py --- pandas/tests/groupby/test_groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a2041f93a0ee1..ec13be7c3928b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,8 +10,7 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, - read_csv) + DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( From 0dba40d139f9ba16d1ad30c1e9b74e2bb2e52b6a Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 13 May 2019 23:16:56 +0530 Subject: [PATCH 03/12] Updated test and edited bugfix message --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/groupby/test_groupby.py | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index de2de7e2e8985..4a8d9ad6bfca4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -406,7 +406,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) -- Bug in :meth:`pandas.core.groupby.ops.BaseGrouper.groups` in which a :class:`BaseGrouper` object with another :class:`BaseGrouper` as part of its :class:`Groupings` would return incorrect set of groups (:issue:`26326`) +- Bug in :meth:`pandas.core.groupby.ops.BaseGrouper.groups` in which creating a :class:`GroupBy` object with a key of type :class:`Grouper` would result in producing incorrect groups (:issue:`26326`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ec13be7c3928b..32d9057dcf70d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1739,12 +1739,15 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): def test_groupby_groups_in_BaseGrouper(): - # https://github.com/pandas-dev/pandas/issues/26326 - m_index = pd.MultiIndex.from_product([['A', 'B'], - ['C', 'D']], names=['alpha', 'beta']) - df_sample = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, - index=m_index) - dfGBY_BaseGrouper = df_sample.groupby([pd.Grouper(level='alpha'), 'beta']) - dfGBY_noBaseGrouper = df_sample.groupby(['alpha', 'beta']) - - assert(dfGBY_BaseGrouper.groups == dfGBY_noBaseGrouper.groups) + # GH 26326 + mi = pd.MultiIndex.from_product([['A', 'B'], + ['C', 'D']], names=['alpha', 'beta']) + df = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, + index=mi) + grp1 = df.groupby([pd.Grouper(level='alpha'), 'beta']) + grp2 = df.groupby(['beta', pd.Grouper(level='alpha')]) + nbggrp1 = df.groupby(['alpha', 'beta']) + nbggrp2 = df.groupby(['beta', 'alpha']) + + assert(grp1.groups == nbggrp1.groups) + assert(grp2.groups == nbggrp2.groups) From 648ddfa8bb5672833c458ad09efb310710e26978 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Tue, 14 May 2019 17:27:52 +0530 Subject: [PATCH 04/12] Updated method to not check for subclasses --- pandas/core/groupby/ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c7f735a7d007a..a312c9905bee5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -258,8 +258,10 @@ def groups(self): if len(self.groupings) == 1: return self.groupings[0].groups else: - to_groupby = zip(*(ping.grouper if not isinstance(ping.grouper, - self.__class__) + def is_base_grouper(self, obj): + return obj.__class__.__name__ == self.__class__.__name__ + to_groupby = zip(*(ping.grouper if not is_base_grouper(self, + ping.grouper) else ping.grouper.groupings[0].grouper for ping in self.groupings)) to_groupby = Index(to_groupby) From f8a32201d4c9f5b5db6d43245db78d21e72d17d8 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Tue, 14 May 2019 20:45:02 +0530 Subject: [PATCH 05/12] fixed under indent on line 264 --- pandas/core/groupby/ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a312c9905bee5..0b31cae21afa6 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -258,10 +258,10 @@ def groups(self): if len(self.groupings) == 1: return self.groupings[0].groups else: - def is_base_grouper(self, obj): + def is_basegrouper(self, obj): return obj.__class__.__name__ == self.__class__.__name__ - to_groupby = zip(*(ping.grouper if not is_base_grouper(self, - ping.grouper) + to_groupby = zip(*(ping.grouper if not is_basegrouper(self, + ping.grouper) else ping.grouper.groupings[0].grouper for ping in self.groupings)) to_groupby = Index(to_groupby) From f8f3d8f8b943679e11116b5da9826c10aa4a2384 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Thu, 16 May 2019 21:42:52 +0530 Subject: [PATCH 06/12] patched solution with modification of BaseGrouper --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/grouper.py | 2 +- pandas/core/groupby/ops.py | 24 +++++++++++++++++++----- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 4d0df594c0f27..a7526fd22c6e7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -443,6 +443,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) +- Fixed bug where assigning a :class:`pandas.core.arrays.numpy_.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26326`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 63931dda6acb2..04d407ebc670d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -280,7 +280,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj - self.grouper = grouper + self.grouper = grouper._get_grouper() else: if self.grouper is None and self.name is not None: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0b31cae21afa6..a450d9e18b3bd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -150,6 +150,15 @@ def _get_splitter(self, data, axis=0): comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) + def _get_grouper(self): + """ + We are a grouper as part of another's groupings. + + We have a specific method of grouping, so cannot + convert to a Index for our grouper. + """ + return self.groupings[0].grouper + def _get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] @@ -260,10 +269,7 @@ def groups(self): else: def is_basegrouper(self, obj): return obj.__class__.__name__ == self.__class__.__name__ - to_groupby = zip(*(ping.grouper if not is_basegrouper(self, - ping.grouper) - else ping.grouper.groupings[0].grouper for ping - in self.groupings)) + to_groupby = zip(*(ping.grouper for ping in self.groupings)) to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) @@ -712,6 +718,15 @@ def groups(self): def nkeys(self): return 1 + def _get_grouper(self): + """ + We are a grouper as part of another's groupings. + + We have a specific method of grouping, so cannot + convert to a Index for our grouper. + """ + return self + def get_iterator(self, data, axis=0): """ Groupby iterator @@ -792,7 +807,6 @@ def agg_series(self, obj, func): grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() - def _get_axes(group): if isinstance(group, Series): return [group.index] From bfdea4cc31727a4d3b095964765ab49aa1c147be Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Thu, 16 May 2019 21:57:19 +0530 Subject: [PATCH 07/12] fixed whatsnew release note message --- doc/source/whatsnew/v0.25.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a7526fd22c6e7..564266b4b8690 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -443,8 +443,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) -- Fixed bug where assigning a :class:`pandas.core.arrays.numpy_.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26326`) - +- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups Reshaping ^^^^^^^^^ From 49615db1607bb2da478d58e9048e9b3f2fdf389f Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Thu, 16 May 2019 22:02:47 +0530 Subject: [PATCH 08/12] added issue number to message --- doc/source/whatsnew/v0.25.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 564266b4b8690..6ed4a834fe6c9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -443,7 +443,8 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) -- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups +- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups (:issue:`26326`) + Reshaping ^^^^^^^^^ From 050fb893e3cd46018bb25f092b84d202687aa009 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Thu, 16 May 2019 22:46:19 +0530 Subject: [PATCH 09/12] fixed linting --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a450d9e18b3bd..1b914daed7c88 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -807,6 +807,7 @@ def agg_series(self, obj, func): grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() + def _get_axes(group): if isinstance(group, Series): return [group.index] From af63e6e4a174e6dc3006f63f40fd79c1a1bc41e0 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 20 May 2019 03:20:57 +0530 Subject: [PATCH 10/12] fixed release note msg --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6ed4a834fe6c9..b775375978ae1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -443,7 +443,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) -- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups (:issue:`26326`) +- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) Reshaping ^^^^^^^^^ From fe0405a4ab74e992d709786701c3c995a57119df Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 20 May 2019 03:28:03 +0530 Subject: [PATCH 11/12] Added test message, refactor test code --- pandas/tests/groupby/test_groupby.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 32d9057dcf70d..c77437dac585d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1740,14 +1740,15 @@ def test_groupby_multiindex_series_keys_len_equal_group_axis(): def test_groupby_groups_in_BaseGrouper(): # GH 26326 + # Test if DataFrame grouped with a pandas.Grouper has correct groups mi = pd.MultiIndex.from_product([['A', 'B'], ['C', 'D']], names=['alpha', 'beta']) df = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, index=mi) - grp1 = df.groupby([pd.Grouper(level='alpha'), 'beta']) - grp2 = df.groupby(['beta', pd.Grouper(level='alpha')]) - nbggrp1 = df.groupby(['alpha', 'beta']) - nbggrp2 = df.groupby(['beta', 'alpha']) + result = df.groupby([pd.Grouper(level='alpha'), 'beta']) + expected = df.groupby(['alpha', 'beta']) + assert(result.groups == expected.groups) - assert(grp1.groups == nbggrp1.groups) - assert(grp2.groups == nbggrp2.groups) + result = df.groupby(['beta', pd.Grouper(level='alpha')]) + expected = df.groupby(['beta', 'alpha']) + assert(result.groups == expected.groups) From ee4dffe3b9e37c6e007707f254323f5accd11fb6 Mon Sep 17 00:00:00 2001 From: Shantanu Gontia Date: Mon, 20 May 2019 03:30:02 +0530 Subject: [PATCH 12/12] remove extraneous function --- pandas/core/groupby/ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1b914daed7c88..34ce690004b1a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -267,8 +267,6 @@ def groups(self): if len(self.groupings) == 1: return self.groupings[0].groups else: - def is_basegrouper(self, obj): - return obj.__class__.__name__ == self.__class__.__name__ to_groupby = zip(*(ping.grouper for ping in self.groupings)) to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby)