From d9579bcd6392f74bcea175d8bc6a52899cd97068 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 20:05:46 -0500 Subject: [PATCH 01/11] BUG: fix GH33548 --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/groupby/grouper.py | 12 +++++++++--- pandas/tests/resample/test_resampler_grouper.py | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 28f7df98cb86b..260e1622eed55 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -661,6 +661,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) - Bug in :meth:`DataFrameGroupBy.head`, :meth:`DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) - Bug in :meth:`DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) +- Bug in :meth:`DataFrame.groupby(...).resample(...)` using .agg with sum produced different result than just calling .sum (:issue:`33548`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e8af9da30a298..01ad52a46fc88 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -340,6 +340,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # Keep self.grouper value before overriding if self._grouper is None: self._grouper = self.grouper + self._indexer = self.indexer # the key must be a valid info item if self.key is not None: @@ -348,9 +349,14 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): - # pandas\core\groupby\grouper.py:348: error: Item "None" of - # "Optional[Any]" has no attribute "take" [union-attr] - ax = self._grouper.take(obj.index) # type: ignore[union-attr] + if self.indexer is not None: + reverse_indexer = self._indexer.argsort() + unsorted_ax = self._grouper.take(reverse_indexer) + ax = unsorted_ax.take(obj.index) + else: + # pandas\core\groupby\grouper.py:348: error: Item "None" of + # "Optional[Any]" has no attribute "take" [union-attr] + ax = self._grouper.take(obj.index) # type: ignore[union-attr] else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 15dd49f8bf182..075b13f1a0392 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -362,3 +362,19 @@ def test_apply_to_one_column_of_df(): tm.assert_series_equal(result, expected) result = df.resample("H").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) + + +def test_resample_groupby_agg(): + # GH: 33548 + df = pd.DataFrame({ + 'cat': ['cat_1', 'cat_1', 'cat_2', 'cat_1', 'cat_2', 'cat_1', 'cat_2', 'cat_1'], + 'num': [5, 20, 22, 3, 4, 30, 10, 50], + 'date': ['2019-2-1', '2018-02-03', '2020-3-11', '2019-2-2', '2019-2-2', '2018-12-4', '2020-3-11', '2020-12-12'] + }) + df['date'] = pd.to_datetime(df['date']) + + resampled = df.groupby('cat').resample('Y', on='date') + expected = resampled.sum() + result = resampled.agg({'num': 'sum'}) + + tm.assert_frame_equal(result, expected) From 922af24aee2813617e1e270856b0078febb08234 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 20:22:54 -0500 Subject: [PATCH 02/11] reformat test_resampler_grouper.by using black --- .../tests/resample/test_resampler_grouper.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 075b13f1a0392..da5bb0eb59f70 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -366,15 +366,35 @@ def test_apply_to_one_column_of_df(): def test_resample_groupby_agg(): # GH: 33548 - df = pd.DataFrame({ - 'cat': ['cat_1', 'cat_1', 'cat_2', 'cat_1', 'cat_2', 'cat_1', 'cat_2', 'cat_1'], - 'num': [5, 20, 22, 3, 4, 30, 10, 50], - 'date': ['2019-2-1', '2018-02-03', '2020-3-11', '2019-2-2', '2019-2-2', '2018-12-4', '2020-3-11', '2020-12-12'] - }) - df['date'] = pd.to_datetime(df['date']) - - resampled = df.groupby('cat').resample('Y', on='date') + df = DataFrame( + { + "cat": [ + "cat_1", + "cat_1", + "cat_2", + "cat_1", + "cat_2", + "cat_1", + "cat_2", + "cat_1", + ], + "num": [5, 20, 22, 3, 4, 30, 10, 50], + "date": [ + "2019-2-1", + "2018-02-03", + "2020-3-11", + "2019-2-2", + "2019-2-2", + "2018-12-4", + "2020-3-11", + "2020-12-12", + ], + } + ) + df["date"] = pd.to_datetime(df["date"]) + + resampled = df.groupby("cat").resample("Y", on="date") expected = resampled.sum() - result = resampled.agg({'num': 'sum'}) + result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) From 1634a281aa452bec82745696f1ce18f5988e2071 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 20:48:45 -0500 Subject: [PATCH 03/11] fix mypy failure --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 01ad52a46fc88..4fa9f02f904fe 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -350,8 +350,8 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): obj, ABCSeries ): if self.indexer is not None: - reverse_indexer = self._indexer.argsort() - unsorted_ax = self._grouper.take(reverse_indexer) + reverse_indexer = self._indexer.argsort() # type: ignore[union-attr] + unsorted_ax = self._grouper.take(reverse_indexer) # type: ignore[union-attr] ax = unsorted_ax.take(obj.index) else: # pandas\core\groupby\grouper.py:348: error: Item "None" of From 0e1806f3b11a5b3983a4370b1802e215b56c3e40 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 21:40:28 -0500 Subject: [PATCH 04/11] undo mypy ignore comments since it breaks flake8 --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 4fa9f02f904fe..01ad52a46fc88 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -350,8 +350,8 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): obj, ABCSeries ): if self.indexer is not None: - reverse_indexer = self._indexer.argsort() # type: ignore[union-attr] - unsorted_ax = self._grouper.take(reverse_indexer) # type: ignore[union-attr] + reverse_indexer = self._indexer.argsort() + unsorted_ax = self._grouper.take(reverse_indexer) ax = unsorted_ax.take(obj.index) else: # pandas\core\groupby\grouper.py:348: error: Item "None" of From 7134d7c3afaa40fcb26f60b900e7da37578cf5ff Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 22:48:03 -0500 Subject: [PATCH 05/11] make requested changes to whatsnew/v1.2.0.rst --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 260e1622eed55..b0dee5fe6abd2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -661,7 +661,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) - Bug in :meth:`DataFrameGroupBy.head`, :meth:`DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) - Bug in :meth:`DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) -- Bug in :meth:`DataFrame.groupby(...).resample(...)` using .agg with sum produced different result than just calling .sum (:issue:`33548`) +- Bug in :meth:`DataFrameGroupBy.resample` using ``.agg`` with sum produced different result than just calling ``.sum`` (:issue:`33548`) Reshaping ^^^^^^^^^ From 34e00be56ac0580bfcd18111858c447c16d3e21c Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Mon, 16 Nov 2020 22:49:06 -0500 Subject: [PATCH 06/11] fix mypy issues and add some comments --- pandas/core/groupby/grouper.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 01ad52a46fc88..7d54350946db3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -349,14 +349,19 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): + # Sometimes (when self.indexer is not None) self._grouper will be + # sorted while obj is not. In this case there is a mismatch when we + # call self._grouper.take(obj.index) so we need to undo the sorting + # before we call _grouper.take. if self.indexer is not None: + assert self._indexer is not None + assert self._grouper is not None reverse_indexer = self._indexer.argsort() unsorted_ax = self._grouper.take(reverse_indexer) ax = unsorted_ax.take(obj.index) else: - # pandas\core\groupby\grouper.py:348: error: Item "None" of - # "Optional[Any]" has no attribute "take" [union-attr] - ax = self._grouper.take(obj.index) # type: ignore[union-attr] + assert self._grouper is not None + ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") From 6ecbf4e403d5457a35097762940e018dd7c062aa Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Wed, 18 Nov 2020 15:40:19 -0500 Subject: [PATCH 07/11] add grouper_resorted attribute to TimeGrouper --- pandas/core/groupby/grouper.py | 7 ++++--- pandas/core/resample.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7d54350946db3..4edc3e8023849 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -349,11 +349,11 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): - # Sometimes (when self.indexer is not None) self._grouper will be - # sorted while obj is not. In this case there is a mismatch when we + # Sometimes self._grouper will have been resorted while + # obj has not. In this case there is a mismatch when we # call self._grouper.take(obj.index) so we need to undo the sorting # before we call _grouper.take. - if self.indexer is not None: + if self.grouper_resorted: assert self._indexer is not None assert self._grouper is not None reverse_indexer = self._indexer.argsort() @@ -388,6 +388,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis) + self.grouper_resorted = True self.obj = obj self.grouper = ax diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fccedd75c4531..a07ad924e8a1e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1404,6 +1404,8 @@ def __init__( self.fill_method = fill_method self.limit = limit + self.grouper_resorted = False + if origin in ("epoch", "start", "start_day"): self.origin = origin else: From 49063c9f781d3025b94ea13adfdb0a2ed38ad64f Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Wed, 18 Nov 2020 16:02:01 -0500 Subject: [PATCH 08/11] set self._grouper = None in Grouper init --- pandas/core/groupby/grouper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 4edc3e8023849..6bf57d6d565a2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -288,6 +288,7 @@ def __init__( self.indexer = None self.binner = None self._grouper = None + self._indexer = None self.dropna = dropna @property From ed8a5ca005bc20940ce4fb2d578332f60eed763d Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Wed, 18 Nov 2020 16:51:14 -0500 Subject: [PATCH 09/11] set grouper_resorted attribute on Grouper instead of TimeGrouper to fix mypy error --- pandas/core/groupby/grouper.py | 1 + pandas/core/resample.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6bf57d6d565a2..8459a4243b87c 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -287,6 +287,7 @@ def __init__( self.obj = None self.indexer = None self.binner = None + self.grouper_resorted = False self._grouper = None self._indexer = None self.dropna = dropna diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a07ad924e8a1e..fccedd75c4531 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1404,8 +1404,6 @@ def __init__( self.fill_method = fill_method self.limit = limit - self.grouper_resorted = False - if origin in ("epoch", "start", "start_day"): self.origin = origin else: From e66712b45aec8af0bc9922157f5be6492bd08e14 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Sun, 22 Nov 2020 12:51:03 -0500 Subject: [PATCH 10/11] get rid of grouper_resorted attribute and use _indexer is not None as condition instead --- pandas/core/groupby/grouper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8459a4243b87c..1cd910c26662d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -287,7 +287,6 @@ def __init__( self.obj = None self.indexer = None self.binner = None - self.grouper_resorted = False self._grouper = None self._indexer = None self.dropna = dropna @@ -355,7 +354,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # obj has not. In this case there is a mismatch when we # call self._grouper.take(obj.index) so we need to undo the sorting # before we call _grouper.take. - if self.grouper_resorted: + if self._indexer is not None: assert self._indexer is not None assert self._grouper is not None reverse_indexer = self._indexer.argsort() @@ -390,7 +389,6 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) obj = obj.take(indexer, axis=self.axis) - self.grouper_resorted = True self.obj = obj self.grouper = ax From 81bbc17732e6a2711bd16e2795d667f1b157ecb6 Mon Sep 17 00:00:00 2001 From: jalmaguer Date: Sun, 29 Nov 2020 18:27:38 -0500 Subject: [PATCH 11/11] remove unnecessary assert on self._indexer --- pandas/core/groupby/grouper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1cd910c26662d..4c0aa644f0b61 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -355,7 +355,6 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # call self._grouper.take(obj.index) so we need to undo the sorting # before we call _grouper.take. if self._indexer is not None: - assert self._indexer is not None assert self._grouper is not None reverse_indexer = self._indexer.argsort() unsorted_ax = self._grouper.take(reverse_indexer)