From e4d67354aa0cc0404da22d6d4c79b2c7d0b354c7 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 7 Sep 2020 21:23:52 +0200 Subject: [PATCH 1/9] Bug in groupby and resample miscalculatet aggregation or raised error if no RangeIndex was used --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/grouper.py | 14 +++++ .../tests/resample/test_resampler_grouper.py | 62 +++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b1229a5d5823d..767609eec80ac 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -314,7 +314,7 @@ Groupby/resample/rolling - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. (:issue:`9959`) - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) -- +- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and restricting to `Series` or using `agg` did miscalculate the aggregation (:issue:`27343`, :issue:`33548`, :issue:`35275`). Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 59ea7781025c4..0d2a744afa7da 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -338,6 +338,20 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): + # In some cases the self._grouper may be sorted differently than obj. + # self.obj has the right order with the old index in the first go around. + # We align the index from obj with the self.obj index to select the correct + # values. Additionally we have to sort obj.index correctly to aggregate + # correctly. + if not hasattr(self, "_index_mapping"): + self._index_mapping = DataFrame( + self.obj.index, columns=["index"] + ).sort_values(by="index") + obj.sort_index(inplace=True) + obj.index = self._index_mapping.loc[ + self._index_mapping["index"].isin(obj.index) + ].index + obj.sort_index(inplace=True) ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 73bf7dafac254..d95deaad2b36e 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -347,3 +347,65 @@ def test_median_duplicate_columns(): result = df.resample("5s").median() expected.columns = result.columns tm.assert_frame_equal(result, expected) + + +def test_resample_different_result_with_agg(): + data = pd.DataFrame( + { + "cat": ["cat1", "cat1", "cat2", "cat1", "cat2", "cat1", "cat2", "cat1"], + "num": [5, 20, 22, 3, 4, 30, 10, 50], + "date": [ + "2019-2-1", + "2018-02-03", + "2020-3-11", + "2019-2-2", + "2019-2-2", + "2018-12-4", + "2020-3-11", + "2020-12-12", + ], + } + ) + data["date"] = pd.to_datetime(data["date"]) + + resampled = data.groupby("cat").resample("Y", on="date") + + index = pd.MultiIndex.from_tuples( + [ + ("cat1", "2018-12-31"), + ("cat1", "2019-12-31"), + ("cat1", "2020-12-31"), + ("cat2", "2019-12-31"), + ("cat2", "2020-12-31"), + ], + names=["cat", "date"], + ) + index = index.set_levels([index.levels[0], pd.to_datetime(index.levels[1])]) + expected = DataFrame([25, 4, 50, 4, 16], columns=pd.Index(["num"]), index=index) + result = resampled.agg({"num": "mean"}) + tm.assert_frame_equal(result, expected) + result = resampled["num"].mean() + tm.assert_series_equal(result, expected["num"]) + result = resampled.mean() + tm.assert_frame_equal(result, expected) + + +def test_resample_agg_different_results_on_keyword(): + df = pd.DataFrame.from_records( + { + "ref": ["a", "a", "a", "b", "b"], + "time": [ + "2014-12-31", + "2015-12-31", + "2016-12-31", + "2012-12-31", + "2014-12-31", + ], + "value": 5 * [1], + } + ) + df["time"] = pd.to_datetime(df["time"]) + + expected = df.set_index("time").groupby("ref").resample(rule="M")["value"].sum() + result = df.groupby("ref").resample(rule="M", on="time")["value"].sum() + tm.assert_series_equal(result, expected) From bc127f233212c3d06335da277b435b2eca3de594 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 7 Sep 2020 21:32:30 +0200 Subject: [PATCH 2/9] Fix Pep8 issues --- pandas/core/groupby/grouper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 0d2a744afa7da..0f050a4c10893 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -339,10 +339,10 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): obj, ABCSeries ): # In some cases the self._grouper may be sorted differently than obj. - # self.obj has the right order with the old index in the first go around. - # We align the index from obj with the self.obj index to select the correct - # values. Additionally we have to sort obj.index correctly to aggregate - # correctly. + # self.obj has the right order with the old index in the first go + # around. We align the index from obj with the self.obj index to + # select the correct values. Additionally we have to sort + # obj.index correctly to aggregate correctly. if not hasattr(self, "_index_mapping"): self._index_mapping = DataFrame( self.obj.index, columns=["index"] From fcc11e7adb1a5e63d7a04bf23b9f55e629617018 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 7 Sep 2020 21:41:40 +0200 Subject: [PATCH 3/9] Add gh numbers to tests --- pandas/tests/resample/test_resampler_grouper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index d95deaad2b36e..e90052eb941c1 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -350,6 +350,7 @@ def test_median_duplicate_columns(): def test_resample_different_result_with_agg(): + # GH: 35275 and 33548 data = pd.DataFrame( { "cat": ["cat1", "cat1", "cat2", "cat1", "cat2", "cat1", "cat2", "cat1"], @@ -391,6 +392,7 @@ def test_resample_different_result_with_agg(): def test_resample_agg_different_results_on_keyword(): + # GH: 27343 df = pd.DataFrame.from_records( { "ref": ["a", "a", "a", "b", "b"], From d76f1c517f5e5c59e59432080358c020853ad967 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Sep 2020 16:18:28 +0200 Subject: [PATCH 4/9] Use take on indexer based on group indices --- pandas/core/groupby/grouper.py | 26 +++++++++++++++----------- pandas/core/resample.py | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 0f050a4c10893..959432bed7716 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -311,7 +311,7 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, _group_indices: Dict = None): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -330,6 +330,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # Keep self.grouper value before overriding if self._grouper is None: self._grouper = self.grouper + self._indexer = self.indexer # the key must be a valid info item if self.key is not None: @@ -343,16 +344,19 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # around. We align the index from obj with the self.obj index to # select the correct values. Additionally we have to sort # obj.index correctly to aggregate correctly. - if not hasattr(self, "_index_mapping"): - self._index_mapping = DataFrame( - self.obj.index, columns=["index"] - ).sort_values(by="index") - obj.sort_index(inplace=True) - obj.index = self._index_mapping.loc[ - self._index_mapping["index"].isin(obj.index) - ].index - obj.sort_index(inplace=True) - ax = self._grouper.take(obj.index) + # if not hasattr(self, "_index_mapping"): + # self._index_mapping = DataFrame( + # self.obj.index, columns=["index"] + # ).sort_values(by="index") + # from pandas.core.sorting import get_indexer_dict + # obj.sort_index(inplace=True) + indices = _group_indices.get(obj.name) + ax = self._grouper.take(self._indexer.argsort()).take(indices) + # obj.index = self._index_mapping.loc[ + # self._index_mapping["index"].isin(obj.index) + # ].index + # obj.sort_index(inplace=True) + # ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7b5154756e613..45e5c18bfcac2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -91,7 +91,7 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.grouper = None if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True) + self.groupby._set_grouper(self._convert_obj(obj), sort=True, _group_indices=kwargs.get("_group_indices")) def __str__(self) -> str: """ @@ -980,7 +980,7 @@ def _apply(self, f, grouper=None, *args, **kwargs): """ def func(x): - x = self._shallow_copy(x, groupby=self.groupby) + x = self._shallow_copy(x, groupby=self.groupby, _group_indices=self._groupby.indices) if isinstance(f, str): return getattr(x, f)(**kwargs) From d4d02d84aea5d2571712832bd8ba9b5593f5a472 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Sep 2020 18:38:03 +0200 Subject: [PATCH 5/9] Refactor code and parametrize test --- pandas/core/groupby/grouper.py | 27 +++++-------------- pandas/core/resample.py | 4 +-- .../tests/resample/test_resampler_grouper.py | 6 +++-- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 959432bed7716..f92fddf6b76b0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -311,7 +311,7 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, _group_indices: Dict = None): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, group_indices: Dict = None): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -327,7 +327,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, _group_indices: D if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") - # Keep self.grouper value before overriding + # Keep self.grouper and self.indexer value before overriding if self._grouper is None: self._grouper = self.grouper self._indexer = self.indexer @@ -339,24 +339,11 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, _group_indices: D if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): - # In some cases the self._grouper may be sorted differently than obj. - # self.obj has the right order with the old index in the first go - # around. We align the index from obj with the self.obj index to - # select the correct values. Additionally we have to sort - # obj.index correctly to aggregate correctly. - # if not hasattr(self, "_index_mapping"): - # self._index_mapping = DataFrame( - # self.obj.index, columns=["index"] - # ).sort_values(by="index") - # from pandas.core.sorting import get_indexer_dict - # obj.sort_index(inplace=True) - indices = _group_indices.get(obj.name) - ax = self._grouper.take(self._indexer.argsort()).take(indices) - # obj.index = self._index_mapping.loc[ - # self._index_mapping["index"].isin(obj.index) - # ].index - # obj.sort_index(inplace=True) - # ax = self._grouper.take(obj.index) + indices = group_indices.get(obj.name) + if self._indexer is not None: + ax = self._grouper.take(self._indexer.argsort()).take(indices) + else: + ax = self._grouper.take(indices) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 45e5c18bfcac2..95ab505a0c3b5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -91,7 +91,7 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.grouper = None if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True, _group_indices=kwargs.get("_group_indices")) + self.groupby._set_grouper(self._convert_obj(obj), sort=True, group_indices=kwargs.get("group_indices")) def __str__(self) -> str: """ @@ -980,7 +980,7 @@ def _apply(self, f, grouper=None, *args, **kwargs): """ def func(x): - x = self._shallow_copy(x, groupby=self.groupby, _group_indices=self._groupby.indices) + x = self._shallow_copy(x, groupby=self.groupby, group_indices=self._groupby.indices) if isinstance(f, str): return getattr(x, f)(**kwargs) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index e90052eb941c1..1f701cc6c298d 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -124,10 +124,12 @@ def test_getitem_multiple(): tm.assert_series_equal(result, expected) -def test_groupby_resample_on_api_with_getitem(): +@pytest.mark.parametrize("index_values", [[0, 1, 2, 3, 4], ["a", "b", "c", "d", "e"]]) +def test_groupby_resample_on_api_with_getitem(index_values): # GH 17813 df = pd.DataFrame( - {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} + {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1}, + index=pd.Index(index_values) ) exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() result = df.groupby("id").resample("2D", on="date")["data"].sum() From 087c6828a5c44a11f249782dba81f2bdce3e6d90 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Sep 2020 18:44:26 +0200 Subject: [PATCH 6/9] Run black pas --- pandas/core/groupby/grouper.py | 4 +++- pandas/core/resample.py | 10 ++++++++-- pandas/tests/resample/test_resampler_grouper.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f92fddf6b76b0..c6b9b21f762e2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -311,7 +311,9 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj: FrameOrSeries, sort: bool = False, group_indices: Dict = None): + def _set_grouper( + self, obj: FrameOrSeries, sort: bool = False, group_indices: Dict = None + ): """ given an object and the specifications, setup the internal grouper for this particular specification diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 95ab505a0c3b5..06e75e0d36a8f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -91,7 +91,11 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.grouper = None if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True, group_indices=kwargs.get("group_indices")) + self.groupby._set_grouper( + self._convert_obj(obj), + sort=True, + group_indices=kwargs.get("group_indices"), + ) def __str__(self) -> str: """ @@ -980,7 +984,9 @@ def _apply(self, f, grouper=None, *args, **kwargs): """ def func(x): - x = self._shallow_copy(x, groupby=self.groupby, group_indices=self._groupby.indices) + x = self._shallow_copy( + x, groupby=self.groupby, group_indices=self._groupby.indices + ) if isinstance(f, str): return getattr(x, f)(**kwargs) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 1f701cc6c298d..4e1d80fd8a880 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -129,7 +129,7 @@ def test_groupby_resample_on_api_with_getitem(index_values): # GH 17813 df = pd.DataFrame( {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1}, - index=pd.Index(index_values) + index=pd.Index(index_values), ) exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() result = df.groupby("id").resample("2D", on="date")["data"].sum() From 1e43de6151e1ccf9da993ee900fae2371b967860 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Sep 2020 19:11:33 +0200 Subject: [PATCH 7/9] Fix wrong Type hint --- pandas/core/groupby/grouper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c6b9b21f762e2..ce138a2a126ee 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -312,7 +312,10 @@ def _get_grouper(self, obj, validate: bool = True): return self.binner, self.grouper, self.obj def _set_grouper( - self, obj: FrameOrSeries, sort: bool = False, group_indices: Dict = None + self, + obj: FrameOrSeries, + sort: bool = False, + group_indices: Optional[Dict] = None, ): """ given an object and the specifications, setup the internal grouper From f08d56eee02dd46379cc9090dace63ae2c2cb48b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Sep 2020 20:50:21 +0200 Subject: [PATCH 8/9] Catch group_indices None case --- pandas/core/groupby/grouper.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ce138a2a126ee..169e51bfab2a9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -344,11 +344,14 @@ def _set_grouper( if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): - indices = group_indices.get(obj.name) - if self._indexer is not None: - ax = self._grouper.take(self._indexer.argsort()).take(indices) + if group_indices is None: + ax = self._grouper.take(obj.index) else: - ax = self._grouper.take(indices) + indices = group_indices.get(obj.name) + if self._indexer is not None: + ax = self._grouper.take(self._indexer.argsort()).take(indices) + else: + ax = self._grouper.take(indices) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") From 19569c4784f9271be9fb78e716496fe188b1c727 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 9 Sep 2020 09:42:11 +0200 Subject: [PATCH 9/9] Change whats new entry --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e82705c9a9273..1ddc46d4b0679 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -314,7 +314,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and restricting to `Series` or using `agg` did miscalculate the aggregation (:issue:`27343`, :issue:`33548`, :issue:`35275`). +- Bug in :meth:`DataFrame.groupby(...).resample(...)` when restricting to `Series` or using `agg` did miscalculate the aggregation (:issue:`27343`, :issue:`33548`, :issue:`35275`). - Reshaping