diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9a778acba4764..1ddc46d4b0679 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -314,6 +314,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) +- Bug in :meth:`DataFrame.groupby(...).resample(...)` where restricting the result to a ``Series`` or using ``agg`` miscalculated the aggregation (:issue:`27343`, :issue:`33548`, :issue:`35275`) - Reshaping diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 59ea7781025c4..169e51bfab2a9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -311,7 +311,12 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): + def _set_grouper( + self, + obj: FrameOrSeries, + sort: bool = False, + group_indices: Optional[Dict] = None, + ): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -327,9 +332,10 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") - # Keep self.grouper value before overriding + # Keep self.grouper and self.indexer value before overriding if self._grouper is None: self._grouper = self.grouper + self._indexer = self.indexer # the key must be a valid info item if self.key is not None: @@ -338,7 +344,14 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): 
if getattr(self.grouper, "name", None) == key and isinstance( obj, ABCSeries ): - ax = self._grouper.take(obj.index) + if group_indices is None: + ax = self._grouper.take(obj.index) + else: + indices = group_indices.get(obj.name) + if self._indexer is not None: + ax = self._grouper.take(self._indexer.argsort()).take(indices) + else: + ax = self._grouper.take(indices) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7b5154756e613..06e75e0d36a8f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -91,7 +91,11 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.grouper = None if self.groupby is not None: - self.groupby._set_grouper(self._convert_obj(obj), sort=True) + self.groupby._set_grouper( + self._convert_obj(obj), + sort=True, + group_indices=kwargs.get("group_indices"), + ) def __str__(self) -> str: """ @@ -980,7 +984,9 @@ def _apply(self, f, grouper=None, *args, **kwargs): """ def func(x): - x = self._shallow_copy(x, groupby=self.groupby) + x = self._shallow_copy( + x, groupby=self.groupby, group_indices=self._groupby.indices + ) if isinstance(f, str): return getattr(x, f)(**kwargs) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 73bf7dafac254..4e1d80fd8a880 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -124,10 +124,12 @@ def test_getitem_multiple(): tm.assert_series_equal(result, expected) -def test_groupby_resample_on_api_with_getitem(): +@pytest.mark.parametrize("index_values", [[0, 1, 2, 3, 4], ["a", "b", "c", "d", "e"]]) +def test_groupby_resample_on_api_with_getitem(index_values): # GH 17813 df = pd.DataFrame( - {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} + {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 
1}, + index=pd.Index(index_values), ) exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() result = df.groupby("id").resample("2D", on="date")["data"].sum() @@ -347,3 +349,67 @@ def test_median_duplicate_columns(): result = df.resample("5s").median() expected.columns = result.columns tm.assert_frame_equal(result, expected) + + +def test_resample_different_result_with_agg(): + # GH: 35275 and 33548 + data = pd.DataFrame( + { + "cat": ["cat1", "cat1", "cat2", "cat1", "cat2", "cat1", "cat2", "cat1"], + "num": [5, 20, 22, 3, 4, 30, 10, 50], + "date": [ + "2019-2-1", + "2018-02-03", + "2020-3-11", + "2019-2-2", + "2019-2-2", + "2018-12-4", + "2020-3-11", + "2020-12-12", + ], + } + ) + data["date"] = pd.to_datetime(data["date"]) + + resampled = data.groupby("cat").resample("Y", on="date") + + index = pd.MultiIndex.from_tuples( + [ + ("cat1", "2018-12-31"), + ("cat1", "2019-12-31"), + ("cat1", "2020-12-31"), + ("cat2", "2019-12-31"), + ("cat2", "2020-12-31"), + ], + names=["cat", "date"], + ) + index = index.set_levels([index.levels[0], pd.to_datetime(index.levels[1])]) + expected = DataFrame([25, 4, 50, 4, 16], columns=pd.Index(["num"]), index=index) + result = resampled.agg({"num": "mean"}) + tm.assert_frame_equal(result, expected) + result = resampled["num"].mean() + tm.assert_series_equal(result, expected["num"]) + result = resampled.mean() + tm.assert_frame_equal(result, expected) + + +def test_resample_agg_different_results_on_keyword(): + # GH: 27343 + df = pd.DataFrame.from_records( + { + "ref": ["a", "a", "a", "b", "b"], + "time": [ + "2014-12-31", + "2015-12-31", + "2016-12-31", + "2012-12-31", + "2014-12-31", + ], + "value": 5 * [1], + } + ) + df["time"] = pd.to_datetime(df["time"]) + + expected = df.set_index("time").groupby("ref").resample(rule="M")["value"].sum() + result = df.groupby("ref").resample(rule="M", on="time")["value"].sum() + tm.assert_series_equal(result, expected)