From cd6e84775a6aab65628e184ea15c2c45c3f7134e Mon Sep 17 00:00:00 2001 From: Richard Date: Fri, 14 Aug 2020 11:44:45 -0400 Subject: [PATCH 1/2] REGR: Using agg with non-aggregating functions --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/groupby/generic.py | 24 +++++++++++-------- pandas/core/groupby/groupby.py | 10 ++++---- .../tests/groupby/aggregate/test_aggregate.py | 13 ++++++++++ 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index b37103910afab..81057297cbb71 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b7280a9f7db3c..9d636ad4bc32f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -322,11 +322,13 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - output = self._wrap_aggregated_output(results) + output = self._wrap_aggregated_output(results, index=None) return self.obj._constructor_expanddim(output, columns=columns) def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -335,7 +337,7 @@ def _wrap_series_output( ---------- output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. - index : pd.Index + index : pd.Index or None Index to apply to the output. Returns @@ -363,8 +365,11 @@ def _wrap_series_output( return result + # TODO: Remove index argument, use self.grouper.result_index, see GH 35490 def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -383,9 +388,7 @@ def _wrap_aggregated_output( In the vast majority of cases output will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - result = self._wrap_series_output( - output=output, index=self.grouper.result_index - ) + result = self._wrap_series_output(output=output, index=index) return self._reindex_output(result) def _wrap_transformed_output( @@ -1720,7 +1723,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1745,8 +1750,7 @@ def _wrap_aggregated_output( self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: - index = self.grouper.result_index - result.index = index + result.index = self.grouper.result_index if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4597afeeaddbf..0047877ef78ee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -974,7 +974,9 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index] + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): @@ -1049,7 +1051,7 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs @@ -1102,7 +1104,7 @@ def _python_agg_general( output[key] = maybe_cast_result(values[mask], result) - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -2534,7 +2536,7 @@ def _get_cythonized_result( raise TypeError(error_msg) if aggregate: - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) else: return self._wrap_transformed_output(output) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 40a20c8210052..7ddc7213f78a4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1061,3 +1061,16 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +# TODO: agg should raise +def test_nonagg_agg(): + # GH 35490 - Single/Multiple agg of non-agg function give same results + df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) + g = df.groupby("a") + + result = g.agg(["cumsum"]) + result.columns = result.columns.droplevel(-1) + expected = g.agg("cumsum") + + tm.assert_frame_equal(result, expected) From a4538649e155498ee6c9fca2838256bfe2fce111 Mon Sep 17 00:00:00 2001 From: Richard Date: Fri, 14 Aug 2020 11:55:49 -0400 Subject: [PATCH 2/2] Improved TODOs --- pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d636ad4bc32f..b806d9856d20f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -325,6 +325,7 @@ def _aggregate_multiple_funcs(self, arg): output = self._wrap_aggregated_output(results, index=None) return self.obj._constructor_expanddim(output, columns=columns) + # TODO: index should not be Optional - see GH 35490 def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 7ddc7213f78a4..ce9d4b892d775 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,9 +1063,9 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) -# TODO: agg should raise def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results + # TODO: agg should raise for functions that don't aggregate df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) g = df.groupby("a")