From ea7d644668f629c165032bd5da23fc1cc5e48063 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Apr 2023 19:22:04 +0200 Subject: [PATCH 1/2] REGR: SeriesGroupBy.agg with multiple categoricals, as_index=False, and a list fails (#52850) (cherry picked from commit 2ac0da4e129c78b41a1bf9c8c4a5b751c0bae408) --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/groupby/generic.py | 4 +- pandas/tests/groupby/test_categorical.py | 60 ++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index d340b034b6c2c..7768900a0f82e 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_values` not resetting index when :class:`DataFrame` is already sorted and ``ignore_index=True`` (:issue:`52553`) - Fixed regression in :meth:`MultiIndex.isin` raising ``TypeError`` for ``Generator`` (:issue:`52568`) - Fixed regression in :meth:`Series.describe` showing ``RuntimeWarning`` for extension dtype :class:`Series` with one element (:issue:`52515`) +- Fixed regression in :meth:`SeriesGroupBy.agg` failing when grouping with categorical data, multiple groupings, ``as_index=False``, and a list of aggregations (:issue:`52760`) .. --------------------------------------------------------------------------- .. _whatsnew_201.bug_fixes: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 39cf1172403b4..d11a00972b16b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -241,8 +241,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) assert columns is not None # for mypy ret.columns = columns if not self.as_index: - ret = self._insert_inaxis_grouper(ret) - ret.index = default_index(len(ret)) + ret = ret.reset_index() return ret else: @@ -328,7 +327,6 @@ def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame: output = self.obj._constructor_expanddim(indexed_output, index=None) output.columns = Index(key.label for key in results) - output = self._reindex_output(output) return output def _wrap_applied_output( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fa8df166d56ac..7b874fbb1c964 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -14,6 +14,7 @@ qcut, ) import pandas._testing as tm +from pandas.api.typing import SeriesGroupBy from pandas.tests.groupby import get_groupby_method_args @@ -2007,3 +2008,62 @@ def test_many_categories(as_index, sort, index_kind, ordered): expected = DataFrame({"a": Series(index), "b": data}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]]) +@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]]) +def test_groupby_default_depr(cat_columns, keys): + # GH#43999 + df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}) + df[cat_columns] = df[cat_columns].astype("category") + msg = "The default of observed=False is deprecated" + klass = FutureWarning if set(cat_columns) & set(keys) else None + with tm.assert_produces_warning(klass, match=msg): + df.groupby(keys) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_agg_list(request, as_index, observed, reduction_func, test_series, keys): + # GH#52760 + if test_series and reduction_func == "corrwith": + assert not hasattr(SeriesGroupBy, "corrwith") + pytest.skip("corrwith not implemented for SeriesGroupBy") + elif reduction_func == "corrwith": + msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + elif ( + reduction_func == "nunique" + and not test_series + and len(keys) != 1 + and not observed + and not as_index + ): + msg = "GH#52848 - raises a ValueError" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + + df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) + df = df.astype({"a1": "category", "a2": "category"}) + if "a2" not in keys: + df = df.drop(columns="a2") + gb = df.groupby(by=keys, as_index=as_index, observed=observed) + if test_series: + gb = gb["b"] + args = get_groupby_method_args(reduction_func, df) + + result = gb.agg([reduction_func], *args) + expected = getattr(gb, reduction_func)(*args) + + if as_index and (test_series or reduction_func == "size"): + expected = expected.to_frame(reduction_func) + if not test_series: + if not as_index: + # TODO: GH#52849 - as_index=False is not respected + expected = expected.set_index(keys) + expected.columns = MultiIndex( + levels=[["b"], [reduction_func]], codes=[[0], [0]] + ) + elif not as_index: + expected.columns = keys + [reduction_func] + + tm.assert_equal(result, expected) From 7ab8df0a606de12de2fb16c588436441dc5c01be Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 22 Apr 2023 15:29:19 -0400 Subject: [PATCH 2/2] Fixup for 2.0.x --- pandas/tests/groupby/test_categorical.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7b874fbb1c964..acb55b13ff057 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -14,7 +14,7 @@ qcut, ) import pandas._testing as tm -from pandas.api.typing import SeriesGroupBy +from pandas.core.groupby.generic import SeriesGroupBy from pandas.tests.groupby import get_groupby_method_args @@ -2010,18 +2010,6 @@ def test_many_categories(as_index, sort, index_kind, ordered): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("cat_columns", ["a", "b", ["a", "b"]]) -@pytest.mark.parametrize("keys", ["a", "b", ["a", "b"]]) -def test_groupby_default_depr(cat_columns, keys): - # GH#43999 - df = DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}) - df[cat_columns] = df[cat_columns].astype("category") - msg = "The default of observed=False is deprecated" - klass = FutureWarning if set(cat_columns) & set(keys) else None - with tm.assert_produces_warning(klass, match=msg): - df.groupby(keys) - - @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):