From de6cf9c79e48e075208ed2c840adb70adcfe8006 Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Mon, 15 May 2023 18:01:32 +0000 Subject: [PATCH 1/8] fix dataframegroupby.agg not respecting as_index=false --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/groupby/generic.py | 38 +++--------------------- pandas/core/groupby/groupby.py | 29 ++++++++++++++++-- pandas/tests/groupby/test_categorical.py | 7 ++--- 4 files changed, 33 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b6c13c287d5f9..8880ec5ce28da 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -413,6 +413,7 @@ Groupby/resample/rolling grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, the function operated on the whole index rather than each element of the index. (:issue:`51979`) +- Bug in :meth:`DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) - Bug in :meth:`GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 36ad6e4a11123..627dbaf1b64e8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1170,41 +1170,13 @@ def cov( return result @property + @doc(Series.is_monotonic_increasing.__doc__) def is_monotonic_increasing(self) -> Series: - """ - Return whether each group's values are monotonically increasing. - - Returns - ------- - Series - - Examples - -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) - >>> s.groupby(level=0).is_monotonic_increasing - Falcon False - Parrot True - dtype: bool - """ return self.apply(lambda ser: ser.is_monotonic_increasing) @property + @doc(Series.is_monotonic_decreasing.__doc__) def is_monotonic_decreasing(self) -> Series: - """ - Return whether each group's values are monotonically decreasing. - - Returns - ------- - Series - - Examples - -------- - >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) - >>> s.groupby(level=0).is_monotonic_decreasing - Falcon True - Parrot False - dtype: bool - """ return self.apply(lambda ser: ser.is_monotonic_decreasing) @doc(Series.hist.__doc__) @@ -1357,9 +1329,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) op = GroupByApply(self, func, args=args, kwargs=kwargs) result = op.agg() - if not is_dict_like(func) and result is not None: - return result - elif relabeling: + if relabeling and (is_dict_like(func) or result is None): # this should be the only (non-raising) case with relabeling # used reordered index of columns result = cast(DataFrame, result) @@ -1411,7 +1381,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = self._obj_with_exclusions.columns.copy() if not self.as_index: - result = self._insert_inaxis_grouper(result) + result = self._insert_inaxis_grouper(result, finalize=True) result.index = default_index(len(result)) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5928c32e22b7f..6f49df1cfa400 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1222,15 +1222,38 @@ def _set_result_index_ordered( return result @final - def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: + def _insert_inaxis_grouper( + self, result: Series | DataFrame, finalize: bool = False + ) -> DataFrame: if isinstance(result, Series): result = result.to_frame() + # GH #52849: when called with finalize=True, this means we are dealing + # with as_index=False after the result has been set. For categorical data, + # the result would have already included unused categories, so calling + # get_group_levels is not feasible. + if ( + finalize + and not self.observed + and len(self.grouper.groupings) != 1 + and any( + isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) + for ping in self.grouper.groupings + ) + ): + from pandas.core.reshape.util import cartesian_product + + group_levels = cartesian_product( + [ping.group_index for ping in self.grouper.groupings] + ) + else: + group_levels = self.grouper.get_group_levels() + # zip in reverse so we can always insert at loc 0 columns = result.columns for name, lev, in_axis in zip( reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), + reversed(group_levels), reversed([grp.in_axis for grp in self.grouper.groupings]), ): # GH #28549 @@ -4086,7 +4109,7 @@ def _reindex_output( for ping in groupings ): return output - + # breakpoint() levels_list = [ping.group_index for ping in groupings] names = self.grouper.names if qs is not None: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6ce17e406db65..c0704d9684574 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -2067,11 +2067,8 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys if as_index and (test_series or reduction_func == "size"): expected = expected.to_frame(reduction_func) if not test_series: - if not as_index: - # TODO: GH#52849 - as_index=False is not respected - expected = expected.set_index(keys) - expected.columns = MultiIndex( - levels=[["b"], [reduction_func]], codes=[[0], [0]] + expected.columns = MultiIndex.from_tuples( + [(ind, "") for ind in expected.columns[:-1]] + [("b", reduction_func)] ) elif not as_index: expected.columns = keys + [reduction_func] From d152c7064c6fc41e739aa0acd4d423f295e891bb Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Mon, 15 May 2023 18:51:30 +0000 Subject: [PATCH 2/8] cleaned up code, removed breakpoints --- pandas/core/groupby/generic.py | 28 ++++++++++++++++++++++++++-- pandas/core/groupby/groupby.py | 1 - 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 627dbaf1b64e8..183d7d5157cc4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1170,13 +1170,37 @@ def cov( return result @property - @doc(Series.is_monotonic_increasing.__doc__) def is_monotonic_increasing(self) -> Series: + """ + Return whether each group's values are monotonically increasing. + Returns + ------- + Series + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_increasing + Falcon False + Parrot True + dtype: bool + """ return self.apply(lambda ser: ser.is_monotonic_increasing) @property - @doc(Series.is_monotonic_decreasing.__doc__) def is_monotonic_decreasing(self) -> Series: + """ + Return whether each group's values are monotonically decreasing. + Returns + ------- + Series + Examples + -------- + >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) + >>> s.groupby(level=0).is_monotonic_decreasing + Falcon True + Parrot False + dtype: bool + """ return self.apply(lambda ser: ser.is_monotonic_decreasing) @doc(Series.hist.__doc__) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6f49df1cfa400..f0172db44dbe8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4109,7 +4109,6 @@ def _reindex_output( for ping in groupings ): return output - # breakpoint() levels_list = [ping.group_index for ping in groupings] names = self.grouper.names if qs is not None: From 9f5532d6af8d3cd30c133e949a862a5a9d85238b Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Mon, 15 May 2023 18:56:44 +0000 Subject: [PATCH 3/8] cleaning up codes --- pandas/core/groupby/generic.py | 2 ++ pandas/core/groupby/groupby.py | 1 + 2 files changed, 3 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 183d7d5157cc4..06d0aeefe5c87 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1173,6 +1173,7 @@ def cov( def is_monotonic_increasing(self) -> Series: """ Return whether each group's values are monotonically increasing. + Returns ------- Series @@ -1190,6 +1191,7 @@ def is_monotonic_increasing(self) -> Series: def is_monotonic_decreasing(self) -> Series: """ Return whether each group's values are monotonically decreasing. + Returns ------- Series diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f0172db44dbe8..94d9d43eeb4a7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4109,6 +4109,7 @@ def _reindex_output( for ping in groupings ): return output + levels_list = [ping.group_index for ping in groupings] names = self.grouper.names if qs is not None: From 495492173621ebb38b9c978000df626069b5524d Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Mon, 15 May 2023 19:00:45 +0000 Subject: [PATCH 4/8] final cleanup of code, ready for review, sry abt that --- pandas/core/groupby/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 06d0aeefe5c87..d71f0d8906de2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1177,6 +1177,7 @@ def is_monotonic_increasing(self) -> Series: Returns ------- Series + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) @@ -1195,6 +1196,7 @@ def is_monotonic_decreasing(self) -> Series: Returns ------- Series + Examples -------- >>> s = pd.Series([2, 1, 3, 4], index=['Falcon', 'Falcon', 'Parrot', 'Parrot']) From 2cf627833a8d207b20bfbe786995eb310a54eefe Mon Sep 17 00:00:00 2001 From: Yao Xiao Date: Tue, 16 May 2023 01:32:39 +0000 Subject: [PATCH 5/8] resolve typing (not sure) --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 94d9d43eeb4a7..57b6a1d1d0475 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1232,6 +1232,7 @@ def _insert_inaxis_grouper( # with as_index=False after the result has been set. For categorical data, # the result would have already included unused categories, so calling # get_group_levels is not feasible. + group_levels: Sequence[ExtensionArray | np.ndarray] if ( finalize and not self.observed From b0e0da37b8ab6c3caa356e0d562747ec6609ff1e Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Thu, 18 May 2023 21:29:08 +0800 Subject: [PATCH 6/8] used better approach --- pandas/core/groupby/generic.py | 11 +++++++++-- pandas/core/groupby/groupby.py | 28 ++-------------------------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index acd4d5bbe81cd..fd39f6c97fe93 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -44,6 +44,7 @@ is_bool, is_dict_like, is_integer_dtype, + is_list_like, is_numeric_dtype, is_scalar, ) @@ -1396,7 +1397,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) op = GroupByApply(self, func, args=args, kwargs=kwargs) result = op.agg() - if relabeling and (is_dict_like(func) or result is None): + if not is_dict_like(func) and result is not None: + # GH #52849 + if is_list_like(func) and not self.as_index: + return result.reset_index() + else: + return result + elif relabeling: # this should be the only (non-raising) case with relabeling # used reordered index of columns result = cast(DataFrame, result) @@ -1448,7 +1455,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = self._obj_with_exclusions.columns.copy() if not self.as_index: - result = self._insert_inaxis_grouper(result, finalize=True) + result = self._insert_inaxis_grouper(result) result.index = default_index(len(result)) return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c44db4e25f2f3..bdab641719ded 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1222,39 +1222,15 @@ def _set_result_index_ordered( return result @final - def _insert_inaxis_grouper( - self, result: Series | DataFrame, finalize: bool = False - ) -> DataFrame: + def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame: if isinstance(result, Series): result = result.to_frame() - # GH #52849: when called with finalize=True, this means we are dealing - # with as_index=False after the result has been set. For categorical data, - # the result would have already included unused categories, so calling - # get_group_levels is not feasible. - group_levels: Sequence[ExtensionArray | np.ndarray] - if ( - finalize - and not self.observed - and len(self.grouper.groupings) != 1 - and any( - isinstance(ping.grouping_vector, (Categorical, CategoricalIndex)) - for ping in self.grouper.groupings - ) - ): - from pandas.core.reshape.util import cartesian_product - - group_levels = cartesian_product( - [ping.group_index for ping in self.grouper.groupings] - ) - else: - group_levels = self.grouper.get_group_levels() - # zip in reverse so we can always insert at loc 0 columns = result.columns for name, lev, in_axis in zip( reversed(self.grouper.names), - reversed(group_levels), + reversed(self.grouper.get_group_levels()), reversed([grp.in_axis for grp in self.grouper.groupings]), ): # GH #28549 From 3d34a06ed9aa5f5408eb572c12699595cd15b1cc Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Fri, 19 May 2023 11:36:39 +0800 Subject: [PATCH 7/8] new test added; enh short circuiting --- pandas/core/groupby/generic.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fd39f6c97fe93..37ef04f17a2e5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1399,7 +1399,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result = op.agg() if not is_dict_like(func) and result is not None: # GH #52849 - if is_list_like(func) and not self.as_index: + if not self.as_index and is_list_like(func): return result.reset_index() else: return result diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 24b42421b3208..8b4f1deaa48d4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1586,3 +1586,18 @@ def test_agg_multiple_with_as_index_false_subset_to_a_single_column(): result = gb.agg(["sum", "mean"]) expected = DataFrame({"a": [1, 2], "sum": [7, 5], "mean": [3.5, 5.0]}) tm.assert_frame_equal(result, expected) + + +def test_agg_with_as_index_false_with_list(): + # GH#52849 + df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) + gb = df.groupby(by=["a1", "a2"], as_index=False) + result = gb.agg(["sum"]) + + expected = DataFrame( + data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]], + columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]), + ) + print(result) + print(expected) + tm.assert_frame_equal(result, expected) From 19107d806d780eb87b69a407cbc47acdd861469c Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Fri, 19 May 2023 11:37:21 +0800 Subject: [PATCH 8/8] clean up code --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8b4f1deaa48d4..185b50e9d7833 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1598,6 +1598,4 @@ def test_agg_with_as_index_false_with_list(): data=[[0, 2, 4], [0, 3, 5], [1, 3, 6]], columns=MultiIndex.from_tuples([("a1", ""), ("a2", ""), ("b", "sum")]), ) - print(result) - print(expected) tm.assert_frame_equal(result, expected)