From 132585545393d0cdaaeee4ca47b095cb2a1079a2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 19 Jul 2023 18:46:10 -0400 Subject: [PATCH 1/8] BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical --- .github/workflows/code-checks.yml | 2 +- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/groupby/generic.py | 34 ++++--- pandas/core/groupby/groupby.py | 100 +++++++++++++++++++- pandas/tests/groupby/test_categorical.py | 52 ++++++++-- pandas/tests/groupby/test_function.py | 33 +++++++ pandas/tests/groupby/test_groupby.py | 31 +++--- pandas/tests/groupby/test_groupby_dropna.py | 27 +++--- pandas/tests/groupby/test_raises.py | 55 ++++++----- 9 files changed, 248 insertions(+), 88 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 3bd68c07dcbc3..4260c0836bbea 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 930e03ae7d75a..3fbc6da21f269 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -328,7 +328,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`) - Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dbabd04a87c36..1ab7b15e13a03 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,15 +1185,23 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "idxmin") + else: + axis = 0 + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, "idxmax") + else: + axis = 0 + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) @doc(Series.corr.__doc__) def corr( @@ -2195,14 +2203,9 @@ def idxmax( else: axis = self.axis - def func(df): - return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmax" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result def idxmin( self, @@ -2290,14 +2293,9 @@ def idxmin( else: axis = self.axis - def func(df): - return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmin" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a022bfd1bd9bc..0c425b0673abf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2015,10 +2015,16 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with com.temp_setattr(self, "as_index", True): # GH#49834 - result needs groups in the index for # _wrap_transform_fast_result - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin( + func, ignore_unobserved=True, axis=self.axis, **kwargs + ) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) return self._wrap_transform_fast_result(result) @@ -5720,6 +5726,92 @@ def sample( sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + axis: Axis = 0, + numeric_only: bool = False, + skipna: bool = True, + ignore_unobserved: bool = False, + ): + """Compute idxmax/idxmin. + + Parameters + ---------- + how: {"idxmin", "idxmax"} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + assert len(self.grouper.result_index) <= expected_len + # result_index only contains observed groups + has_unobserved = len(self.grouper.result_index) < expected_len + raise_err = not ignore_unobserved and has_unobserved + else: + raise_err = False + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) + else: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + raise + + result = result.astype(self.obj.index.dtype) if result.empty else result + + if not skipna: + has_na_value = result.isnull().any(axis=None) + if has_na_value: + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + @doc(GroupBy) def get_groupby( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b11240c841420..11291bb89b604 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1883,14 +1910,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if ( - reduction_func in ("idxmax", "idxmin") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmax/min fail with unused categories" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index: msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.node.add_marker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: @@ -1912,6 +1932,15 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], *args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 41bbfcf6840a9..8829a6ece81c2 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize( + "func, values, expected_values, warn", + [ + ("idxmin", [0, 1, 2], [0, 2], None), + ("idxmax", [0, 1, 2], [1, 2], None), + ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), + ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): + # GH#54234 + df = DataFrame( + { + "a": [1, 1, 2], + "b": values, + } + ) + gb = df.groupby("a") + index = Index([1, 2], name="a") + expected = DataFrame({"b": expected_values}, index=index) + if test_series: + gb = gb["b"] + expected = expected["b"] + klass = "Series" if test_series else "DataFrame" + msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, func)(skipna=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9132be50d5857..cccf9f436ee56 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby( - columns, keys, values, method, op, request, using_array_manager, dropna -): +def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected(): is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: - msg = f"Cannot perform {op} with non-ordered Categorical" - with pytest.raises(TypeError, match=msg): + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): + if op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + klass = TypeError + else: + msg = f"Can't get {op} of an empty group due to unobserved categories" + klass = ValueError + with pytest.raises(klass, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max"] and isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d82278c277d48..8065aa63dff81 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -503,18 +503,7 @@ def test_null_is_null_for_dtype( @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, index_kind -): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -544,6 +533,17 @@ def test_categorical_reducers( args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].replace(4, None) @@ -573,9 +573,6 @@ def test_categorical_reducers( if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9a2b3d44b117..46bf324fad1d7 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -97,22 +97,24 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -233,8 +235,7 @@ def test_groupby_raises_string_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -297,13 +298,11 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -342,8 +341,7 @@ def test_groupby_raises_datetime_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -540,8 +538,7 @@ def test_groupby_raises_category_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -572,6 +569,16 @@ def test_groupby_raises_category_on_category( return empty_groups = any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True klass, msg = { "all": (None, ""), @@ -617,10 +624,10 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") + "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") + "idxmin": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), "last": (None, ""), From e2dd88a633cb48c2cd3680cdcc570e9da0116355 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 24 Sep 2023 16:10:37 -0400 Subject: [PATCH 2/8] cleanups --- pandas/core/groupby/groupby.py | 10 +++++++--- pandas/tests/groupby/test_groupby.py | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0c425b0673abf..3d04bd634ba61 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5763,9 +5763,13 @@ def _idxmax_idxmin( expected_len = np.prod( [len(ping.group_index) for ping in self.grouper.groupings] ) - assert len(self.grouper.result_index) <= expected_len + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + result_len = len(self.grouper.result_index) + assert result_len <= expected_len # result_index only contains observed groups - has_unobserved = len(self.grouper.result_index) < expected_len + has_unobserved = result_len < expected_len raise_err = not ignore_unobserved and has_unobserved else: raise_err = False @@ -5794,7 +5798,7 @@ def func(df): raise ValueError( f"Can't get {how} of an empty group due to unobserved categories. " "Specify observed=True in groupby instead." - ) + ) from None raise result = result.astype(self.obj.index.dtype) if result.empty else result diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cccf9f436ee56..16c66b60f06bd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2068,6 +2068,11 @@ def get_categorical_invalid_expected(): result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() tm.assert_equal(result, expected) + elif op in ["idxmin", "idxmax"] and isinstance(columns, list): + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + get_result(numeric_only=True) return if op in ["prod", "sum", "skew"]: From 5daaaedefa469448b7088c8924999ff0d8a111b3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 24 Sep 2023 16:31:20 -0400 Subject: [PATCH 3/8] fixups --- pandas/core/groupby/groupby.py | 11 ++++++++++- pandas/tests/groupby/test_groupby.py | 5 ----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3d04bd634ba61..b6fb6580fe82c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5766,11 +5766,20 @@ def _idxmax_idxmin( if len(self.grouper.groupings) == 1: result_len = len(self.grouper.groupings[0].grouping_vector.unique()) else: + # result_index only contains observed groups in this case result_len = len(self.grouper.result_index) assert result_len <= expected_len - # result_index only contains observed groups has_unobserved = result_len < expected_len + raise_err = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err &= len(data.columns) > 0 else: raise_err = False if raise_err: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 16c66b60f06bd..cccf9f436ee56 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2068,11 +2068,6 @@ def get_categorical_invalid_expected(): result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() tm.assert_equal(result, expected) - elif op in ["idxmin", "idxmax"] and isinstance(columns, list): - with pytest.raises( - ValueError, match="empty group due to unobserved categories" - ): - get_result(numeric_only=True) return if op in ["prod", "sum", "skew"]: From c730ddba40de6fe644ba6857f45086db0145e105 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 24 Sep 2023 22:15:47 -0400 Subject: [PATCH 4/8] type-hint fixup --- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 427687d9614f9..7cd3891605d59 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11895,7 +11895,7 @@ def _logical_func( def any( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b6fb6580fe82c..1082ce81ddd25 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5771,7 +5771,7 @@ def _idxmax_idxmin( assert result_len <= expected_len has_unobserved = result_len < expected_len - raise_err = not ignore_unobserved and has_unobserved + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved # Only raise an error if there are columns to compute; otherwise we return # an empty DataFrame with an index (possibly including unobserved) but no # columns From 2bcdd81fcc54ffd5c3d2d61a79779a8c9c9da3c3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 29 Sep 2023 14:28:33 -0400 Subject: [PATCH 5/8] simplify --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b6fb6580fe82c..82429f2cb149b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5776,10 +5776,10 @@ def _idxmax_idxmin( # an empty DataFrame with an index (possibly including unobserved) but no # columns data = self._obj_with_exclusions - if isinstance(data, DataFrame): + if not raise_err and isinstance(data, DataFrame): if numeric_only: data = data._get_numeric_data() - raise_err &= len(data.columns) > 0 + raise_err = len(data.columns) > 0 else: raise_err = False if raise_err: From d62297c399ac6468a584c8cb965236d293c75a02 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 29 Sep 2023 14:46:48 -0400 Subject: [PATCH 6/8] fixup --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5eff6e9900b40..bab6a72042d3f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5776,7 +5776,7 @@ def _idxmax_idxmin( # an empty DataFrame with an index (possibly including unobserved) but no # columns data = self._obj_with_exclusions - if not raise_err and isinstance(data, DataFrame): + if raise_err and isinstance(data, DataFrame): if numeric_only: data = data._get_numeric_data() raise_err = len(data.columns) > 0 From b2a36ebf96ea2c5c00e9a72c7b701d5e6ce27830 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 7 Oct 2023 11:32:45 -0400 Subject: [PATCH 7/8] Fix passing through *args --- pandas/core/groupby/generic.py | 30 ++----------------- pandas/core/groupby/groupby.py | 16 ++++++---- .../tests/groupby/transform/test_transform.py | 16 ++++++++++ 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7185cd724eaf2..95834f11dea1f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,22 +1185,12 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmin") - else: - axis = 0 return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - if axis is not lib.no_default: - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmax") - else: - axis = 0 return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) @doc(Series.corr.__doc__) @@ -2119,7 +2109,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: def idxmax( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: @@ -2195,21 +2185,13 @@ def idxmax( Beef co2_emissions dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmax") - else: - axis = self.axis - return self._idxmax_idxmin( "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna ) def idxmin( self, - axis: Axis | None | lib.NoDefault = lib.no_default, + axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: @@ -2285,14 +2267,6 @@ def idxmin( Beef consumption dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmin") - else: - axis = self.axis - return self._idxmax_idxmin( "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna ) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bab6a72042d3f..ef0246ab63fb8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2017,9 +2017,7 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # _wrap_transform_fast_result if func in ["idxmin", "idxmax"]: func = cast(Literal["idxmin", "idxmax"], func) - result = self._idxmax_idxmin( - func, ignore_unobserved=True, axis=self.axis, **kwargs - ) + result = self._idxmax_idxmin(func, True, *args, **kwargs) else: if engine is not None: kwargs["engine"] = engine @@ -5729,10 +5727,10 @@ def sample( def _idxmax_idxmin( self, how: Literal["idxmax", "idxmin"], - axis: Axis = 0, - numeric_only: bool = False, - skipna: bool = True, ignore_unobserved: bool = False, + axis: Axis | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, ): """Compute idxmax/idxmin. @@ -5757,6 +5755,12 @@ def _idxmax_idxmin( Series or DataFrame idxmax or idxmin for the groupby operation. """ + if axis is not lib.no_default: + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + if not self.observed and any( ping._passed_categorical for ping in self.grouper.groupings ): diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cf3f41e04902c..4a493ef3fd52c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1637,3 +1637,19 @@ def test_as_index_no_change(keys, df, groupby_func): result = gb_as_index_true.transform(groupby_func, *args) expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrame.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) From a1954d62845b0e53aed4faeed625a5ca29f6806d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 7 Oct 2023 11:36:25 -0400 Subject: [PATCH 8/8] fixup --- pandas/core/groupby/generic.py | 4 ++-- pandas/core/groupby/groupby.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 95834f11dea1f..a2f556eba08a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2109,7 +2109,7 @@ def nunique(self, dropna: bool = True) -> DataFrame: def idxmax( self, - axis: Axis | lib.NoDefault = lib.no_default, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: @@ -2191,7 +2191,7 @@ def idxmax( def idxmin( self, - axis: Axis | lib.NoDefault = lib.no_default, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ) -> DataFrame: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ef0246ab63fb8..e33c4b3579c69 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -5728,7 +5728,7 @@ def _idxmax_idxmin( self, how: Literal["idxmax", "idxmin"], ignore_unobserved: bool = False, - axis: Axis | lib.NoDefault = lib.no_default, + axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, numeric_only: bool = False, ): @@ -5756,6 +5756,8 @@ def _idxmax_idxmin( idxmax or idxmin for the groupby operation. """ if axis is not lib.no_default: + if axis is None: + axis = self.axis axis = self.obj._get_axis_number(axis) self._deprecate_axis(axis, how) else: