diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 3bd68c07dcbc3..4260c0836bbea 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -124,7 +124,7 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 017a28ffb573a..50d89abbb286a 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -364,6 +364,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e131d689b6a40..210ce8ce9fcbf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11910,7 +11910,7 @@ def _logical_func( def any( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 904ab9bdfc6dd..a2f556eba08a4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1185,15 +1185,13 @@ def nsmallest( def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) @doc(Series.corr.__doc__) def corr( @@ -2187,22 +2185,9 @@ def idxmax( Beef co2_emissions dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmax") - else: - axis = self.axis - - def func(df): - return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmax" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result def idxmin( self, @@ -2282,22 +2267,9 @@ def idxmin( Beef consumption dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmin") - else: - axis = self.axis - - def func(df): - return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmin" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a022bfd1bd9bc..e33c4b3579c69 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2015,10 +2015,14 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): with com.temp_setattr(self, "as_index", True): # GH#49834 - result needs groups in the index for # _wrap_transform_fast_result - if engine is not None: - kwargs["engine"] = engine - kwargs["engine_kwargs"] = engine_kwargs - result = getattr(self, func)(*args, **kwargs) + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: + if engine is not None: + kwargs["engine"] = engine + kwargs["engine_kwargs"] = engine_kwargs + result = getattr(self, func)(*args, **kwargs) return self._wrap_transform_fast_result(result) @@ -5720,6 +5724,113 @@ def sample( sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ): + """Compute idxmax/idxmin. + + Parameters + ---------- + how: {"idxmin", "idxmax"} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self.grouper.groupings + ): + expected_len = np.prod( + [len(ping.group_index) for ping in self.grouper.groupings] + ) + if len(self.grouper.groupings) == 1: + result_len = len(self.grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self.grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + else: + raise_err = False + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + + try: + if self.obj.ndim == 1: + result = self._op_via_apply(how, skipna=skipna) + else: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) from None + raise + + result = result.astype(self.obj.index.dtype) if result.empty else result + + if not skipna: + has_na_value = result.isnull().any(axis=None) + if has_na_value: + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return result + @doc(GroupBy) def get_groupby( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b11240c841420..11291bb89b604 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1883,14 +1910,7 @@ def test_category_order_reducer( request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if ( - reduction_func in ("idxmax", "idxmin") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmax/min fail with unused categories" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index: msg = "GH#49950 - corrwith with as_index=False may not have grouping column" request.node.add_marker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: @@ -1912,6 +1932,15 @@ def test_category_order_reducer( df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], *args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a92880c87b847..08372541988d0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1(): gb2.idxmax(axis=1) +@pytest.mark.parametrize( + "func, values, expected_values, warn", + [ + ("idxmin", [0, 1, 2], [0, 2], None), + ("idxmax", [0, 1, 2], [1, 2], None), + ("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning), + ("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning), + ("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning), + ], +) +@pytest.mark.parametrize("test_series", [True, False]) +def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series): + # GH#54234 + df = DataFrame( + { + "a": [1, 1, 2], + "b": values, + } + ) + gb = df.groupby("a") + index = Index([1, 2], name="a") + expected = DataFrame({"b": expected_values}, index=index) + if test_series: + gb = gb["b"] + expected = expected["b"] + klass = "Series" if test_series else "DataFrame" + msg = f"The behavior of {klass}GroupBy.{func} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, func)(skipna=False) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_axis1_numeric_only(request, groupby_func, numeric_only): if groupby_func in ("idxmax", "idxmin"): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4ca8b0e317bd2..e3e4a7efaa307 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2001,22 +2001,10 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby( - columns, keys, values, method, op, request, using_array_manager, dropna -): +def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2061,12 +2049,21 @@ def get_categorical_invalid_expected(): is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: - msg = f"Cannot perform {op} with non-ordered Categorical" - with pytest.raises(TypeError, match=msg): + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): + if op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + klass = TypeError + else: + msg = f"Can't get {op} of an empty group due to unobserved categories" + klass = ValueError + with pytest.raises(klass, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max"] and isinstance(columns, list): # i.e. DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d82278c277d48..8065aa63dff81 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -503,18 +503,7 @@ def test_null_is_null_for_dtype( @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, index_kind -): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -544,6 +533,17 @@ def test_categorical_reducers( args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() expected["x"] = expected["x"].replace(4, None) @@ -573,9 +573,6 @@ def test_categorical_reducers( if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f9a2b3d44b117..46bf324fad1d7 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -97,22 +97,24 @@ def df_with_cat_col(): return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -233,8 +235,7 @@ def test_groupby_raises_string_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -297,13 +298,11 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -342,8 +341,7 @@ def test_groupby_raises_datetime_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -540,8 +538,7 @@ def test_groupby_raises_category_np( warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -572,6 +569,16 @@ def test_groupby_raises_category_on_category( return empty_groups = any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True klass, msg = { "all": (None, ""), @@ -617,10 +624,10 @@ def test_groupby_raises_category_on_category( if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") + "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") + "idxmin": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), "last": (None, ""), diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cf3f41e04902c..4a493ef3fd52c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1637,3 +1637,19 @@ def test_as_index_no_change(keys, df, groupby_func): result = gb_as_index_true.transform(groupby_func, *args) expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrame.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected)