Skip to content

Commit 3fac6f2

Browse files
authored
BUG: groupby.idxmax/idxmin consistently raise on unobserved categorical (#55268)
1 parent a0babcb commit 3fac6f2

File tree

11 files changed

+274
-104
lines changed

11 files changed

+274
-104
lines changed

.github/workflows/code-checks.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ jobs:
124124
run: |
125125
cd asv_bench
126126
asv machine --yes
127-
asv run --quick --dry-run --durations=30 --python=same
127+
asv run --quick --dry-run --durations=30 --python=same --show-stderr
128128
129129
build_docker_dev_environment:
130130
name: Build Docker Dev Environment

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,7 @@ Plotting
365365

366366
Groupby/resample/rolling
367367
^^^^^^^^^^^^^^^^^^^^^^^^
368+
- Bug in :meth:`DataFrameGroupBy.idxmax`, :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax`, and :meth:`SeriesGroupBy.idxmin` would not consistently raise when grouping with ``observed=False`` and unobserved categoricals (:issue:`10694`)
368369
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
369370
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
370371
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -11910,7 +11910,7 @@ def _logical_func(
1191011910

1191111911
def any(
1191211912
self,
11913-
axis: Axis = 0,
11913+
axis: Axis | None = 0,
1191411914
bool_only: bool_t = False,
1191511915
skipna: bool_t = True,
1191611916
**kwargs,

pandas/core/groupby/generic.py

+6-34
Original file line numberDiff line numberDiff line change
@@ -1185,15 +1185,13 @@ def nsmallest(
11851185
def idxmin(
11861186
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
11871187
) -> Series:
1188-
result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
1189-
return result.astype(self.obj.index.dtype) if result.empty else result
1188+
return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna)
11901189

11911190
@doc(Series.idxmax.__doc__)
11921191
def idxmax(
11931192
self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True
11941193
) -> Series:
1195-
result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
1196-
return result.astype(self.obj.index.dtype) if result.empty else result
1194+
return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna)
11971195

11981196
@doc(Series.corr.__doc__)
11991197
def corr(
@@ -2187,22 +2185,9 @@ def idxmax(
21872185
Beef co2_emissions
21882186
dtype: object
21892187
"""
2190-
if axis is not lib.no_default:
2191-
if axis is None:
2192-
axis = self.axis
2193-
axis = self.obj._get_axis_number(axis)
2194-
self._deprecate_axis(axis, "idxmax")
2195-
else:
2196-
axis = self.axis
2197-
2198-
def func(df):
2199-
return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
2200-
2201-
func.__name__ = "idxmax"
2202-
result = self._python_apply_general(
2203-
func, self._obj_with_exclusions, not_indexed_same=True
2188+
return self._idxmax_idxmin(
2189+
"idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna
22042190
)
2205-
return result.astype(self.obj.index.dtype) if result.empty else result
22062191

22072192
def idxmin(
22082193
self,
@@ -2282,22 +2267,9 @@ def idxmin(
22822267
Beef consumption
22832268
dtype: object
22842269
"""
2285-
if axis is not lib.no_default:
2286-
if axis is None:
2287-
axis = self.axis
2288-
axis = self.obj._get_axis_number(axis)
2289-
self._deprecate_axis(axis, "idxmin")
2290-
else:
2291-
axis = self.axis
2292-
2293-
def func(df):
2294-
return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
2295-
2296-
func.__name__ = "idxmin"
2297-
result = self._python_apply_general(
2298-
func, self._obj_with_exclusions, not_indexed_same=True
2270+
return self._idxmax_idxmin(
2271+
"idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna
22992272
)
2300-
return result.astype(self.obj.index.dtype) if result.empty else result
23012273

23022274
boxplot = boxplot_frame_groupby
23032275

pandas/core/groupby/groupby.py

+115-4
Original file line numberDiff line numberDiff line change
@@ -2015,10 +2015,14 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
20152015
with com.temp_setattr(self, "as_index", True):
20162016
# GH#49834 - result needs groups in the index for
20172017
# _wrap_transform_fast_result
2018-
if engine is not None:
2019-
kwargs["engine"] = engine
2020-
kwargs["engine_kwargs"] = engine_kwargs
2021-
result = getattr(self, func)(*args, **kwargs)
2018+
if func in ["idxmin", "idxmax"]:
2019+
func = cast(Literal["idxmin", "idxmax"], func)
2020+
result = self._idxmax_idxmin(func, True, *args, **kwargs)
2021+
else:
2022+
if engine is not None:
2023+
kwargs["engine"] = engine
2024+
kwargs["engine_kwargs"] = engine_kwargs
2025+
result = getattr(self, func)(*args, **kwargs)
20222026

20232027
return self._wrap_transform_fast_result(result)
20242028

@@ -5720,6 +5724,113 @@ def sample(
57205724
sampled_indices = np.concatenate(sampled_indices)
57215725
return self._selected_obj.take(sampled_indices, axis=self.axis)
57225726

5727+
def _idxmax_idxmin(
5728+
self,
5729+
how: Literal["idxmax", "idxmin"],
5730+
ignore_unobserved: bool = False,
5731+
axis: Axis | None | lib.NoDefault = lib.no_default,
5732+
skipna: bool = True,
5733+
numeric_only: bool = False,
5734+
):
5735+
"""Compute idxmax/idxmin.
5736+
5737+
Parameters
5738+
----------
5739+
how : {"idxmin", "idxmax"}
5740+
Whether to compute idxmin or idxmax.
5741+
axis : {0 or 'index', 1 or 'columns'}, default None
5742+
The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
5743+
If axis is not provided, grouper's axis is used.
5744+
numeric_only : bool, default False
5745+
Include only float, int, boolean columns.
5746+
skipna : bool, default True
5747+
Exclude NA/null values. If an entire row/column is NA, the result
5748+
will be NA.
5749+
ignore_unobserved : bool, default False
5750+
When True and an unobserved group is encountered, do not raise. This is used
5751+
for transform, where unobserved groups do not have an impact on the result.
5752+
5753+
Returns
5754+
-------
5755+
Series or DataFrame
5756+
idxmax or idxmin for the groupby operation.
5757+
"""
5758+
if axis is not lib.no_default:
5759+
if axis is None:
5760+
axis = self.axis
5761+
axis = self.obj._get_axis_number(axis)
5762+
self._deprecate_axis(axis, how)
5763+
else:
5764+
axis = self.axis
5765+
5766+
if not self.observed and any(
5767+
ping._passed_categorical for ping in self.grouper.groupings
5768+
):
5769+
expected_len = np.prod(
5770+
[len(ping.group_index) for ping in self.grouper.groupings]
5771+
)
5772+
if len(self.grouper.groupings) == 1:
5773+
result_len = len(self.grouper.groupings[0].grouping_vector.unique())
5774+
else:
5775+
# result_index only contains observed groups in this case
5776+
result_len = len(self.grouper.result_index)
5777+
assert result_len <= expected_len
5778+
has_unobserved = result_len < expected_len
5779+
5780+
raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved
5781+
# Only raise an error if there are columns to compute; otherwise we return
5782+
# an empty DataFrame with an index (possibly including unobserved) but no
5783+
# columns
5784+
data = self._obj_with_exclusions
5785+
if raise_err and isinstance(data, DataFrame):
5786+
if numeric_only:
5787+
data = data._get_numeric_data()
5788+
raise_err = len(data.columns) > 0
5789+
else:
5790+
raise_err = False
5791+
if raise_err:
5792+
raise ValueError(
5793+
f"Can't get {how} of an empty group due to unobserved categories. "
5794+
"Specify observed=True in groupby instead."
5795+
)
5796+
5797+
try:
5798+
if self.obj.ndim == 1:
5799+
result = self._op_via_apply(how, skipna=skipna)
5800+
else:
5801+
5802+
def func(df):
5803+
method = getattr(df, how)
5804+
return method(axis=axis, skipna=skipna, numeric_only=numeric_only)
5805+
5806+
func.__name__ = how
5807+
result = self._python_apply_general(
5808+
func, self._obj_with_exclusions, not_indexed_same=True
5809+
)
5810+
except ValueError as err:
5811+
name = "argmax" if how == "idxmax" else "argmin"
5812+
if f"attempt to get {name} of an empty sequence" in str(err):
5813+
raise ValueError(
5814+
f"Can't get {how} of an empty group due to unobserved categories. "
5815+
"Specify observed=True in groupby instead."
5816+
) from None
5817+
raise
5818+
5819+
result = result.astype(self.obj.index.dtype) if result.empty else result
5820+
5821+
if not skipna:
5822+
has_na_value = result.isnull().any(axis=None)
5823+
if has_na_value:
5824+
warnings.warn(
5825+
f"The behavior of {type(self).__name__}.{how} with all-NA "
5826+
"values, or any-NA and skipna=False, is deprecated. In a future "
5827+
"version this will raise ValueError",
5828+
FutureWarning,
5829+
stacklevel=find_stack_level(),
5830+
)
5831+
5832+
return result
5833+
57235834

57245835
@doc(GroupBy)
57255836
def get_groupby(

pandas/tests/groupby/test_categorical.py

+44-8
Original file line numberDiff line numberDiff line change
@@ -1416,6 +1416,15 @@ def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
14161416
return
14171417

14181418
agg = getattr(series_groupby, reduction_func)
1419+
1420+
if not observed and reduction_func in ["idxmin", "idxmax"]:
1421+
# idxmin and idxmax are designed to fail on empty inputs
1422+
with pytest.raises(
1423+
ValueError, match="empty group due to unobserved categories"
1424+
):
1425+
agg(*args)
1426+
return
1427+
14191428
result = agg(*args)
14201429

14211430
assert len(result) == expected_length
@@ -1448,6 +1457,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
14481457

14491458
series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
14501459
agg = getattr(series_groupby, reduction_func)
1460+
1461+
if reduction_func in ["idxmin", "idxmax"]:
1462+
# idxmin and idxmax are designed to fail on empty inputs
1463+
with pytest.raises(
1464+
ValueError, match="empty group due to unobserved categories"
1465+
):
1466+
agg(*args)
1467+
return
1468+
14511469
result = agg(*args)
14521470

14531471
zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
@@ -1514,6 +1532,15 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
15141532
df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)
15151533

15161534
args = get_groupby_method_args(reduction_func, df)
1535+
1536+
if not observed and reduction_func in ["idxmin", "idxmax"]:
1537+
# idxmin and idxmax are designed to fail on empty inputs
1538+
with pytest.raises(
1539+
ValueError, match="empty group due to unobserved categories"
1540+
):
1541+
getattr(df_grp, reduction_func)(*args)
1542+
return
1543+
15171544
res = getattr(df_grp, reduction_func)(*args)
15181545

15191546
expected = _results_for_groupbys_with_missing_categories[reduction_func]
@@ -1883,14 +1910,7 @@ def test_category_order_reducer(
18831910
request, as_index, sort, observed, reduction_func, index_kind, ordered
18841911
):
18851912
# GH#48749
1886-
if (
1887-
reduction_func in ("idxmax", "idxmin")
1888-
and not observed
1889-
and index_kind != "multi"
1890-
):
1891-
msg = "GH#10694 - idxmax/min fail with unused categories"
1892-
request.node.add_marker(pytest.mark.xfail(reason=msg))
1893-
elif reduction_func == "corrwith" and not as_index:
1913+
if reduction_func == "corrwith" and not as_index:
18941914
msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
18951915
request.node.add_marker(pytest.mark.xfail(reason=msg))
18961916
elif index_kind != "range" and not as_index:
@@ -1912,6 +1932,15 @@ def test_category_order_reducer(
19121932
df = df.set_index(keys)
19131933
args = get_groupby_method_args(reduction_func, df)
19141934
gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
1935+
1936+
if not observed and reduction_func in ["idxmin", "idxmax"]:
1937+
# idxmin and idxmax are designed to fail on empty inputs
1938+
with pytest.raises(
1939+
ValueError, match="empty group due to unobserved categories"
1940+
):
1941+
getattr(gb, reduction_func)(*args)
1942+
return
1943+
19151944
op_result = getattr(gb, reduction_func)(*args)
19161945
if as_index:
19171946
result = op_result.index.get_level_values("a").categories
@@ -2114,6 +2143,13 @@ def test_agg_list(request, as_index, observed, reduction_func, test_series, keys
21142143
gb = gb["b"]
21152144
args = get_groupby_method_args(reduction_func, df)
21162145

2146+
if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
2147+
with pytest.raises(
2148+
ValueError, match="empty group due to unobserved categories"
2149+
):
2150+
gb.agg([reduction_func], *args)
2151+
return
2152+
21172153
result = gb.agg([reduction_func], *args)
21182154
expected = getattr(gb, reduction_func)(*args)
21192155

pandas/tests/groupby/test_function.py

+33
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,39 @@ def test_idxmin_idxmax_axis1():
544544
gb2.idxmax(axis=1)
545545

546546

547+
@pytest.mark.parametrize(
548+
"func, values, expected_values, warn",
549+
[
550+
("idxmin", [0, 1, 2], [0, 2], None),
551+
("idxmax", [0, 1, 2], [1, 2], None),
552+
("idxmin", [0, np.nan, 2], [np.nan, 2], FutureWarning),
553+
("idxmax", [0, np.nan, 2], [np.nan, 2], FutureWarning),
554+
("idxmin", [1, 0, np.nan], [1, np.nan], FutureWarning),
555+
("idxmax", [1, 0, np.nan], [0, np.nan], FutureWarning),
556+
],
557+
)
558+
@pytest.mark.parametrize("test_series", [True, False])
559+
def test_idxmin_idxmax_skipna_false(func, values, expected_values, warn, test_series):
560+
# GH#54234
561+
df = DataFrame(
562+
{
563+
"a": [1, 1, 2],
564+
"b": values,
565+
}
566+
)
567+
gb = df.groupby("a")
568+
index = Index([1, 2], name="a")
569+
expected = DataFrame({"b": expected_values}, index=index)
570+
if test_series:
571+
gb = gb["b"]
572+
expected = expected["b"]
573+
klass = "Series" if test_series else "DataFrame"
574+
msg = f"The behavior of {klass}GroupBy.{func} with all-NA values"
575+
with tm.assert_produces_warning(warn, match=msg):
576+
result = getattr(gb, func)(skipna=False)
577+
tm.assert_equal(result, expected)
578+
579+
547580
@pytest.mark.parametrize("numeric_only", [True, False, None])
548581
def test_axis1_numeric_only(request, groupby_func, numeric_only):
549582
if groupby_func in ("idxmax", "idxmin"):

0 commit comments

Comments
 (0)