Skip to content

Commit 1de399c

Browse files
rhshadrachpmhatre1
authored andcommitted
REGR: groupby.idxmin/idxmax wrong result on extreme values (pandas-dev#57046)
* WIP * REGR: groupby.idxmin/idxmax wrong result on extreme values * fixes * NumPy 2.0 can't come soon enough * fixup * fixup
1 parent 68fca6b commit 1de399c

File tree

4 files changed

+75
-7
lines changed

4 files changed

+75
-7
lines changed

doc/source/whatsnew/v2.2.1.rst

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
1717
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
18+
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
19+
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
1820
- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
1921

2022
.. ---------------------------------------------------------------------------

pandas/_libs/groupby.pyx

+10-7
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,7 @@ def group_idxmin_idxmax(
17711771
Py_ssize_t i, j, N, K, lab
17721772
numeric_object_t val
17731773
numeric_object_t[:, ::1] group_min_or_max
1774+
uint8_t[:, ::1] seen
17741775
bint uses_mask = mask is not None
17751776
bint isna_entry
17761777
bint compute_max = name == "idxmax"
@@ -1784,13 +1785,10 @@ def group_idxmin_idxmax(
17841785

17851786
if numeric_object_t is object:
17861787
group_min_or_max = np.empty((<object>out).shape, dtype=object)
1788+
seen = np.zeros((<object>out).shape, dtype=np.uint8)
17871789
else:
17881790
group_min_or_max = np.empty_like(out, dtype=values.dtype)
1789-
if N > 0 and K > 0:
1790-
# When N or K is zero, we never use group_min_or_max
1791-
group_min_or_max[:] = _get_min_or_max(
1792-
values[0, 0], compute_max, is_datetimelike
1793-
)
1791+
seen = np.zeros_like(out, dtype=np.uint8)
17941792

17951793
# When using transform, we need a valid value for take in the case
17961794
# a category is not observed; these values will be dropped
@@ -1806,6 +1804,7 @@ def group_idxmin_idxmax(
18061804
if not skipna and out[lab, j] == -1:
18071805
# Once we've hit NA there is no going back
18081806
continue
1807+
18091808
val = values[i, j]
18101809

18111810
if uses_mask:
@@ -1814,10 +1813,14 @@ def group_idxmin_idxmax(
18141813
isna_entry = _treat_as_na(val, is_datetimelike)
18151814

18161815
if isna_entry:
1817-
if not skipna:
1816+
if not skipna or not seen[lab, j]:
18181817
out[lab, j] = -1
18191818
else:
1820-
if compute_max:
1819+
if not seen[lab, j]:
1820+
seen[lab, j] = True
1821+
group_min_or_max[lab, j] = val
1822+
out[lab, j] = i
1823+
elif compute_max:
18211824
if val > group_min_or_max[lab, j]:
18221825
group_min_or_max[lab, j] = val
18231826
out[lab, j] = i

pandas/core/groupby/ops.py

+1
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ def _call_cython_op(
424424
mask=mask,
425425
result_mask=result_mask,
426426
is_datetimelike=is_datetimelike,
427+
**kwargs,
427428
)
428429
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
429430
if self.how in ["std", "sem"]:

pandas/tests/groupby/test_reductions.py

+62
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,68 @@ def test_empty(frame_or_series, all_boolean_reductions):
257257
tm.assert_equal(result, expected)
258258

259259

260+
@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
261+
def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
262+
# GH#57040
263+
if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
264+
# No need to test
265+
return
266+
info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
267+
min_value = info(any_real_numpy_dtype).min
268+
max_value = info(any_real_numpy_dtype).max
269+
df = DataFrame(
270+
{"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
271+
dtype=any_real_numpy_dtype,
272+
)
273+
gb = df.groupby("a")
274+
result = getattr(gb, how)()
275+
expected = DataFrame(
276+
{"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
277+
)
278+
tm.assert_frame_equal(result, expected)
279+
280+
281+
@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
282+
def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
283+
# GH#57040
284+
min_value = np.finfo(float_numpy_dtype).min
285+
max_value = np.finfo(float_numpy_dtype).max
286+
df = DataFrame(
287+
{
288+
"a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
289+
"b": Series(
290+
[
291+
np.nan,
292+
min_value,
293+
np.nan,
294+
max_value,
295+
min_value,
296+
np.nan,
297+
max_value,
298+
np.nan,
299+
np.nan,
300+
np.nan,
301+
],
302+
dtype=float_numpy_dtype,
303+
),
304+
},
305+
)
306+
gb = df.groupby("a")
307+
308+
warn = None if skipna else FutureWarning
309+
msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
310+
with tm.assert_produces_warning(warn, match=msg):
311+
result = getattr(gb, how)(skipna=skipna)
312+
if skipna:
313+
values = [1, 3, 4, 6, np.nan]
314+
else:
315+
values = np.nan
316+
expected = DataFrame(
317+
{"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp")
318+
)
319+
tm.assert_frame_equal(result, expected)
320+
321+
260322
@pytest.mark.parametrize(
261323
"func, values",
262324
[

0 commit comments

Comments
 (0)