Skip to content

Commit c451294

Browse files
Backport PR #57046 on branch 2.2.x (REGR: groupby.idxmin/idxmax wrong result on extreme values) (#57086)
Backport PR #57046: REGR: groupby.idxmin/idxmax wrong result on extreme values Co-authored-by: Richard Shadrach <[email protected]>
1 parent 441f65d commit c451294

File tree

4 files changed

+75
-7
lines changed

4 files changed

+75
-7
lines changed

doc/source/whatsnew/v2.2.1.rst

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
1717
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
18+
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
19+
- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
1820
- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
1921

2022
.. ---------------------------------------------------------------------------

pandas/_libs/groupby.pyx

+10-7
Original file line numberDiff line numberDiff line change
@@ -1767,6 +1767,7 @@ def group_idxmin_idxmax(
17671767
Py_ssize_t i, j, N, K, lab
17681768
numeric_object_t val
17691769
numeric_object_t[:, ::1] group_min_or_max
1770+
uint8_t[:, ::1] seen
17701771
bint uses_mask = mask is not None
17711772
bint isna_entry
17721773
bint compute_max = name == "idxmax"
@@ -1780,13 +1781,10 @@ def group_idxmin_idxmax(
17801781

17811782
if numeric_object_t is object:
17821783
group_min_or_max = np.empty((<object>out).shape, dtype=object)
1784+
seen = np.zeros((<object>out).shape, dtype=np.uint8)
17831785
else:
17841786
group_min_or_max = np.empty_like(out, dtype=values.dtype)
1785-
if N > 0 and K > 0:
1786-
# When N or K is zero, we never use group_min_or_max
1787-
group_min_or_max[:] = _get_min_or_max(
1788-
values[0, 0], compute_max, is_datetimelike
1789-
)
1787+
seen = np.zeros_like(out, dtype=np.uint8)
17901788

17911789
# When using transform, we need a valid value for take in the case
17921790
# a category is not observed; these values will be dropped
@@ -1802,6 +1800,7 @@ def group_idxmin_idxmax(
18021800
if not skipna and out[lab, j] == -1:
18031801
# Once we've hit NA there is no going back
18041802
continue
1803+
18051804
val = values[i, j]
18061805

18071806
if uses_mask:
@@ -1810,10 +1809,14 @@ def group_idxmin_idxmax(
18101809
isna_entry = _treat_as_na(val, is_datetimelike)
18111810

18121811
if isna_entry:
1813-
if not skipna:
1812+
if not skipna or not seen[lab, j]:
18141813
out[lab, j] = -1
18151814
else:
1816-
if compute_max:
1815+
if not seen[lab, j]:
1816+
seen[lab, j] = True
1817+
group_min_or_max[lab, j] = val
1818+
out[lab, j] = i
1819+
elif compute_max:
18171820
if val > group_min_or_max[lab, j]:
18181821
group_min_or_max[lab, j] = val
18191822
out[lab, j] = i

pandas/core/groupby/ops.py

+1
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,7 @@ def _call_cython_op(
424424
mask=mask,
425425
result_mask=result_mask,
426426
is_datetimelike=is_datetimelike,
427+
**kwargs,
427428
)
428429
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
429430
if self.how in ["std", "sem"]:

pandas/tests/groupby/test_reductions.py

+62
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,68 @@ def test_empty(frame_or_series, bool_agg_func):
195195
tm.assert_equal(result, expected)
196196

197197

198+
@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
    # GH#57040
    # Group 1 (rows 1 and 2) holds only the dtype's maximum and group 2
    # (rows 0 and 3) holds only its minimum, so both idxmin and idxmax are
    # expected to report the first row of each group.
    if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
        # No need to test
        return

    if "int" in any_real_numpy_dtype:
        limits = np.iinfo(any_real_numpy_dtype)
    else:
        limits = np.finfo(any_real_numpy_dtype)
    lowest, highest = limits.min, limits.max

    frame = DataFrame(
        {"a": [2, 1, 1, 2], "b": [lowest, highest, highest, lowest]},
        dtype=any_real_numpy_dtype,
    )
    result = getattr(frame.groupby("a"), how)()

    expected_index = pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
    expected = DataFrame({"b": [1, 0]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
217+
218+
219+
@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
    # GH#57040
    limits = np.finfo(float_numpy_dtype)
    lowest, highest = limits.min, limits.max
    na = np.nan

    # Five groups of two consecutive rows each. Groups 1-4 pair a single
    # extreme value with one NA (in either order); group 5 is all-NA.
    keys = Series(np.repeat(range(1, 6), repeats=2), dtype="intp")
    data = Series(
        [na, lowest, na, highest, lowest, na, highest, na, na, na],
        dtype=float_numpy_dtype,
    )
    grouped = DataFrame({"a": keys, "b": data}).groupby("a")

    msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values"
    expected_warning = FutureWarning if not skipna else None
    with tm.assert_produces_warning(expected_warning, match=msg):
        result = getattr(grouped, how)(skipna=skipna)

    # With skipna the row label of the lone non-NA entry is expected (NaN for
    # the all-NA group); without it every group contains an NA, so NaN
    # throughout.
    expected_values = [1, 3, 4, 6, np.nan] if skipna else np.nan
    expected_index = pd.Index(range(1, 6), name="a", dtype="intp")
    expected = DataFrame({"b": expected_values}, index=expected_index)
    tm.assert_frame_equal(result, expected)
258+
259+
198260
@pytest.mark.parametrize(
199261
"func, values",
200262
[

0 commit comments

Comments
 (0)