Skip to content

Commit 51a5d9c

Browse files
jbrockmendel and yeshsurya
authored and committed
BUG: incorrect rounding in groupby.cummin near int64 implementation bounds (pandas-dev#40767)
1 parent a49464e commit 51a5d9c

File tree

4 files changed

+91
-26
lines changed

4 files changed

+91
-26
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -794,7 +794,7 @@ Groupby/resample/rolling
794794
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
795795
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
796796
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
797-
797+
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
798798

799799

800800
Reshaping

pandas/_libs/groupby.pyx

+26-6
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
11391139
ndarray[groupby_t, ndim=2] values,
11401140
const intp_t[:] labels,
11411141
Py_ssize_t min_count=-1,
1142+
bint is_datetimelike=False,
11421143
bint compute_max=True):
11431144
"""
11441145
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1156,6 +1157,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
11561157
min_count : Py_ssize_t, default -1
11571158
The minimum number of non-NA group elements, NA result if threshold
11581159
is not met
1160+
is_datetimelike : bool
1161+
True if `values` contains datetime-like entries.
11591162
compute_max : bint, default True
11601163
True to compute group-wise max, False to compute min
11611164
@@ -1204,8 +1207,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
12041207
for j in range(K):
12051208
val = values[i, j]
12061209

1207-
if not _treat_as_na(val, True):
1208-
# TODO: Sure we always want is_datetimelike=True?
1210+
if not _treat_as_na(val, is_datetimelike):
12091211
nobs[lab, j] += 1
12101212
if compute_max:
12111213
if val > group_min_or_max[lab, j]:
@@ -1237,9 +1239,18 @@ def group_max(groupby_t[:, ::1] out,
12371239
int64_t[::1] counts,
12381240
ndarray[groupby_t, ndim=2] values,
12391241
const intp_t[:] labels,
1240-
Py_ssize_t min_count=-1) -> None:
1242+
Py_ssize_t min_count=-1,
1243+
bint is_datetimelike=False) -> None:
12411244
"""See group_min_max.__doc__"""
1242-
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
1245+
group_min_max(
1246+
out,
1247+
counts,
1248+
values,
1249+
labels,
1250+
min_count=min_count,
1251+
is_datetimelike=is_datetimelike,
1252+
compute_max=True,
1253+
)
12431254

12441255

12451256
@cython.wraparound(False)
@@ -1248,9 +1259,18 @@ def group_min(groupby_t[:, ::1] out,
12481259
int64_t[::1] counts,
12491260
ndarray[groupby_t, ndim=2] values,
12501261
const intp_t[:] labels,
1251-
Py_ssize_t min_count=-1) -> None:
1262+
Py_ssize_t min_count=-1,
1263+
bint is_datetimelike=False) -> None:
12521264
"""See group_min_max.__doc__"""
1253-
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
1265+
group_min_max(
1266+
out,
1267+
counts,
1268+
values,
1269+
labels,
1270+
min_count=min_count,
1271+
is_datetimelike=is_datetimelike,
1272+
compute_max=False,
1273+
)
12541274

12551275

12561276
@cython.boundscheck(False)

pandas/core/groupby/ops.py

+27-17
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
from pandas._libs import (
2222
NaT,
23-
iNaT,
2423
lib,
2524
)
2625
import pandas._libs.groupby as libgroupby
@@ -663,12 +662,7 @@ def _cython_operation(
663662
elif is_bool_dtype(dtype):
664663
values = ensure_int_or_float(values)
665664
elif is_integer_dtype(dtype):
666-
# we use iNaT for the missing value on ints
667-
# so pre-convert to guard this condition
668-
if (values == iNaT).any():
669-
values = ensure_float64(values)
670-
else:
671-
values = ensure_int_or_float(values)
665+
values = ensure_int_or_float(values)
672666
elif is_numeric:
673667
if not is_complex_dtype(dtype):
674668
values = ensure_float64(values)
@@ -686,20 +680,36 @@ def _cython_operation(
686680
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
687681
if kind == "aggregate":
688682
counts = np.zeros(ngroups, dtype=np.int64)
689-
func(result, counts, values, comp_ids, min_count)
683+
if how in ["min", "max"]:
684+
func(
685+
result,
686+
counts,
687+
values,
688+
comp_ids,
689+
min_count,
690+
is_datetimelike=is_datetimelike,
691+
)
692+
else:
693+
func(result, counts, values, comp_ids, min_count)
690694
elif kind == "transform":
691695
# TODO: min_count
692696
func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
693697

694-
if is_integer_dtype(result.dtype) and not is_datetimelike:
695-
mask = result == iNaT
696-
if mask.any():
697-
result = result.astype("float64")
698-
result[mask] = np.nan
699-
700-
if kind == "aggregate" and self._filter_empty_groups and not counts.all():
701-
assert result.ndim != 2
702-
result = result[counts > 0]
698+
if kind == "aggregate":
699+
# i.e. counts is defined. Locations where count<min_count
700+
# need to have the result set to np.nan, which may require casting,
701+
# see GH#40767
702+
if is_integer_dtype(result.dtype) and not is_datetimelike:
703+
cutoff = max(1, min_count)
704+
empty_groups = counts < cutoff
705+
if empty_groups.any():
706+
# Note: this conversion could be lossy, see GH#40767
707+
result = result.astype("float64")
708+
result[empty_groups] = np.nan
709+
710+
if self._filter_empty_groups and not counts.all():
711+
assert result.ndim != 2
712+
result = result[counts > 0]
703713

704714
result = result.T
705715

pandas/tests/groupby/test_function.py

+37-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pytest
66

7+
from pandas._libs.tslibs import iNaT
78
from pandas.errors import UnsupportedFunctionCall
89

910
import pandas as pd
@@ -591,6 +592,38 @@ def test_max_nan_bug():
591592
assert not r["File"].isna().any()
592593

593594

595+
def test_max_inat():
596+
# GH#40767 dont interpret iNaT as NaN
597+
ser = Series([1, iNaT])
598+
gb = ser.groupby([1, 1])
599+
600+
result = gb.max(min_count=2)
601+
expected = Series({1: 1}, dtype=np.int64)
602+
tm.assert_series_equal(result, expected, check_exact=True)
603+
604+
result = gb.min(min_count=2)
605+
expected = Series({1: iNaT}, dtype=np.int64)
606+
tm.assert_series_equal(result, expected, check_exact=True)
607+
608+
# not enough entries -> gets masked to NaN
609+
result = gb.min(min_count=3)
610+
expected = Series({1: np.nan})
611+
tm.assert_series_equal(result, expected, check_exact=True)
612+
613+
614+
def test_max_inat_not_all_na():
615+
# GH#40767 dont interpret iNaT as NaN
616+
617+
# make sure we dont round iNaT+1 to iNaT
618+
ser = Series([1, iNaT, 2, iNaT + 1])
619+
gb = ser.groupby([1, 2, 3, 3])
620+
result = gb.min(min_count=2)
621+
622+
# Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
623+
expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
624+
tm.assert_series_equal(result, expected, check_exact=True)
625+
626+
594627
def test_nlargest():
595628
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
596629
b = Series(list("a" * 5 + "b" * 5))
@@ -708,11 +741,13 @@ def test_cummin(numpy_dtypes_for_minmax):
708741

709742
# Test w/ min value for dtype
710743
df.loc[[2, 6], "B"] = min_val
744+
df.loc[[1, 5], "B"] = min_val + 1
711745
expected.loc[[2, 3, 6, 7], "B"] = min_val
746+
expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val
712747
result = df.groupby("A").cummin()
713-
tm.assert_frame_equal(result, expected)
748+
tm.assert_frame_equal(result, expected, check_exact=True)
714749
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
715-
tm.assert_frame_equal(result, expected)
750+
tm.assert_frame_equal(result, expected, check_exact=True)
716751

717752
# Test nan in some values
718753
base_df.loc[[0, 2, 4, 6], "B"] = np.nan

0 commit comments

Comments (0)