diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index f132cdf6bbf6e..2f0ff731ebce3 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -791,7 +791,7 @@ Groupby/resample/rolling
 - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
 - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
 - Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)
-
+- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementation bounds (:issue:`40767`)
 
 Reshaping
 ^^^^^^^^^
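For context, the rounding described in the whatsnew entry involves values adjacent to the ``int64`` minimum, whose bit pattern doubles as pandas' ``iNaT`` sentinel. A minimal sketch of the failing case (not part of the patch; assumes a pandas build that includes this change):

```python
import numpy as np
import pandas as pd

# int64 min shares its bit pattern with iNaT, so its presence used to
# force a lossy float64 pre-conversion of the whole column.
min_val = np.iinfo(np.int64).min
ser = pd.Series([min_val + 1, min_val], dtype=np.int64)

result = ser.groupby([1, 1]).cummin()
# Before this fix, min_val + 1 was routed through float64 and rounded
# down to min_val; with the fix the int64 path is preserved exactly.
assert result.iloc[0] == min_val + 1
```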
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 29e74fe4c0d8f..8fb307150a48f 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1139,6 +1139,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
                    ndarray[groupby_t, ndim=2] values,
                    const intp_t[:] labels,
                    Py_ssize_t min_count=-1,
+                   bint is_datetimelike=False,
                    bint compute_max=True):
     """
     Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1156,6 +1157,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
     min_count : Py_ssize_t, default -1
         The minimum number of non-NA group elements, NA result if threshold
         is not met
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
     compute_max : bint, default True
         True to compute group-wise max, False to compute min
 
@@ -1204,8 +1207,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not _treat_as_na(val, True):
-                    # TODO: Sure we always want is_datetimelike=True?
+                if not _treat_as_na(val, is_datetimelike):
                     nobs[lab, j] += 1
                     if compute_max:
                         if val > group_min_or_max[lab, j]:
@@ -1237,9 +1239,18 @@ def group_max(groupby_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[:] labels,
-              Py_ssize_t min_count=-1) -> None:
+              Py_ssize_t min_count=-1,
+              bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=True,
+    )
 
 
 @cython.wraparound(False)
@@ -1248,9 +1259,18 @@ def group_min(groupby_t[:, ::1] out,
              int64_t[::1] counts,
              ndarray[groupby_t, ndim=2] values,
              const intp_t[:] labels,
-             Py_ssize_t min_count=-1) -> None:
+             Py_ssize_t min_count=-1,
+             bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=False,
+    )
 
 
 @cython.boundscheck(False)
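The point of threading ``is_datetimelike`` through to ``_treat_as_na`` is that ``iNaT`` only means "missing" when the values are datetime-like; for plain integers it is a legitimate, very negative value. A rough illustration of the distinction, assuming a build with this patch applied:

```python
import numpy as np
import pandas as pd
from pandas._libs.tslibs import iNaT  # == np.iinfo(np.int64).min

# Plain int64: iNaT is an ordinary value and participates in min/max.
ser = pd.Series([1, iNaT], dtype=np.int64)
assert ser.groupby([1, 1]).min(min_count=2).iloc[0] == iNaT

# datetime64: the same bit pattern means NaT and is skipped as missing.
dti = pd.Series(np.array([1, iNaT], dtype="datetime64[ns]"))
assert dti.groupby([1, 1]).min().iloc[0] == pd.Timestamp(1)
```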
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3ab92ab5afa9c..702d67b198e8d 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -20,7 +20,6 @@
 from pandas._libs import (
     NaT,
-    iNaT,
     lib,
 )
 import pandas._libs.groupby as libgroupby
@@ -663,12 +662,7 @@ def _cython_operation(
         elif is_bool_dtype(dtype):
             values = ensure_int_or_float(values)
         elif is_integer_dtype(dtype):
-            # we use iNaT for the missing value on ints
-            # so pre-convert to guard this condition
-            if (values == iNaT).any():
-                values = ensure_float64(values)
-            else:
-                values = ensure_int_or_float(values)
+            values = ensure_int_or_float(values)
         elif is_numeric:
             if not is_complex_dtype(dtype):
                 values = ensure_float64(values)
@@ -686,20 +680,36 @@ def _cython_operation(
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if kind == "aggregate":
             counts = np.zeros(ngroups, dtype=np.int64)
-            func(result, counts, values, comp_ids, min_count)
+            if how in ["min", "max"]:
+                func(
+                    result,
+                    counts,
+                    values,
+                    comp_ids,
+                    min_count,
+                    is_datetimelike=is_datetimelike,
+                )
+            else:
+                func(result, counts, values, comp_ids, min_count)
         elif kind == "transform":
             # TODO: min_count
             func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
-        if is_integer_dtype(result.dtype) and not is_datetimelike:
-            mask = result == iNaT
-            if mask.any():
-                result = result.astype("float64")
-                result[mask] = np.nan
-
-        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
-            assert result.ndim != 2
-            result = result[counts > 0]
+        if kind == "aggregate":
+            # i.e. counts is defined.  Locations where count<min_count
+            # need to have the result set to np.nan, which may require casting,
+            # see GH#40767
+            if is_integer_dtype(result.dtype) and not is_datetimelike:
+                cutoff = max(1, min_count)
+                empty_groups = counts < cutoff
+                if empty_groups.any():
+                    # Note: this conversion could be lossy, see GH#40767
+                    result = result.astype("float64")
+                    result[empty_groups] = np.nan
+
+            if self._filter_empty_groups and not counts.all():
+                assert result.ndim != 2
+                result = result[counts > 0]
 
         result = result.T
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 843d438018a32..d7020e2ffd701 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs.tslibs import iNaT
 from pandas.errors import UnsupportedFunctionCall
 
 import pandas as pd
@@ -591,6 +592,38 @@ def test_max_nan_bug():
     assert not r["File"].isna().any()
 
 
+def test_max_inat():
+    # GH#40767 don't interpret iNaT as NaN
+    ser = Series([1, iNaT])
+    gb = ser.groupby([1, 1])
+
+    result = gb.max(min_count=2)
+    expected = Series({1: 1}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    result = gb.min(min_count=2)
+    expected = Series({1: iNaT}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    # not enough entries -> gets masked to NaN
+    result = gb.min(min_count=3)
+    expected = Series({1: np.nan})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_max_inat_not_all_na():
+    # GH#40767 don't interpret iNaT as NaN
+
+    # make sure we don't round iNaT + 1 to iNaT
+    ser = Series([1, iNaT, 2, iNaT + 1])
+    gb = ser.groupby([1, 2, 3, 3])
+    result = gb.min(min_count=2)
+
+    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
+    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
 def test_nlargest():
     a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
     b = Series(list("a" * 5 + "b" * 5))
@@ -708,11 +741,13 @@ def test_cummin(numpy_dtypes_for_minmax):
 
     # Test w/ min value for dtype
     df.loc[[2, 6], "B"] = min_val
+    df.loc[[1, 5], "B"] = min_val + 1
     expected.loc[[2, 3, 6, 7], "B"] = min_val
+    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
     result = df.groupby("A").cummin()
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_exact=True)
     expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_exact=True)
 
     # Test nan in some values
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan
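The ``check_exact=True`` assertions and the "lossy" comments above trace back to the same ``float64`` limitation, which can be verified in isolation (a standalone check, not part of the patch):

```python
import numpy as np

# float64 has 53 significant bits, so near the int64 minimum adjacent
# representable values are 1024 apart: iNaT + 1 has no exact float64
# representation and rounds back to iNaT on conversion.
iNaT = np.iinfo(np.int64).min  # -9223372036854775808, pandas' NaT sentinel
assert np.float64(iNaT + 1) == np.float64(iNaT)
assert int(np.float64(iNaT + 1)) == iNaT
```

This is why ``test_max_inat_not_all_na`` can only expect ``iNaT + 1`` to survive while the result stays in ``int64``: any masking path that casts to ``float64`` necessarily loses it.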