From 7936a9c9f23678791001cccee7254111c1711d77 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 11:45:00 -0700 Subject: [PATCH 1/5] BUG: groupby.cummin losing precision on large integers --- pandas/core/groupby/ops.py | 31 ++++++++++++++------------- pandas/tests/groupby/test_function.py | 6 ++++-- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4b68717763d87..bf17fdeb55bd9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -671,12 +671,7 @@ def _cython_operation( elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): - # we use iNaT for the missing value on ints - # so pre-convert to guard this condition - if (values == iNaT).any(): - values = ensure_float64(values) - else: - values = ensure_int_or_float(values) + values = ensure_int_or_float(values) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) @@ -699,15 +694,21 @@ def _cython_operation( # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) - if is_integer_dtype(result.dtype) and not is_datetimelike: - mask = result == iNaT - if mask.any(): - result = result.astype("float64") - result[mask] = np.nan - - if kind == "aggregate" and self._filter_empty_groups and not counts.all(): - assert result.ndim != 2 - result = result[counts > 0] + if kind == "aggregate": + # i.e. counts is defined + if is_integer_dtype(result.dtype) and not is_datetimelike: + mask = result == iNaT + if mask.any(): + # TODO: we dont have any tests that get here with min_count > 1 + cutoff = max(1, min_count) + empty_groups = counts < cutoff + + result = result.astype("float64") # TODO: could be lossy + result[empty_groups] = np.nan + + if self._filter_empty_groups and not counts.all(): + assert result.ndim != 2 + result = result[counts > 0] result = result.T diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 515774eae009b..84a3bf8215c54 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -752,11 +752,13 @@ def test_cummin(numpy_dtypes_for_minmax): # Test w/ min value for dtype df.loc[[2, 6], "B"] = min_val + df.loc[[1, 5], "B"] = min_val + 1 expected.loc[[2, 3, 6, 7], "B"] = min_val + expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_exact=True) expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_exact=True) # Test nan in some values base_df.loc[[0, 2, 4, 6], "B"] = np.nan From 234419e0d7e49a42bfbed70b0f0d997428d352e5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 14:30:56 -0700 Subject: [PATCH 2/5] simplify check --- pandas/core/groupby/ops.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index bf17fdeb55bd9..548699a2143ac 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -25,7 +25,6 @@ from pandas._libs import ( NaT, - iNaT, lib, ) import pandas._libs.groupby as libgroupby @@ -697,12 +696,10 @@ def _cython_operation( if kind == "aggregate": # i.e. counts is defined if is_integer_dtype(result.dtype) and not is_datetimelike: - mask = result == iNaT - if mask.any(): - # TODO: we dont have any tests that get here with min_count > 1 - cutoff = max(1, min_count) - empty_groups = counts < cutoff - + # TODO: we don't have any tests that get here with min_count > 1 + cutoff = max(1, min_count) + empty_groups = counts < cutoff + if empty_groups.any(): result = result.astype("float64") # TODO: could be lossy result[empty_groups] = np.nan From b8f5a2880d77059b152b60bbee36e734052a2ef9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Apr 2021 18:34:54 -0700 Subject: [PATCH 3/5] Handle min/max --- pandas/_libs/groupby.pyx | 32 +++++++++++++++++++++----- pandas/core/groupby/ops.py | 16 ++++++++++--- pandas/tests/groupby/test_function.py | 33 +++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 48ee01c809efd..5f38f8a1a529b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1122,6 +1122,7 @@ cdef group_min_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, Py_ssize_t min_count=-1, + bint is_datetimelike=False, bint compute_max=True): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1139,6 +1140,8 @@ cdef group_min_max(groupby_t[:, ::1] out, min_count : Py_ssize_t, default -1 The minimum number of non-NA group elements, NA result if threshold is not met + is_datetimelike : bool + True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min @@ -1187,8 +1190,7 @@ cdef group_min_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? + if not _treat_as_na(val, is_datetimelike): nobs[lab, j] += 1 if compute_max: if val > group_min_or_max[lab, j]: @@ -1220,9 +1222,18 @@ def group_max(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1) -> None: + Py_ssize_t min_count=-1, + bint is_datetimelike=False) -> None: """See group_min_max.__doc__""" - group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True) + group_min_max( + out, + counts, + values, + labels, + min_count=min_count, + is_datetimelike=is_datetimelike, + compute_max=True, + ) @cython.wraparound(False) @@ -1231,9 +1242,18 @@ def group_min(groupby_t[:, ::1] out, int64_t[::1] counts, ndarray[groupby_t, ndim=2] values, const intp_t[:] labels, - Py_ssize_t min_count=-1) -> None: + Py_ssize_t min_count=-1, + bint is_datetimelike=False) -> None: """See group_min_max.__doc__""" - group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False) + group_min_max( + out, + counts, + values, + labels, + min_count=min_count, + is_datetimelike=is_datetimelike, + compute_max=False, + ) @cython.boundscheck(False) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6ed4fc50affdb..9b1a110693190 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -694,7 +694,17 @@ def _cython_operation( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - func(result, counts, values, comp_ids, min_count) + if how in ["min", "max"]: + func( + result, + counts, + values, + comp_ids, + min_count, + is_datetimelike=is_datetimelike, + ) + else: + func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) @@ -702,11 +712,11 @@ def _cython_operation( if kind == "aggregate": # i.e. counts is defined if is_integer_dtype(result.dtype) and not is_datetimelike: - # TODO: we don't have any tests that get here with min_count > 1 cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): - result = result.astype("float64") # TODO: could be lossy + # Note: this conversion could be lossy, see GH#40767 + result = result.astype("float64") result[empty_groups] = np.nan if self._filter_empty_groups and not counts.all(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 84a3bf8215c54..3b2114cb650a9 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs.tslibs import iNaT from pandas.errors import UnsupportedFunctionCall import pandas as pd @@ -635,6 +636,38 @@ def test_max_nan_bug(): assert not r["File"].isna().any() +def test_max_inat(): + # GH#40767 dont interpret iNaT as NaN + ser = Series([1, iNaT]) + gb = ser.groupby([1, 1]) + + result = gb.max(min_count=2) + expected = Series({1: 1}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + result = gb.min(min_count=2) + expected = Series({1: iNaT}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + # not enough entries -> gets masked to NaN + result = gb.min(min_count=3) + expected = Series({1: np.nan}) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_max_inat_not_all_na(): + # GH#40767 dont interpret iNaT as NaN + + # make sure we dont round iNaT+1 to iNaT + ser = Series([1, iNaT, 2, iNaT + 1]) + gb = ser.groupby([1, 2, 3, 3]) + result = gb.min(min_count=2) + + # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy + expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) + tm.assert_series_equal(result, expected, check_exact=True) + + def test_nlargest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list("a" * 5 + "b" * 5)) From e5cf34d2201f8125354726e16818639d80a3c749 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 12 Apr 2021 14:34:35 -0700 Subject: [PATCH 4/5] comment+GH ref --- pandas/core/groupby/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d8288fe562825..021244dc613a8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -697,7 +697,9 @@ def _cython_operation( func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if kind == "aggregate": - # i.e. counts is defined + # i.e. counts is defined. Locations where count Date: Tue, 13 Apr 2021 10:09:01 -0700 Subject: [PATCH 5/5] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17cb0a7836fbe..4061987fe6ce7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -787,6 +787,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`) - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`) +- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`) Reshaping ^^^^^^^^^