From 7936a9c9f23678791001cccee7254111c1711d77 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 2 Apr 2021 11:45:00 -0700
Subject: [PATCH 1/5] BUG: groupby.cummin losing precision on large integers

---
 pandas/core/groupby/ops.py            | 31 ++++++++++++++-------------
 pandas/tests/groupby/test_function.py |  6 ++++--
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 4b68717763d87..bf17fdeb55bd9 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -671,12 +671,7 @@ def _cython_operation(
         elif is_bool_dtype(dtype):
             values = ensure_int_or_float(values)
         elif is_integer_dtype(dtype):
-            # we use iNaT for the missing value on ints
-            # so pre-convert to guard this condition
-            if (values == iNaT).any():
-                values = ensure_float64(values)
-            else:
-                values = ensure_int_or_float(values)
+            values = ensure_int_or_float(values)
         elif is_numeric:
             if not is_complex_dtype(dtype):
                 values = ensure_float64(values)
@@ -699,15 +694,21 @@ def _cython_operation(
             # TODO: min_count
             func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
-        if is_integer_dtype(result.dtype) and not is_datetimelike:
-            mask = result == iNaT
-            if mask.any():
-                result = result.astype("float64")
-                result[mask] = np.nan
-
-        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
-            assert result.ndim != 2
-            result = result[counts > 0]
+        if kind == "aggregate":
+            # i.e. counts is defined
+            if is_integer_dtype(result.dtype) and not is_datetimelike:
+                mask = result == iNaT
+                if mask.any():
+                    # TODO: we dont have any tests that get here with min_count > 1
+                    cutoff = max(1, min_count)
+                    empty_groups = counts < cutoff
+
+                    result = result.astype("float64")  # TODO: could be lossy
+                    result[empty_groups] = np.nan
+
+            if self._filter_empty_groups and not counts.all():
+                assert result.ndim != 2
+                result = result[counts > 0]
 
         result = result.T
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 515774eae009b..84a3bf8215c54 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -752,11 +752,13 @@ def test_cummin(numpy_dtypes_for_minmax):
 
     # Test w/ min value for dtype
     df.loc[[2, 6], "B"] = min_val
+    df.loc[[1, 5], "B"] = min_val + 1
     expected.loc[[2, 3, 6, 7], "B"] = min_val
+    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
     result = df.groupby("A").cummin()
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_exact=True)
     expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_exact=True)
 
     # Test nan in some values
     base_df.loc[[0, 2, 4, 6], "B"] = np.nan

From 234419e0d7e49a42bfbed70b0f0d997428d352e5 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 2 Apr 2021 14:30:56 -0700
Subject: [PATCH 2/5] simplify check

---
 pandas/core/groupby/ops.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index bf17fdeb55bd9..548699a2143ac 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -25,7 +25,6 @@
 
 from pandas._libs import (
     NaT,
-    iNaT,
     lib,
 )
 import pandas._libs.groupby as libgroupby
@@ -697,12 +696,10 @@ def _cython_operation(
         if kind == "aggregate":
             # i.e. counts is defined
             if is_integer_dtype(result.dtype) and not is_datetimelike:
-                mask = result == iNaT
-                if mask.any():
-                    # TODO: we dont have any tests that get here with min_count > 1
-                    cutoff = max(1, min_count)
-                    empty_groups = counts < cutoff
-
+                # TODO: we don't have any tests that get here with min_count > 1
+                cutoff = max(1, min_count)
+                empty_groups = counts < cutoff
+                if empty_groups.any():
                     result = result.astype("float64")  # TODO: could be lossy
                     result[empty_groups] = np.nan
 

From b8f5a2880d77059b152b60bbee36e734052a2ef9 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 5 Apr 2021 18:34:54 -0700
Subject: [PATCH 3/5] Handle min/max

---
 pandas/_libs/groupby.pyx              | 32 +++++++++++++++++++++-----
 pandas/core/groupby/ops.py            | 16 ++++++++++---
 pandas/tests/groupby/test_function.py | 33 +++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 48ee01c809efd..5f38f8a1a529b 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1122,6 +1122,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
                    ndarray[groupby_t, ndim=2] values,
                    const intp_t[:] labels,
                    Py_ssize_t min_count=-1,
+                   bint is_datetimelike=False,
                    bint compute_max=True):
     """
     Compute minimum/maximum  of columns of `values`, in row groups `labels`.
@@ -1139,6 +1140,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
     min_count : Py_ssize_t, default -1
         The minimum number of non-NA group elements, NA result if threshold
         is not met
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
     compute_max : bint, default True
         True to compute group-wise max, False to compute min
 
@@ -1187,8 +1190,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not _treat_as_na(val, True):
-                    # TODO: Sure we always want is_datetimelike=True?
+                if not _treat_as_na(val, is_datetimelike):
                     nobs[lab, j] += 1
                     if compute_max:
                         if val > group_min_or_max[lab, j]:
@@ -1220,9 +1222,18 @@ def group_max(groupby_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[:] labels,
-              Py_ssize_t min_count=-1) -> None:
+              Py_ssize_t min_count=-1,
+              bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=True,
+    )
 
 
 @cython.wraparound(False)
@@ -1231,9 +1242,18 @@ def group_min(groupby_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[:] labels,
-              Py_ssize_t min_count=-1) -> None:
+              Py_ssize_t min_count=-1,
+              bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=False,
+    )
 
 
 @cython.boundscheck(False)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 6ed4fc50affdb..9b1a110693190 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -694,7 +694,17 @@ def _cython_operation(
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if kind == "aggregate":
             counts = np.zeros(ngroups, dtype=np.int64)
-            func(result, counts, values, comp_ids, min_count)
+            if how in ["min", "max"]:
+                func(
+                    result,
+                    counts,
+                    values,
+                    comp_ids,
+                    min_count,
+                    is_datetimelike=is_datetimelike,
+                )
+            else:
+                func(result, counts, values, comp_ids, min_count)
         elif kind == "transform":
             # TODO: min_count
             func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
@@ -702,11 +712,11 @@ def _cython_operation(
         if kind == "aggregate":
             # i.e. counts is defined
             if is_integer_dtype(result.dtype) and not is_datetimelike:
-                # TODO: we don't have any tests that get here with min_count > 1
                 cutoff = max(1, min_count)
                 empty_groups = counts < cutoff
                 if empty_groups.any():
-                    result = result.astype("float64")  # TODO: could be lossy
+                    # Note: this conversion could be lossy, see GH#40767
+                    result = result.astype("float64")
                     result[empty_groups] = np.nan
 
             if self._filter_empty_groups and not counts.all():
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 84a3bf8215c54..3b2114cb650a9 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs.tslibs import iNaT
 from pandas.errors import UnsupportedFunctionCall
 
 import pandas as pd
@@ -635,6 +636,38 @@ def test_max_nan_bug():
     assert not r["File"].isna().any()
 
 
+def test_max_inat():
+    # GH#40767 dont interpret iNaT as NaN
+    ser = Series([1, iNaT])
+    gb = ser.groupby([1, 1])
+
+    result = gb.max(min_count=2)
+    expected = Series({1: 1}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    result = gb.min(min_count=2)
+    expected = Series({1: iNaT}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    # not enough entries -> gets masked to NaN
+    result = gb.min(min_count=3)
+    expected = Series({1: np.nan})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_max_inat_not_all_na():
+    # GH#40767 dont interpret iNaT as NaN
+
+    # make sure we dont round iNaT+1 to iNaT
+    ser = Series([1, iNaT, 2, iNaT + 1])
+    gb = ser.groupby([1, 2, 3, 3])
+    result = gb.min(min_count=2)
+
+    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
+    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
 def test_nlargest():
     a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
     b = Series(list("a" * 5 + "b" * 5))

From e5cf34d2201f8125354726e16818639d80a3c749 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 12 Apr 2021 14:34:35 -0700
Subject: [PATCH 4/5] comment+GH ref

---
 pandas/core/groupby/ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d8288fe562825..021244dc613a8 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -697,7 +697,9 @@ def _cython_operation(
             func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
 
         if kind == "aggregate":
-            # i.e. counts is defined
+            # i.e. counts is defined.  Locations where count<min_count
+            # need to have the result set to np.nan, which may require casting,
+            # see GH#40767
             if is_integer_dtype(result.dtype) and not is_datetimelike:
                 cutoff = max(1, min_count)
                 empty_groups = counts < cutoff

From eeba093857b69b7172d69e7ec5d2fe3e93697d1c Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 13 Apr 2021 10:09:01 -0700
Subject: [PATCH 5/5] whatsnew

---
 doc/source/whatsnew/v1.3.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 17cb0a7836fbe..4061987fe6ce7 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -787,6 +787,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`)
 - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
 - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
+- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`)
 
 Reshaping
 ^^^^^^^^^