From dd3dc81589d8d3b87d9f04350ec35959f01d16af Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Mar 2022 13:38:48 -0700 Subject: [PATCH] BUG: avoid RuntimeError in groupby.max --- pandas/_libs/groupby.pyx | 69 +++++++++++--------- pandas/core/groupby/ops.py | 12 +--- pandas/tests/resample/test_datetime_index.py | 20 +++--- 3 files changed, 47 insertions(+), 54 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d84cc4873a250..899c56a0a075e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1047,6 +1047,7 @@ def group_last( const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, + bint is_datetimelike=False, ) -> None: """ Only aggregates on axis=0 @@ -1056,7 +1057,6 @@ def group_last( iu_64_floating_obj_t val ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs - bint runtime_error = False bint uses_mask = mask is not None bint isna_entry @@ -1116,8 +1116,7 @@ def group_last( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = _treat_as_na(val, True) - # TODO: Sure we always want is_datetimelike=True? + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: nobs[lab, j] += 1 @@ -1125,26 +1124,30 @@ def group_last( for i in range(ncounts): for j in range(K): + # TODO(cython3): the entire next block can be shared + # across 3 places once conditional-nogil is available if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True elif iu_64_floating_obj_t is int64_t: - # TODO: only if datetimelike? + # Per above, this is a placeholder in + # non-is_datetimelike cases. out[i, j] = NPY_NAT elif iu_64_floating_obj_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = NAN else: out[i, j] = resx[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @@ -1159,6 +1162,7 @@ def group_nth( uint8_t[:, ::1] result_mask=None, int64_t min_count=-1, int64_t rank=1, + bint is_datetimelike=False, ) -> None: """ Only aggregates on axis=0 @@ -1168,7 +1172,6 @@ def group_nth( iu_64_floating_obj_t val ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs - bint runtime_error = False bint uses_mask = mask is not None bint isna_entry @@ -1230,8 +1233,7 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = _treat_as_na(val, True) - # TODO: Sure we always want is_datetimelike=True? + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: nobs[lab, j] += 1 @@ -1241,25 +1243,27 @@ def group_nth( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True out[i, j] = 0 elif iu_64_floating_obj_t is int64_t: - # TODO: only if datetimelike? 
+ # Per above, this is a placeholder in + # non-is_datetimelike cases. out[i, j] = NPY_NAT elif iu_64_floating_obj_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = NAN else: out[i, j] = resx[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - @cython.boundscheck(False) @cython.wraparound(False) @@ -1386,7 +1390,6 @@ cdef group_min_max( Py_ssize_t i, j, N, K, lab, ngroups = len(counts) iu_64_floating_t val, nan_val ndarray[iu_64_floating_t, ndim=2] group_min_or_max - bint runtime_error = False int64_t[:, ::1] nobs bint uses_mask = mask is not None bint isna_entry @@ -1403,7 +1406,6 @@ cdef group_min_max( group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) if iu_64_floating_t is int64_t: - # TODO: only if is_datetimelike? nan_val = NPY_NAT elif iu_64_floating_t is uint64_t: # NB: We do not define nan_val because there is no such thing @@ -1442,25 +1444,30 @@ cdef group_min_max( for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True # set out[i, j] to 0 to be deterministic, as # it was initialized with np.empty. Also ensures # we can downcast out if appropriate. out[i, j] = 0 + elif iu_64_floating_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. + out[i, j] = nan_val elif iu_64_floating_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = nan_val else: out[i, j] = group_min_or_max[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. 
- raise RuntimeError("empty group with uint64_t") - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 130c69400ea5f..15bc262997075 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -519,7 +519,7 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean"]: + if self.how in ["min", "max", "mean", "last", "first"]: func( out=result, counts=counts, @@ -530,16 +530,6 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["first", "last"]: - func( - out=result, - counts=counts, - values=values, - labels=comp_ids, - min_count=min_count, - mask=mask, - result_mask=result_mask, - ) elif self.how in ["add"]: # We support datetimelike func( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8a96643b9834f..fbc3b385e5098 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1856,15 +1856,11 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype): ) df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :] - if any_unsigned_int_numpy_dtype == "uint64": - with pytest.raises(RuntimeError, match="empty group with uint64_t"): - df.resample("D").max() - else: - result = df.resample("D").max() - - expected = DataFrame( - [1, np.nan, 0], - columns=["x"], - index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"), - ) - tm.assert_frame_equal(result, expected) + result = df.resample("D").max() + + expected = DataFrame( + [1, np.nan, 0], + columns=["x"], + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"), + ) + tm.assert_frame_equal(result, expected)