From dd3dc81589d8d3b87d9f04350ec35959f01d16af Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Mar 2022 13:38:48 -0700 Subject: [PATCH] BUG: avoid RuntimeError in groupby.max --- pandas/_libs/groupby.pyx | 69 +++++++++++--------- pandas/core/groupby/ops.py | 12 +--- pandas/tests/resample/test_datetime_index.py | 20 +++--- 3 files changed, 47 insertions(+), 54 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d84cc4873a250..899c56a0a075e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1047,6 +1047,7 @@ def group_last( const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, + bint is_datetimelike=False, ) -> None: """ Only aggregates on axis=0 @@ -1056,7 +1057,6 @@ def group_last( iu_64_floating_obj_t val ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs - bint runtime_error = False bint uses_mask = mask is not None bint isna_entry @@ -1116,8 +1116,7 @@ def group_last( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = _treat_as_na(val, True) - # TODO: Sure we always want is_datetimelike=True? + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: nobs[lab, j] += 1 @@ -1125,26 +1124,30 @@ def group_last( for i in range(ncounts): for j in range(K): + # TODO(cython3): the entire next block can be shared + # across 3 places once conditional-nogil is available if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True elif iu_64_floating_obj_t is int64_t: - # TODO: only if datetimelike? + # Per above, this is a placeholder in + # non-is_datetimelike cases. out[i, j] = NPY_NAT elif iu_64_floating_obj_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = NAN else: out[i, j] = resx[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @@ -1159,6 +1162,7 @@ def group_nth( uint8_t[:, ::1] result_mask=None, int64_t min_count=-1, int64_t rank=1, + bint is_datetimelike=False, ) -> None: """ Only aggregates on axis=0 @@ -1168,7 +1172,6 @@ def group_nth( iu_64_floating_obj_t val ndarray[iu_64_floating_obj_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs - bint runtime_error = False bint uses_mask = mask is not None bint isna_entry @@ -1230,8 +1233,7 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] else: - isna_entry = _treat_as_na(val, True) - # TODO: Sure we always want is_datetimelike=True? + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: nobs[lab, j] += 1 @@ -1241,25 +1243,27 @@ def group_nth( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True out[i, j] = 0 elif iu_64_floating_obj_t is int64_t: - # TODO: only if datetimelike? 
+ # Per above, this is a placeholder in + # non-is_datetimelike cases. out[i, j] = NPY_NAT elif iu_64_floating_obj_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = NAN else: out[i, j] = resx[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - @cython.boundscheck(False) @cython.wraparound(False) @@ -1386,7 +1390,6 @@ cdef group_min_max( Py_ssize_t i, j, N, K, lab, ngroups = len(counts) iu_64_floating_t val, nan_val ndarray[iu_64_floating_t, ndim=2] group_min_or_max - bint runtime_error = False int64_t[:, ::1] nobs bint uses_mask = mask is not None bint isna_entry @@ -1403,7 +1406,6 @@ cdef group_min_max( group_min_or_max[:] = _get_min_or_max(0, compute_max, is_datetimelike) if iu_64_floating_t is int64_t: - # TODO: only if is_datetimelike? nan_val = NPY_NAT elif iu_64_floating_t is uint64_t: # NB: We do not define nan_val because there is no such thing @@ -1442,25 +1444,30 @@ cdef group_min_max( for i in range(ngroups): for j in range(K): if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True # set out[i, j] to 0 to be deterministic, as # it was initialized with np.empty. Also ensures # we can downcast out if appropriate. out[i, j] = 0 + elif iu_64_floating_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. + out[i, j] = nan_val elif iu_64_floating_t is uint64_t: - runtime_error = True - break + # placeholder, see above + out[i, j] = 0 else: out[i, j] = nan_val else: out[i, j] = group_min_or_max[i, j] - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. 
- raise RuntimeError("empty group with uint64_t") - @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 130c69400ea5f..15bc262997075 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -519,7 +519,7 @@ def _call_cython_op( result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean"]: + if self.how in ["min", "max", "mean", "last", "first"]: func( out=result, counts=counts, @@ -530,16 +530,6 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["first", "last"]: - func( - out=result, - counts=counts, - values=values, - labels=comp_ids, - min_count=min_count, - mask=mask, - result_mask=result_mask, - ) elif self.how in ["add"]: # We support datetimelike func( diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8a96643b9834f..fbc3b385e5098 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1856,15 +1856,11 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype): ) df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :] - if any_unsigned_int_numpy_dtype == "uint64": - with pytest.raises(RuntimeError, match="empty group with uint64_t"): - df.resample("D").max() - else: - result = df.resample("D").max() - - expected = DataFrame( - [1, np.nan, 0], - columns=["x"], - index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"), - ) - tm.assert_frame_equal(result, expected) + result = df.resample("D").max() + + expected = DataFrame( + [1, np.nan, 0], + columns=["x"], + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"), + ) + tm.assert_frame_equal(result, expected)