
Commit 2d80d92

BUG: avoid RuntimeError in groupby.max (#46408)
Parent: 6e1ba74

3 files changed: +47 -54 lines
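
Before this fix, aggregating an unmasked uint64 column over a grouping that produced an empty group hit the runtime_error path in the Cython kernels and raised RuntimeError("empty group with uint64_t"). The sketch below shows the user-facing change; the frame construction is an assumption reconstructed to match the expected values in the updated test, since the diff only shows the tail of that test.

import pandas as pd

# uint64 frame sampled every 12 hours; dropping 2000-01-02 leaves an
# empty daily bin when resampling.
df = pd.DataFrame(
    index=pd.date_range(start="2000-01-01", end="2000-01-03 23", freq="12h"),
    columns=["x"],
    data=[0, 1, 0] * 2,
    dtype="uint64",
)
df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :]

# Previously: RuntimeError("empty group with uint64_t")
# Now: the result is cast to float64 and the empty bin becomes NaN.
result = df.resample("D").max()
print(result)
#               x
# 2000-01-01  1.0
# 2000-01-02  NaN
# 2000-01-03  0.0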

pandas/_libs/groupby.pyx (+38 -31)

@@ -1047,6 +1047,7 @@ def group_last(
     const uint8_t[:, :] mask,
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=-1,
+    bint is_datetimelike=False,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -1056,7 +1057,6 @@ def group_last(
         iu_64_floating_obj_t val
         ndarray[iu_64_floating_obj_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
-        bint runtime_error = False
         bint uses_mask = mask is not None
         bint isna_entry

@@ -1116,35 +1116,38 @@ def group_last(
                     if uses_mask:
                         isna_entry = mask[i, j]
                     else:
-                        isna_entry = _treat_as_na(val, True)
-                        # TODO: Sure we always want is_datetimelike=True?
+                        isna_entry = _treat_as_na(val, is_datetimelike)

                     if not isna_entry:
                         nobs[lab, j] += 1
                         resx[lab, j] = val

             for i in range(ncounts):
                 for j in range(K):
+                    # TODO(cython3): the entire next block can be shared
+                    # across 3 places once conditional-nogil is available
                     if nobs[i, j] < min_count:
+                        # if we are integer dtype, not is_datetimelike, and
+                        # not uses_mask, then getting here implies that
+                        # counts[i] < min_count, which means we will
+                        # be cast to float64 and masked at the end
+                        # of WrappedCythonOp._call_cython_op. So we can safely
+                        # set a placeholder value in out[i, j].
                         if uses_mask:
                             result_mask[i, j] = True
                         elif iu_64_floating_obj_t is int64_t:
-                            # TODO: only if datetimelike?
+                            # Per above, this is a placeholder in
+                            # non-is_datetimelike cases.
                             out[i, j] = NPY_NAT
                         elif iu_64_floating_obj_t is uint64_t:
-                            runtime_error = True
-                            break
+                            # placeholder, see above
+                            out[i, j] = 0
                         else:
                             out[i, j] = NAN

                     else:
                         out[i, j] = resx[i, j]

-    if runtime_error:
-        # We cannot raise directly above because that is within a nogil
-        # block.
-        raise RuntimeError("empty group with uint64_t")
-

 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
 # use `const iu_64_floating_obj_t[:, :] values`
@@ -1159,6 +1162,7 @@ def group_nth(
     uint8_t[:, ::1] result_mask=None,
     int64_t min_count=-1,
     int64_t rank=1,
+    bint is_datetimelike=False,
 ) -> None:
     """
     Only aggregates on axis=0
@@ -1168,7 +1172,6 @@ def group_nth(
         iu_64_floating_obj_t val
         ndarray[iu_64_floating_obj_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
-        bint runtime_error = False
         bint uses_mask = mask is not None
         bint isna_entry

@@ -1230,8 +1233,7 @@ def group_nth(
                     if uses_mask:
                         isna_entry = mask[i, j]
                     else:
-                        isna_entry = _treat_as_na(val, True)
-                        # TODO: Sure we always want is_datetimelike=True?
+                        isna_entry = _treat_as_na(val, is_datetimelike)

                     if not isna_entry:
                         nobs[lab, j] += 1
@@ -1241,25 +1243,27 @@ def group_nth(
             for i in range(ncounts):
                 for j in range(K):
                     if nobs[i, j] < min_count:
+                        # if we are integer dtype, not is_datetimelike, and
+                        # not uses_mask, then getting here implies that
+                        # counts[i] < min_count, which means we will
+                        # be cast to float64 and masked at the end
+                        # of WrappedCythonOp._call_cython_op. So we can safely
+                        # set a placeholder value in out[i, j].
                         if uses_mask:
                             result_mask[i, j] = True
                             out[i, j] = 0
                         elif iu_64_floating_obj_t is int64_t:
-                            # TODO: only if datetimelike?
+                            # Per above, this is a placeholder in
+                            # non-is_datetimelike cases.
                             out[i, j] = NPY_NAT
                         elif iu_64_floating_obj_t is uint64_t:
-                            runtime_error = True
-                            break
+                            # placeholder, see above
+                            out[i, j] = 0
                         else:
                             out[i, j] = NAN
                     else:
                         out[i, j] = resx[i, j]

-    if runtime_error:
-        # We cannot raise directly above because that is within a nogil
-        # block.
-        raise RuntimeError("empty group with uint64_t")
-

 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -1386,7 +1390,6 @@ cdef group_min_max(
         Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
         iu_64_floating_t val, nan_val
         ndarray[iu_64_floating_t, ndim=2] group_min_or_max
-        bint runtime_error = False
         int64_t[:, ::1] nobs
         bint uses_mask = mask is not None
         bint isna_entry
@@ -1403,7 +1406,6 @@ cdef group_min_max(
     group_min_or_max[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max, is_datetimelike)

     if iu_64_floating_t is int64_t:
-        # TODO: only if is_datetimelike?
         nan_val = NPY_NAT
     elif iu_64_floating_t is uint64_t:
         # NB: We do not define nan_val because there is no such thing
@@ -1442,25 +1444,30 @@ cdef group_min_max(
         for i in range(ngroups):
             for j in range(K):
                 if nobs[i, j] < min_count:
+                    # if we are integer dtype, not is_datetimelike, and
+                    # not uses_mask, then getting here implies that
+                    # counts[i] < min_count, which means we will
+                    # be cast to float64 and masked at the end
+                    # of WrappedCythonOp._call_cython_op. So we can safely
+                    # set a placeholder value in out[i, j].
                     if uses_mask:
                         result_mask[i, j] = True
                         # set out[i, j] to 0 to be deterministic, as
                         # it was initialized with np.empty. Also ensures
                         # we can downcast out if appropriate.
                         out[i, j] = 0
+                    elif iu_64_floating_t is int64_t:
+                        # Per above, this is a placeholder in
+                        # non-is_datetimelike cases.
+                        out[i, j] = nan_val
                     elif iu_64_floating_t is uint64_t:
-                        runtime_error = True
-                        break
+                        # placeholder, see above
+                        out[i, j] = 0
                     else:
                         out[i, j] = nan_val
                 else:
                     out[i, j] = group_min_or_max[i, j]

-    if runtime_error:
-        # We cannot raise directly above because that is within a nogil
-        # block.
-        raise RuntimeError("empty group with uint64_t")
-

 @cython.wraparound(False)
 @cython.boundscheck(False)
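
The new comments in these kernels rely on a post-processing step in WrappedCythonOp._call_cython_op: when a group's observation count falls below min_count and the output buffer is an unmasked integer dtype, the result is cast to float64 and the undercounted positions are masked out, so the placeholder the kernel wrote (0 for uint64, NPY_NAT for int64) never reaches the user. Below is a minimal NumPy sketch of that cast-and-mask idea, not the actual pandas internals.

import numpy as np

# Conceptual sketch only: `out` stands in for the kernel's output buffer,
# with 0 written as a placeholder for the empty group.
counts = np.array([2, 0, 1], dtype=np.int64)   # observations per group
out = np.array([7, 0, 3], dtype=np.uint64)     # 0 in group 1 is a placeholder
min_count = 1

# The wrapper then casts to float64 and masks the undercounted groups,
# so the placeholder value is never observable in the final result.
result = out.astype(np.float64)
result[counts < min_count] = np.nan
print(result)  # [ 7. nan  3.]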

pandas/core/groupby/ops.py (+1 -11)

@@ -519,7 +519,7 @@ def _call_cython_op(
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if self.kind == "aggregate":
             counts = np.zeros(ngroups, dtype=np.int64)
-            if self.how in ["min", "max", "mean"]:
+            if self.how in ["min", "max", "mean", "last", "first"]:
                 func(
                     out=result,
                     counts=counts,
@@ -530,16 +530,6 @@ def _call_cython_op(
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
-            elif self.how in ["first", "last"]:
-                func(
-                    out=result,
-                    counts=counts,
-                    values=values,
-                    labels=comp_ids,
-                    min_count=min_count,
-                    mask=mask,
-                    result_mask=result_mask,
-                )
             elif self.how in ["add"]:
                 # We support datetimelike
                 func(
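
Folding "first"/"last" into the same branch as "min"/"max"/"mean" means group_last and group_nth now receive is_datetimelike instead of hard-coding _treat_as_na(val, True); presumably the old hard-coded flag could treat a plain int64 value equal to the NaT sentinel as missing. For datetime-like data the flag remains True, so NaT is still skipped, as in this small pandas-level illustration (not a direct call into the Cython kernels):

import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "b"],
        "ts": pd.to_datetime(["2022-01-01", None, "2022-01-02", "2022-01-03"]),
    }
)

# With is_datetimelike=True, NaT is detected as missing, so .last()
# returns the last non-missing timestamp in each group.
print(df.groupby("key")["ts"].last())
# key
# a   2022-01-01
# b   2022-01-03
# Name: ts, dtype: datetime64[ns]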

pandas/tests/resample/test_datetime_index.py (+8 -12)

@@ -1856,15 +1856,11 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype):
     )
     df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :]

-    if any_unsigned_int_numpy_dtype == "uint64":
-        with pytest.raises(RuntimeError, match="empty group with uint64_t"):
-            df.resample("D").max()
-    else:
-        result = df.resample("D").max()
-
-        expected = DataFrame(
-            [1, np.nan, 0],
-            columns=["x"],
-            index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"),
-        )
-        tm.assert_frame_equal(result, expected)
+    result = df.resample("D").max()
+
+    expected = DataFrame(
+        [1, np.nan, 0],
+        columns=["x"],
+        index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"),
+    )
+    tm.assert_frame_equal(result, expected)
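
The resample call in this test dispatches to the same groupby kernels, so the fix is not specific to resample. Empty groups also arise from, for example, unobserved categorical keys; the sketch below is an assumed but representative case (the commit itself only tests the resample path), which before this change would presumably have hit the same RuntimeError for uint64.

import numpy as np
import pandas as pd

# The unobserved category "b" produces an empty group; the uint64 result
# is cast to float64 so that group can be reported as NaN.
key = pd.Categorical(["a", "a", "c"], categories=["a", "b", "c"])
df = pd.DataFrame({"key": key, "x": np.array([1, 2, 3], dtype="uint64")})

result = df.groupby("key", observed=False)["x"].max()
print(result)
# key
# a    2.0
# b    NaN
# c    3.0
# Name: x, dtype: float64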
