Skip to content

BUG: avoid RuntimeError in groupby.max #46408

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 38 additions & 31 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,7 @@ def group_last(
const uint8_t[:, :] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
) -> None:
"""
Only aggregates on axis=0
Expand All @@ -1056,7 +1057,6 @@ def group_last(
iu_64_floating_obj_t val
ndarray[iu_64_floating_obj_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False
bint uses_mask = mask is not None
bint isna_entry

Expand Down Expand Up @@ -1116,35 +1116,38 @@ def group_last(
if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, True)
# TODO: Sure we always want is_datetimelike=True?
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
resx[lab, j] = val

for i in range(ncounts):
for j in range(K):
# TODO(cython3): the entire next block can be shared
# across 3 places once conditional-nogil is available
if nobs[i, j] < min_count:
# if we are integer dtype, not is_datetimelike, and
# not uses_mask, then getting here implies that
# counts[i] < min_count, which means we will
# be cast to float64 and masked at the end
# of WrappedCythonOp._call_cython_op. So we can safely
# set a placeholder value in out[i, j].
if uses_mask:
result_mask[i, j] = True
elif iu_64_floating_obj_t is int64_t:
# TODO: only if datetimelike?
# Per above, this is a placeholder in
# non-is_datetimelike cases.
out[i, j] = NPY_NAT
elif iu_64_floating_obj_t is uint64_t:
runtime_error = True
break
# placeholder, see above
out[i, j] = 0
else:
out[i, j] = NAN

else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
# use `const iu_64_floating_obj_t[:, :] values`
Expand All @@ -1159,6 +1162,7 @@ def group_nth(
uint8_t[:, ::1] result_mask=None,
int64_t min_count=-1,
int64_t rank=1,
bint is_datetimelike=False,
) -> None:
"""
Only aggregates on axis=0
Expand All @@ -1168,7 +1172,6 @@ def group_nth(
iu_64_floating_obj_t val
ndarray[iu_64_floating_obj_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False
bint uses_mask = mask is not None
bint isna_entry

Expand Down Expand Up @@ -1230,8 +1233,7 @@ def group_nth(
if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, True)
# TODO: Sure we always want is_datetimelike=True?
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
Expand All @@ -1241,25 +1243,27 @@ def group_nth(
for i in range(ncounts):
for j in range(K):
if nobs[i, j] < min_count:
# if we are integer dtype, not is_datetimelike, and
# not uses_mask, then getting here implies that
# counts[i] < min_count, which means we will
# be cast to float64 and masked at the end
# of WrappedCythonOp._call_cython_op. So we can safely
# set a placeholder value in out[i, j].
if uses_mask:
result_mask[i, j] = True
out[i, j] = 0
elif iu_64_floating_obj_t is int64_t:
# TODO: only if datetimelike?
# Per above, this is a placeholder in
# non-is_datetimelike cases.
out[i, j] = NPY_NAT
elif iu_64_floating_obj_t is uint64_t:
runtime_error = True
break
# placeholder, see above
out[i, j] = 0
else:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.boundscheck(False)
@cython.wraparound(False)
Expand Down Expand Up @@ -1386,7 +1390,6 @@ cdef group_min_max(
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
iu_64_floating_t val, nan_val
ndarray[iu_64_floating_t, ndim=2] group_min_or_max
bint runtime_error = False
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry
Expand All @@ -1403,7 +1406,6 @@ cdef group_min_max(
group_min_or_max[:] = _get_min_or_max(<iu_64_floating_t>0, compute_max, is_datetimelike)

if iu_64_floating_t is int64_t:
# TODO: only if is_datetimelike?
nan_val = NPY_NAT
elif iu_64_floating_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
Expand Down Expand Up @@ -1442,25 +1444,30 @@ cdef group_min_max(
for i in range(ngroups):
for j in range(K):
if nobs[i, j] < min_count:
# if we are integer dtype, not is_datetimelike, and
# not uses_mask, then getting here implies that
# counts[i] < min_count, which means we will
# be cast to float64 and masked at the end
# of WrappedCythonOp._call_cython_op. So we can safely
# set a placeholder value in out[i, j].
if uses_mask:
result_mask[i, j] = True
# set out[i, j] to 0 to be deterministic, as
# it was initialized with np.empty. Also ensures
# we can downcast out if appropriate.
out[i, j] = 0
elif iu_64_floating_t is int64_t:
# Per above, this is a placeholder in
# non-is_datetimelike cases.
out[i, j] = nan_val
elif iu_64_floating_t is uint64_t:
runtime_error = True
break
# placeholder, see above
out[i, j] = 0
else:
out[i, j] = nan_val
else:
out[i, j] = group_min_or_max[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.wraparound(False)
@cython.boundscheck(False)
Expand Down
12 changes: 1 addition & 11 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ def _call_cython_op(
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
if self.kind == "aggregate":
counts = np.zeros(ngroups, dtype=np.int64)
if self.how in ["min", "max", "mean"]:
if self.how in ["min", "max", "mean", "last", "first"]:
func(
out=result,
counts=counts,
Expand All @@ -530,16 +530,6 @@ def _call_cython_op(
result_mask=result_mask,
is_datetimelike=is_datetimelike,
)
elif self.how in ["first", "last"]:
func(
out=result,
counts=counts,
values=values,
labels=comp_ids,
min_count=min_count,
mask=mask,
result_mask=result_mask,
)
elif self.how in ["add"]:
# We support datetimelike
func(
Expand Down
20 changes: 8 additions & 12 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1856,15 +1856,11 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype):
)
df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :]

if any_unsigned_int_numpy_dtype == "uint64":
with pytest.raises(RuntimeError, match="empty group with uint64_t"):
df.resample("D").max()
else:
result = df.resample("D").max()

expected = DataFrame(
[1, np.nan, 0],
columns=["x"],
index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"),
)
tm.assert_frame_equal(result, expected)
result = df.resample("D").max()

expected = DataFrame(
[1, np.nan, 0],
columns=["x"],
index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D"),
)
tm.assert_frame_equal(result, expected)