Skip to content

add uint64 support for some libgroupby funcs #28931

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ ctypedef fused rank_t:
float64_t
float32_t
int64_t
uint64_t
object


Expand All @@ -34,6 +35,7 @@ def group_last(rank_t[:, :] out,
rank_t val
ndarray[rank_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand Down Expand Up @@ -106,11 +108,20 @@ def group_last(rank_t[:, :] out,
if nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where are these caught?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would get caught by one of the `except Exception` clauses in the groupby code that I'm trying to make more specific.

# block.
raise RuntimeError("empty group with uint64_t")


group_last_float64 = group_last["float64_t"]
group_last_float32 = group_last["float32_t"]
group_last_int64 = group_last["int64_t"]
Expand All @@ -132,6 +143,7 @@ def group_nth(rank_t[:, :] out,
rank_t val
ndarray[rank_t, ndim=2] resx
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand Down Expand Up @@ -199,11 +211,19 @@ def group_nth(rank_t[:, :] out,
if nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
runtime_error = True
break
else:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


group_nth_float64 = group_nth["float64_t"]
group_nth_float32 = group_nth["float32_t"]
Expand Down Expand Up @@ -282,12 +302,16 @@ def group_rank(float64_t[:, :] out,
if ascending ^ (na_option == 'top'):
if rank_t is int64_t:
nan_fill_val = np.iinfo(np.int64).max
elif rank_t is uint64_t:
nan_fill_val = np.iinfo(np.uint64).max
else:
nan_fill_val = np.inf
order = (masked_vals, mask, labels)
else:
if rank_t is int64_t:
nan_fill_val = np.iinfo(np.int64).min
elif rank_t is uint64_t:
nan_fill_val = 0
else:
nan_fill_val = -np.inf

Expand Down Expand Up @@ -397,6 +421,7 @@ def group_rank(float64_t[:, :] out,
group_rank_float64 = group_rank["float64_t"]
group_rank_float32 = group_rank["float32_t"]
group_rank_int64 = group_rank["int64_t"]
group_rank_uint64 = group_rank["uint64_t"]
# Note: we do not have a group_rank_object because that would require a
# not-nogil implementation, see GH#19560

Expand All @@ -410,6 +435,7 @@ ctypedef fused groupby_t:
float64_t
float32_t
int64_t
uint64_t


@cython.wraparound(False)
Expand All @@ -426,6 +452,7 @@ def group_max(groupby_t[:, :] out,
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] maxx, nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand All @@ -439,6 +466,11 @@ def group_max(groupby_t[:, :] out,
# Note: evaluated at compile-time
maxx[:] = -_int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
maxx[:] = 0
else:
maxx[:] = -np.inf
nan_val = NAN
Expand All @@ -462,18 +494,26 @@ def group_max(groupby_t[:, :] out,
if val > maxx[lab, j]:
maxx[lab, j] = val
else:
if val == val and val != nan_val:
if val == val:
nobs[lab, j] += 1
if val > maxx[lab, j]:
maxx[lab, j] = val

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if groupby_t is uint64_t:
runtime_error = True
break
out[i, j] = nan_val
else:
out[i, j] = maxx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.wraparound(False)
@cython.boundscheck(False)
Expand All @@ -489,6 +529,7 @@ def group_min(groupby_t[:, :] out,
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
groupby_t val, count, nan_val
ndarray[groupby_t, ndim=2] minx, nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

Expand All @@ -501,6 +542,11 @@ def group_min(groupby_t[:, :] out,
if groupby_t is int64_t:
minx[:] = _int64_max
nan_val = NPY_NAT
elif groupby_t is uint64_t:
# NB: We do not define nan_val because there is no such thing
# for uint64_t. We carefully avoid having to reference it in this
# case.
minx[:] = np.iinfo(np.uint64).max
else:
minx[:] = np.inf
nan_val = NAN
Expand All @@ -524,18 +570,26 @@ def group_min(groupby_t[:, :] out,
if val < minx[lab, j]:
minx[lab, j] = val
else:
if val == val and val != nan_val:
if val == val:
nobs[lab, j] += 1
if val < minx[lab, j]:
minx[lab, j] = val

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if groupby_t is uint64_t:
runtime_error = True
break
out[i, j] = nan_val
else:
out[i, j] = minx[i, j]

if runtime_error:
# We cannot raise directly above because that is within a nogil
# block.
raise RuntimeError("empty group with uint64_t")


@cython.boundscheck(False)
@cython.wraparound(False)
Expand Down Expand Up @@ -575,6 +629,8 @@ def group_cummin(groupby_t[:, :] out,
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
if groupby_t is int64_t:
accum[:] = _int64_max
elif groupby_t is uint64_t:
accum[:] = np.iinfo(np.uint64).max
else:
accum[:] = np.inf

Expand Down Expand Up @@ -642,6 +698,8 @@ def group_cummax(groupby_t[:, :] out,
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
if groupby_t is int64_t:
accum[:] = -_int64_max
elif groupby_t is uint64_t:
accum[:] = 0
else:
accum[:] = -np.inf

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,7 +1361,15 @@ def f(self, **kwargs):
return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except AssertionError as e:
raise SpecificationError(str(e))
except DataError:
pass
except Exception:
# TODO: the remaining test cases that get here are from:
# - AttributeError from _cython_agg_blocks bug passing
# DataFrame to make_block; see GH#28275
# - TypeError in _cython_operation calling ensure_float64
# on object array containing complex numbers;
# see test_groupby_complex, test_max_nan_bug
pass

# apply a non-cython aggregation
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def test_median_empty_bins(observed):


@pytest.mark.parametrize(
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64"]
"dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use the any_real_type fixture here (can be done in a follow-up).

)
@pytest.mark.parametrize(
"method,data",
Expand Down