add uint64 support for some libgroupby funcs (#28931)

jbrockmendel · jreback · commit a0d01b803575 · 2019-10-16T08:42:47.000-04:00
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -16,6 +16,7 @@ ctypedef fused rank_t:
     float64_t
     float32_t
     int64_t
+    uint64_t
     object
 
 
@@ -34,6 +35,7 @@ def group_last(rank_t[:, :] out,
         rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
+        bint runtime_error = False
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -106,11 +108,20 @@ def group_last(rank_t[:, :] out,
                     if nobs[i, j] == 0:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
+                        elif rank_t is uint64_t:
+                            runtime_error = True
+                            break
                         else:
                             out[i, j] = NAN
                     else:
                         out[i, j] = resx[i, j]
 
+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        #  block.
+        raise RuntimeError("empty group with uint64_t")
+
+
 group_last_float64 = group_last["float64_t"]
 group_last_float32 = group_last["float32_t"]
 group_last_int64 = group_last["int64_t"]
@@ -132,6 +143,7 @@ def group_nth(rank_t[:, :] out,
         rank_t val
         ndarray[rank_t, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
+        bint runtime_error = False
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -199,11 +211,19 @@ def group_nth(rank_t[:, :] out,
                     if nobs[i, j] == 0:
                         if rank_t is int64_t:
                             out[i, j] = NPY_NAT
+                        elif rank_t is uint64_t:
+                            runtime_error = True
+                            break
                         else:
                             out[i, j] = NAN
                     else:
                         out[i, j] = resx[i, j]
 
+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        #  block.
+        raise RuntimeError("empty group with uint64_t")
+
 
 group_nth_float64 = group_nth["float64_t"]
 group_nth_float32 = group_nth["float32_t"]
@@ -282,12 +302,16 @@ def group_rank(float64_t[:, :] out,
     if ascending ^ (na_option == 'top'):
         if rank_t is int64_t:
             nan_fill_val = np.iinfo(np.int64).max
+        elif rank_t is uint64_t:
+            nan_fill_val = np.iinfo(np.uint64).max
         else:
             nan_fill_val = np.inf
         order = (masked_vals, mask, labels)
     else:
         if rank_t is int64_t:
             nan_fill_val = np.iinfo(np.int64).min
+        elif rank_t is uint64_t:
+            nan_fill_val = 0
         else:
             nan_fill_val = -np.inf
 
@@ -397,6 +421,7 @@ def group_rank(float64_t[:, :] out,
 group_rank_float64 = group_rank["float64_t"]
 group_rank_float32 = group_rank["float32_t"]
 group_rank_int64 = group_rank["int64_t"]
+group_rank_uint64 = group_rank["uint64_t"]
 # Note: we do not have a group_rank_object because that would require a
 #  not-nogil implementation, see GH#19560
 
@@ -410,6 +435,7 @@ ctypedef fused groupby_t:
     float64_t
     float32_t
     int64_t
+    uint64_t
 
 
 @cython.wraparound(False)
@@ -426,6 +452,7 @@ def group_max(groupby_t[:, :] out,
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         groupby_t val, count, nan_val
         ndarray[groupby_t, ndim=2] maxx, nobs
+        bint runtime_error = False
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -439,6 +466,11 @@ def group_max(groupby_t[:, :] out,
         # Note: evaluated at compile-time
         maxx[:] = -_int64_max
         nan_val = NPY_NAT
+    elif groupby_t is uint64_t:
+        # NB: We do not define nan_val because there is no such thing
+        #  for uint64_t.  We carefully avoid having to reference it in this
+        #  case.
+        maxx[:] = 0
     else:
         maxx[:] = -np.inf
         nan_val = NAN
@@ -462,18 +494,26 @@ def group_max(groupby_t[:, :] out,
                         if val > maxx[lab, j]:
                             maxx[lab, j] = val
                 else:
-                    if val == val and val != nan_val:
+                    if val == val:
                         nobs[lab, j] += 1
                         if val > maxx[lab, j]:
                             maxx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
+                    if groupby_t is uint64_t:
+                        runtime_error = True
+                        break
                     out[i, j] = nan_val
                 else:
                     out[i, j] = maxx[i, j]
 
+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        #  block.
+        raise RuntimeError("empty group with uint64_t")
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
@@ -489,6 +529,7 @@ def group_min(groupby_t[:, :] out,
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
         groupby_t val, count, nan_val
         ndarray[groupby_t, ndim=2] minx, nobs
+        bint runtime_error = False
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -501,6 +542,11 @@ def group_min(groupby_t[:, :] out,
     if groupby_t is int64_t:
         minx[:] = _int64_max
         nan_val = NPY_NAT
+    elif groupby_t is uint64_t:
+        # NB: We do not define nan_val because there is no such thing
+        #  for uint64_t.  We carefully avoid having to reference it in this
+        #  case.
+        minx[:] = np.iinfo(np.uint64).max
     else:
         minx[:] = np.inf
         nan_val = NAN
@@ -524,18 +570,26 @@ def group_min(groupby_t[:, :] out,
                         if val < minx[lab, j]:
                             minx[lab, j] = val
                 else:
-                    if val == val and val != nan_val:
+                    if val == val:
                         nobs[lab, j] += 1
                         if val < minx[lab, j]:
                             minx[lab, j] = val
 
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
+                    if groupby_t is uint64_t:
+                        runtime_error = True
+                        break
                     out[i, j] = nan_val
                 else:
                     out[i, j] = minx[i, j]
 
+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        #  block.
+        raise RuntimeError("empty group with uint64_t")
+
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -575,6 +629,8 @@ def group_cummin(groupby_t[:, :] out,
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = _int64_max
+    elif groupby_t is uint64_t:
+        accum[:] = np.iinfo(np.uint64).max
     else:
         accum[:] = np.inf
 
@@ -642,6 +698,8 @@ def group_cummax(groupby_t[:, :] out,
     accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
     if groupby_t is int64_t:
         accum[:] = -_int64_max
+    elif groupby_t is uint64_t:
+        accum[:] = 0
     else:
         accum[:] = -np.inf
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1355,7 +1355,15 @@ def f(self, **kwargs):
                     return self._cython_agg_general(alias, alt=npfunc, **kwargs)
                 except AssertionError as e:
                     raise SpecificationError(str(e))
+                except DataError:
+                    pass
                 except Exception:
+                    # TODO: the remaining test cases that get here are from:
+                    #  - AttributeError from _cython_agg_blocks bug passing
+                    #    DataFrame to make_block; see GH#28275
+                    #  - TypeError in _cython_operation calling ensure_float64
+                    #    on object array containing complex numbers;
+                    #    see test_groupby_complex, test_max_nan_bug
                     pass
 
                 # apply a non-cython aggregation
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -378,7 +378,7 @@ def test_median_empty_bins(observed):
 
 
 @pytest.mark.parametrize(
-    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"]
+    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
 )
 @pytest.mark.parametrize(
     "method,data",

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -378,7 +378,7 @@ def test_median_empty_bins(observed):` |
| `378` | `378` |  |
| `379` | `379` |  |
| `380` | `380` | `@pytest.mark.parametrize(` |
| `381` |  | `- "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"]` |
|  | `381` | `+ "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]` |
| `382` | `382` | `)` |
| `383` | `383` | `@pytest.mark.parametrize(` |
| `384` | `384` | `"method,data",` |