From 6057696196ca38cae063f1dc0498fe05ccaeaf4a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 19:46:57 -0700 Subject: [PATCH 1/5] REF: use fused types for group_last --- pandas/_libs/groupby_helper.pxi.in | 123 +++++++++++++++++++---------- 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 000689f634545..209f701969f6c 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -12,39 +12,27 @@ _int64_max = np.iinfo(np.int64).max # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -{{py: - -# name, c_type, nan_val -dtypes = [('float64', 'float64_t', 'NAN'), - ('float32', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'NPY_NAT'), - ('object', 'object', 'NAN')] - -def get_dispatch(dtypes): - - for name, c_type, nan_val in dtypes: - - yield name, c_type, nan_val -}} - - -{{for name, c_type, nan_val in get_dispatch(dtypes)}} +ctypedef fused rank_t: + float64_t + float32_t + int64_t + object @cython.wraparound(False) @cython.boundscheck(False) -def group_last_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx + rank_t val + ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -53,19 +41,15 @@ def group_last_{{name}}({{c_type}}[:, :] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - {{if name == 'object'}} - resx = np.empty((out).shape, dtype=object) - {{else}} - resx = np.empty_like(out) - {{endif}} + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) N, K = (values).shape - {{if name == "object"}} - if True: # make templating happy - {{else}} - with nogil: - {{endif}} + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -76,20 +60,71 @@ def group_last_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if ( - {{if not name.startswith("int")}} - val == val and - {{endif}} - val != {{nan_val}}): + if val == val: nobs[lab, j] += 1 resx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = {{nan_val}} + out[i, j] = NAN else: out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +{{py: + +# name, c_type, nan_val +dtypes = [('float64', 'float64_t', 'NAN'), + ('float32', 'float32_t', 'NAN'), + 
('int64', 'int64_t', 'NPY_NAT'), + ('object', 'object', 'NAN')] + +def get_dispatch(dtypes): + + for name, c_type, nan_val in dtypes: + + yield name, c_type, nan_val +}} + + +{{for name, c_type, nan_val in get_dispatch(dtypes)}} @cython.wraparound(False) @@ -484,7 +519,8 @@ def group_cummin(groupby_t[:, :] out, const int64_t[:] labels, int ngroups, bint is_datetimelike): - """Cumulative minimum of columns of `values`, in row groups `labels`. + """ + Cumulative minimum of columns of `values`, in row groups `labels`. Parameters ---------- @@ -548,9 +584,10 @@ def group_cummin(groupby_t[:, :] out, def group_cummax(groupby_t[:, :] out, groupby_t[:, :] values, const int64_t[:] labels, - int ngroups, + int ngroups, bint is_datetimelike): - """Cumulative maximum of columns of `values`, in row groups `labels`. + """ + Cumulative maximum of columns of `values`, in row groups `labels`. Parameters ---------- From dfcd5efa3726ffd17779b48b88a43b1d834ff518 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Oct 2019 20:44:26 -0700 Subject: [PATCH 2/5] REF: use fused types for groupby_helper --- pandas/_libs/groupby_helper.pxi.in | 152 ++++++++++++++++------------- 1 file changed, 84 insertions(+), 68 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 209f701969f6c..03c4ae58f4dc5 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -108,39 +108,20 @@ group_last_int64 = group_last["int64_t"] group_last_object = group_last["object"] -{{py: - -# name, c_type, nan_val -dtypes = [('float64', 'float64_t', 'NAN'), - ('float32', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'NPY_NAT'), - ('object', 'object', 'NAN')] - -def get_dispatch(dtypes): - - for name, c_type, nan_val in dtypes: - - yield name, c_type, nan_val -}} - - -{{for name, c_type, nan_val in get_dispatch(dtypes)}} - - @cython.wraparound(False) @cython.boundscheck(False) -def group_nth_{{name}}({{c_type}}[:, :] out, - int64_t[:] counts, - {{c_type}}[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{c_type}} val - ndarray[{{c_type}}, ndim=2] resx + rank_t val + ndarray[rank_t, ndim=2] resx ndarray[int64_t, ndim=2] nobs assert min_count == -1, "'min_count' only used in add and prod" @@ -149,19 +130,15 @@ def group_nth_{{name}}({{c_type}}[:, :] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - {{if name=='object'}} - resx = np.empty((out).shape, dtype=object) - {{else}} - resx = np.empty_like(out) - {{endif}} + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) N, K = (values).shape - {{if name == "object"}} - if True: # make templating happy - {{else}} - with nogil: - {{endif}} + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -172,11 +149,7 @@ def group_nth_{{name}}({{c_type}}[:, :] out, val = values[i, j] # not nan - if ( - {{if not name.startswith("int")}} - val == val and - {{endif}} - val != {{nan_val}}): + if val == val: nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val @@ -184,28 +157,65 @@ def group_nth_{{name}}({{c_type}}[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, 
j] == 0: - out[i, j] = {{nan_val}} + out[i, j] = NAN else: out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue -{{if name != 'object'}} + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] @cython.boundscheck(False) @cython.wraparound(False) -def group_rank_{{name}}(float64_t[:, :] out, - {{c_type}}[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): """ Provides the rank of values within each group. Parameters ---------- out : array of float64_t values which this method will write its results to - values : array of {{c_type}} values to be ranked + values : array of rank_t values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` is_datetimelike : bool, default False @@ -238,10 +248,13 @@ def group_rank_{{name}}(float64_t[:, :] out, Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t] _as ndarray[float64_t, ndim=2] grp_sizes - ndarray[{{c_type}}] masked_vals + ndarray[rank_t] masked_vals ndarray[uint8_t] mask bint keep_na - {{c_type}} nan_fill_val + rank_t nan_fill_val + + if rank_t is object: + raise NotImplementedError("Cant do nogil") tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -252,25 +265,23 @@ def group_rank_{{name}}(float64_t[:, :] out, # with mask, without obfuscating location of missing data # in values array masked_vals = np.array(values[:, 0], copy=True) - {{if name == 'int64'}} - mask = (masked_vals == {{nan_val}}).astype(np.uint8) - {{else}} - mask = np.isnan(masked_vals).astype(np.uint8) - {{endif}} + if rank_t is int64_t: + mask = (masked_vals == NPY_NAT).astype(np.uint8) + else: + mask = np.isnan(masked_vals).astype(np.uint8) if ascending ^ (na_option == 'top'): - {{if name == 'int64'}} - nan_fill_val = np.iinfo(np.int64).max - {{else}} - nan_fill_val = np.inf - {{endif}} + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).max + else: + nan_fill_val = np.inf order = (masked_vals, mask, labels) else: - {{if name == 'int64'}} - nan_fill_val = np.iinfo(np.int64).min - {{else}} - nan_fill_val = -np.inf - {{endif}} + if rank_t is int64_t: + nan_fill_val = np.iinfo(np.int64).min + else: + nan_fill_val = -np.inf + order = (masked_vals, ~mask, labels) np.putmask(masked_vals, mask, nan_fill_val) @@ -372,8 +383,13 @@ def group_rank_{{name}}(float64_t[:, :] out, out[i, 0] = NAN elif grp_sizes[i, 0] != 0: out[i, 0] = out[i, 0] / grp_sizes[i, 0] -{{endif}} -{{endfor}} + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +# Note: we do not have a 
group_rank_object because that would require a +# not-nogil implementation. # ---------------------------------------------------------------------- From 12c728819a8510592ee79df01f7cf3227685521b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 06:26:50 -0700 Subject: [PATCH 3/5] fix whitesapce --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 03c4ae58f4dc5..d93a65ab96484 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -600,7 +600,7 @@ def group_cummin(groupby_t[:, :] out, def group_cummax(groupby_t[:, :] out, groupby_t[:, :] values, const int64_t[:] labels, - int ngroups, + int ngroups, bint is_datetimelike): """ Cumulative maximum of columns of `values`, in row groups `labels`. From 42ff41458c8be6848ff56827193fd490ba116886 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 13:21:02 -0700 Subject: [PATCH 4/5] suggested edit --- pandas/_libs/groupby_helper.pxi.in | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index d93a65ab96484..3ce8c742c58f7 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -60,14 +60,23 @@ def group_last(rank_t[:, :] out, val = values[i, j] # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val + if rank_t is int64_t: + # need a special notna check + if val != NPY_NAT: + nobs[lab, j] += 1 + resx[lab, j] = val + else: + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = NAN + if rank_t is int64_t: + out[i, j] = NPY_NAT + else: + out[i, j] = NAN else: out[i, j] = resx[i, j] else: From 76b20c6fe3ede2c3ee398888e587735698a6efb0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Oct 2019 13:21:25 -0700 Subject: [PATCH 5/5] comment --- pandas/_libs/groupby_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 3ce8c742c58f7..6b434b6470581 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -398,7 +398,7 @@ group_rank_float64 = group_rank["float64_t"] group_rank_float32 = group_rank["float32_t"] group_rank_int64 = group_rank["int64_t"] # Note: we do not have a group_rank_object because that would require a -# not-nogil implementation. +# not-nogil implementation, see GH#19560 # ----------------------------------------------------------------------
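
The pattern this series relies on, sketched outside the pandas sources: a single def over a Cython fused type compiles to one specialization per member dtype, compile-time "if rank_t is ...:" checks replace the old Tempita "{{if ...}}" blocks, and the per-dtype entry points the rest of the codebase expects are recovered by indexing the fused function, exactly as group_last_float64 = group_last["float64_t"] does above. The snippet below is a minimal, hypothetical illustration only; number_t, last_valid and the derived names are invented for this sketch and are not part of pandas.

cimport cython

import numpy as np
from numpy cimport float64_t, int64_t

# Same sentinel pandas uses for missing int64 (datetime-like) values.
cdef int64_t NPY_NAT = np.iinfo(np.int64).min

ctypedef fused number_t:
    float64_t
    int64_t
    object

@cython.wraparound(False)
@cython.boundscheck(False)
def last_valid(number_t[:] values):
    """Return the last non-missing element, or None if all are missing."""
    cdef:
        Py_ssize_t i, N = values.shape[0]
        number_t val

    for i in range(N - 1, -1, -1):
        val = values[i]
        if number_t is int64_t:
            # need a special notna check, as in group_last above
            if val != NPY_NAT:
                return val
        else:
            # NaN != NaN, so this comparison skips missing values
            if val == val:
                return val
    return None

# Per-dtype entry points recovered by indexing the fused function,
# mirroring group_last_float64 = group_last["float64_t"] above.
last_valid_float64 = last_valid["float64_t"]
last_valid_int64 = last_valid["int64_t"]
last_valid_object = last_valid["object"]

As in the patches, the object specialization cannot release the GIL, so anything that needs nogil either keeps a duplicated with-GIL branch (the TODO about conditional-nogil in group_last/group_nth) or simply omits the object specialization, as group_rank does.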