From b27b0b3d1dcbe8d693b555a3d9168257ff5866ec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 12:53:08 -0700 Subject: [PATCH 1/4] REF: remove groupby_helper --- pandas/_libs/groupby.pyx | 688 +++++++++++++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 693 ----------------------------- setup.py | 3 +- 3 files changed, 687 insertions(+), 697 deletions(-) delete mode 100644 pandas/_libs/groupby_helper.pxi.in diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c9994812462b1..93e790ec1e5ae 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -11,6 +11,8 @@ from numpy cimport (ndarray, uint32_t, uint64_t, float32_t, float64_t) cnp.import_array() +cdef extern from "numpy/npy_math.h": + float64_t NAN "NPY_NAN" from pandas._libs.util cimport numeric, get_nat @@ -21,6 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() +cdef int64_t _int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN @@ -789,5 +792,686 @@ def group_quantile(ndarray[float64_t] out, grp_start += grp_sz -# generated from template -include "groupby_helper.pxi" +# ---------------------------------------------------------------------- +# group_nth, group_last, group_rank +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + float64_t + float32_t + int64_t + uint64_t + object + + +cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: + if rank_t is object: + # Should never be used, but we need to avoid the `val != val` below + # or else cython will raise about gil acquisition. + raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? 
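+                        # (For int64 data, passing True means NPY_NAT is
+                        # treated as missing whether or not the values are
+                        # datetime-like; see _treat_as_na above.)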
+ nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """ + Provides the rank of values within each group. 
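+    Only the first column of `values` is ranked, and results are written
+    into `out` in place.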
+
+    Parameters
+    ----------
+    out : array of float64_t values to which this method will write its
+        results
+    values : array of rank_t values to be ranked
+    labels : array containing a unique label for each group, with its
+        ordering matching up to the corresponding record in `values`
+    is_datetimelike : bool, default False
+        Unused in this method but provided for call compatibility with other
+        Cython transformations
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
+        * average: average rank of group
+        * min: lowest rank in group
+        * max: highest rank in group
+        * first: ranks assigned in order they appear in the array
+        * dense: like 'min', but rank always increases by 1 between groups
+    ascending : bool, default True
+        False for ranks from high (1) to low (N)
+    pct : bool, default False
+        Compute percentage rank of data within each group
+    na_option : {'keep', 'top', 'bottom'}, default 'keep'
+        * keep: leave NA values where they are
+        * top: smallest rank if ascending
+        * bottom: smallest rank if descending
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object.
+    """
+    cdef:
+        TiebreakEnumType tiebreak
+        Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+        Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+        ndarray[int64_t] _as
+        ndarray[float64_t, ndim=2] grp_sizes
+        ndarray[rank_t] masked_vals
+        ndarray[uint8_t] mask
+        bint keep_na
+        rank_t nan_fill_val
+
+    if rank_t is object:
+        raise NotImplementedError("Can't do nogil")
+
+    tiebreak = tiebreakers[ties_method]
+    keep_na = na_option == 'keep'
+    N, K = (<object>values).shape
+    grp_sizes = np.ones_like(out)
+
+    # Copy values into new array in order to fill missing data
+    # with mask, without obfuscating location of missing data
+    # in values array
+    masked_vals = np.array(values[:, 0], copy=True)
+    if rank_t is int64_t:
+        mask = (masked_vals == NPY_NAT).astype(np.uint8)
+    else:
+        mask = np.isnan(masked_vals).astype(np.uint8)
+
+    if ascending ^ (na_option == 'top'):
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).max
+        elif rank_t is uint64_t:
+            nan_fill_val = np.iinfo(np.uint64).max
+        else:
+            nan_fill_val = np.inf
+        order = (masked_vals, mask, labels)
+    else:
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).min
+        elif rank_t is uint64_t:
+            nan_fill_val = 0
+        else:
+            nan_fill_val = -np.inf
+
+        order = (masked_vals, ~mask, labels)
+    np.putmask(masked_vals, mask, nan_fill_val)
+
+    # lexsort using labels, then mask, then actual values
+    # each label corresponds to a different group value,
+    # the mask helps you differentiate missing values before
+    # performing sort on the actual values
+    _as = np.lexsort(order).astype(np.int64, copy=False)
+
+    if not ascending:
+        _as = _as[::-1]
+
+    with nogil:
+        # Loop over the length of the value array
+        # each incremental i value can be looked up in the _as array
+        # that we sorted previously, which gives us the location of
+        # that sorted value for retrieval back from the original
+        # values / masked_vals arrays
+        for i in range(N):
+            # dups and sum_ranks will be incremented each loop where
+            # the value / group remains the same, and should be reset
+            # when either of those change
+            # Used to calculate tiebreakers
+            dups += 1
+            sum_ranks += i - grp_start + 1
+
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back `dups` steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = NaN
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = sum_ranks / dups
+                elif tiebreak == TIEBREAK_MIN:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start - dups + 2
+                elif tiebreak == TIEBREAK_MAX:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start + 1
+                elif tiebreak == TIEBREAK_FIRST:
+                    for j in range(i - dups + 1, i + 1):
+                        if ascending:
+                            out[_as[j], 0] = j + 1 - grp_start
+                        else:
+                            out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+                elif tiebreak == TIEBREAK_DENSE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = grp_vals_seen
+
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. The conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. Fill in the size of each
+                # group encountered (used by pct calculations later). Also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
+
+    if pct:
+        for i in range(N):
+            # We don't include NaN values in percentage
+            # rankings, so we assign them percentages of NaN.
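+            # grp_sizes holds, for each row, the size of its group excluding
+            # NA entries (or, for 'dense', the number of distinct non-NA
+            # values), so the division below yields ranks in (0, 1].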
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 + + +# ---------------------------------------------------------------------- +# group_min, group_max +# ---------------------------------------------------------------------- + +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t + uint64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 + else: + maxx[:] = -np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = maxx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max + else: + minx[:] = np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? 
+ nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = minx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative minimum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummin in. + values : array + Values to take cummin of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max + else: + accum[:] = np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummax in. + values : array + Values to take cummax of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. 
+ """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 + else: + accum[:] = -np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in deleted file mode 100644 index c837c6c5c6519..0000000000000 --- a/pandas/_libs/groupby_helper.pxi.in +++ /dev/null @@ -1,693 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - float64_t NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -# ---------------------------------------------------------------------- -# group_nth, group_last, group_rank -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - float64_t - float32_t - int64_t - uint64_t - object - - -cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: - if rank_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError - - elif rank_t is int64_t: - return is_datetimelike and val == NPY_NAT - else: - return val != val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. 
- raise RuntimeError("empty group with uint64_t") - -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_rank(float64_t[:, :] out, - rank_t[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - """ - Provides the rank of values within each group. 
- - Parameters - ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - is_datetimelike : bool, default False - unused in this method but provided for call compatibility with other - Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - ascending : boolean, default True - False for ranks by high (1) to low (N) - na_option : {'keep', 'top', 'bottom'}, default 'keep' - pct : boolean, default False - Compute percentage rank of data within each group - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - - -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = group_rank["int64_t"] -group_rank_uint64 = group_rank["uint64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - -# ---------------------------------------------------------------------- -# group_min, group_max -# ---------------------------------------------------------------------- - -# TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - maxx[:] = 0 - else: - maxx[:] = -np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = maxx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - minx[:] = np.iinfo(np.uint64).max - else: - minx[:] = np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
- nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative minimum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummin in. - values : array - Values to take cummin of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = _int64_max - elif groupby_t is uint64_t: - accum[:] = np.iinfo(np.uint64).max - else: - accum[:] = np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative maximum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummax in. - values : array - Values to take cummax of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. 
- """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = -_int64_max - elif groupby_t is uint64_t: - accum[:] = 0 - else: - accum[:] = -np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval diff --git a/setup.py b/setup.py index c35a0e75ecb80..2892cd0b2e294 100755 --- a/setup.py +++ b/setup.py @@ -88,7 +88,6 @@ def is_platform_mac(): "_libs/algos_take_helper.pxi.in", "_libs/algos_rank_helper.pxi.in", ], - "groupby": ["_libs/groupby_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -564,7 +563,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ext_data = { "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, - "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby"}, "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From 2617894a2f5840466dcb9a4aa8d759be3caa1ca1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 15:16:43 -0700 Subject: [PATCH 2/4] lint fixup --- pandas/_libs/groupby.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 93e790ec1e5ae..b9ca079b268f0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -904,6 +904,7 @@ def group_last(rank_t[:, :] out, # block. raise RuntimeError("empty group with uint64_t") + group_last_float64 = group_last["float64_t"] group_last_float32 = group_last["float32_t"] group_last_int64 = group_last["int64_t"] From 89a9ff379fd1cadf14a9756a05307f97aade864b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 16:34:16 -0700 Subject: [PATCH 3/4] dummy to force CI From 250eaddb72233de5dc2d000fd72e9f7e43998bff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 18:43:25 -0700 Subject: [PATCH 4/4] troubleshoot --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4fb0026267a72..e40487e4f08b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -23,7 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() -cdef int64_t _int64_max = np.iinfo(np.int64).max +_int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN
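
Reviewer note: a rough, standalone NumPy sketch of the lexsort-based ranking
idea that `group_rank` above implements (average ties, na_option='keep' only;
the name `naive_group_rank` is illustrative and not part of the patch):

    import numpy as np

    def naive_group_rank(values, labels):
        # Rank within each group, averaging ties and keeping NaNs as NaN.
        values = np.asarray(values, dtype=np.float64)
        labels = np.asarray(labels, dtype=np.int64)
        mask = np.isnan(values)
        masked = np.where(mask, np.inf, values)    # push NaNs to the end
        # lexsort keys: primary key last -> sort by label, then NA, then value
        order = np.lexsort((masked, mask, labels))
        out = np.full(len(values), np.nan)
        for lab in np.unique(labels):
            idx = order[labels[order] == lab]      # group rows, value-sorted
            idx = idx[~mask[idx]]                  # NAs keep their NaN rank
            vals = values[idx]
            for v in np.unique(vals):
                pos = np.nonzero(vals == v)[0]     # 0-based tied positions
                out[idx[pos]] = pos.mean() + 1     # average rank of the ties
        return out

    # naive_group_rank([1.0, 3.0, 3.0, np.nan, 2.0], [0, 0, 0, 0, 1])
    # -> ranks 1.0, 2.5, 2.5, nan, 1.0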