From b27b0b3d1dcbe8d693b555a3d9168257ff5866ec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 12:53:08 -0700 Subject: [PATCH 1/4] REF: remove groupby_helper --- pandas/_libs/groupby.pyx | 688 +++++++++++++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 693 ----------------------------- setup.py | 3 +- 3 files changed, 687 insertions(+), 697 deletions(-) delete mode 100644 pandas/_libs/groupby_helper.pxi.in diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c9994812462b1..93e790ec1e5ae 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -11,6 +11,8 @@ from numpy cimport (ndarray, uint32_t, uint64_t, float32_t, float64_t) cnp.import_array() +cdef extern from "numpy/npy_math.h": + float64_t NAN "NPY_NAN" from pandas._libs.util cimport numeric, get_nat @@ -21,6 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() +cdef int64_t _int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN @@ -789,5 +792,686 @@ def group_quantile(ndarray[float64_t] out, grp_start += grp_sz -# generated from template -include "groupby_helper.pxi" +# ---------------------------------------------------------------------- +# group_nth, group_last, group_rank +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + float64_t + float32_t + int64_t + uint64_t + object + + +cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: + if rank_t is object: + # Should never be used, but we need to avoid the `val != val` below + # or else cython will raise about gil acquisition. + raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? 
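+                        # (For int64 data, passing True means NPY_NAT is
+                        # treated as missing whether or not the values are
+                        # datetime-like; see _treat_as_na above.)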
+ nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_nth_float64 = group_nth["float64_t"] +group_nth_float32 = group_nth["float32_t"] +group_nth_int64 = group_nth["int64_t"] +group_nth_object = group_nth["object"] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_rank(float64_t[:, :] out, + rank_t[:, :] values, + const int64_t[:] labels, + bint is_datetimelike, object ties_method, + bint ascending, bint pct, object na_option): + """ + Provides the rank of values within each group. 
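+    Only the first column of `values` is ranked, and results are written
+    into `out` in place.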
+
+    Parameters
+    ----------
+    out : array of float64_t values to which this method will write its
+        results
+    values : array of rank_t values to be ranked
+    labels : array containing a unique label for each group, with its
+        ordering matching up to the corresponding record in `values`
+    is_datetimelike : bool, default False
+        Unused in this method but provided for call compatibility with other
+        Cython transformations
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
+        * average: average rank of group
+        * min: lowest rank in group
+        * max: highest rank in group
+        * first: ranks assigned in order they appear in the array
+        * dense: like 'min', but rank always increases by 1 between groups
+    ascending : bool, default True
+        False for ranks from high (1) to low (N)
+    pct : bool, default False
+        Compute percentage rank of data within each group
+    na_option : {'keep', 'top', 'bottom'}, default 'keep'
+        * keep: leave NA values where they are
+        * top: smallest rank if ascending
+        * bottom: smallest rank if descending
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object.
+    """
+    cdef:
+        TiebreakEnumType tiebreak
+        Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+        Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+        ndarray[int64_t] _as
+        ndarray[float64_t, ndim=2] grp_sizes
+        ndarray[rank_t] masked_vals
+        ndarray[uint8_t] mask
+        bint keep_na
+        rank_t nan_fill_val
+
+    if rank_t is object:
+        raise NotImplementedError("Can't do nogil")
+
+    tiebreak = tiebreakers[ties_method]
+    keep_na = na_option == 'keep'
+    N, K = (<object>values).shape
+    grp_sizes = np.ones_like(out)
+
+    # Copy values into new array in order to fill missing data
+    # with mask, without obfuscating location of missing data
+    # in values array
+    masked_vals = np.array(values[:, 0], copy=True)
+    if rank_t is int64_t:
+        mask = (masked_vals == NPY_NAT).astype(np.uint8)
+    else:
+        mask = np.isnan(masked_vals).astype(np.uint8)
+
+    if ascending ^ (na_option == 'top'):
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).max
+        elif rank_t is uint64_t:
+            nan_fill_val = np.iinfo(np.uint64).max
+        else:
+            nan_fill_val = np.inf
+        order = (masked_vals, mask, labels)
+    else:
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).min
+        elif rank_t is uint64_t:
+            nan_fill_val = 0
+        else:
+            nan_fill_val = -np.inf
+
+        order = (masked_vals, ~mask, labels)
+    np.putmask(masked_vals, mask, nan_fill_val)
+
+    # lexsort using labels, then mask, then actual values
+    # each label corresponds to a different group value,
+    # the mask helps you differentiate missing values before
+    # performing sort on the actual values
+    _as = np.lexsort(order).astype(np.int64, copy=False)
+
+    if not ascending:
+        _as = _as[::-1]
+
+    with nogil:
+        # Loop over the length of the value array
+        # each incremental i value can be looked up in the _as array
+        # that we sorted previously, which gives us the location of
+        # that sorted value for retrieval back from the original
+        # values / masked_vals arrays
+        for i in range(N):
+            # dups and sum_ranks will be incremented each loop where
+            # the value / group remains the same, and should be reset
+            # when either of those change
+            # Used to calculate tiebreakers
+            dups += 1
+            sum_ranks += i - grp_start + 1
+
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back `dups` steps
+            # (the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = NaN
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = sum_ranks / dups
+                elif tiebreak == TIEBREAK_MIN:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start - dups + 2
+                elif tiebreak == TIEBREAK_MAX:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start + 1
+                elif tiebreak == TIEBREAK_FIRST:
+                    for j in range(i - dups + 1, i + 1):
+                        if ascending:
+                            out[_as[j], 0] = j + 1 - grp_start
+                        else:
+                            out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+                elif tiebreak == TIEBREAK_DENSE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = grp_vals_seen
+
+                # look forward to the next value (using the sorting in _as)
+                # if the value does not equal the current value then we need to
+                # reset the dups and sum_ranks, knowing that a new value is
+                # coming up. The conditional also needs to handle nan equality
+                # and the end of iteration
+                if (i == N - 1 or
+                        (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                        (mask[_as[i]] ^ mask[_as[i+1]])):
+                    dups = sum_ranks = 0
+                    grp_vals_seen += 1
+                    grp_tie_count += 1
+
+                # Similar to the previous conditional, check now if we are
+                # moving to a new group. If so, keep track of the index where
+                # the new group occurs, so the tiebreaker calculations can
+                # decrement that from their position. Fill in the size of each
+                # group encountered (used by pct calculations later). Also be
+                # sure to reset any of the items helping to calculate dups
+                if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
+                    if tiebreak != TIEBREAK_DENSE:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (i - grp_start + 1 -
+                                                    grp_na_count)
+                    else:
+                        for j in range(grp_start, i + 1):
+                            grp_sizes[_as[j], 0] = (grp_tie_count -
+                                                    (grp_na_count > 0))
+                    dups = sum_ranks = 0
+                    grp_na_count = 0
+                    grp_tie_count = 0
+                    grp_start = i + 1
+                    grp_vals_seen = 1
+
+    if pct:
+        for i in range(N):
+            # We don't include NaN values in percentage
+            # rankings, so we assign them percentages of NaN.
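+            # grp_sizes holds, for each row, the size of its group excluding
+            # NA entries (or, for 'dense', the number of distinct non-NA
+            # values), so the division below yields ranks in (0, 1].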
+ if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 + + +# ---------------------------------------------------------------------- +# group_min, group_max +# ---------------------------------------------------------------------- + +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t + uint64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 + else: + maxx[:] = -np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = maxx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max + else: + minx[:] = np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? 
+ nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = minx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative minimum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummin in. + values : array + Values to take cummin of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max + else: + accum[:] = np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummax in. + values : array + Values to take cummax of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. 
+ """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 + else: + accum[:] = -np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in deleted file mode 100644 index c837c6c5c6519..0000000000000 --- a/pandas/_libs/groupby_helper.pxi.in +++ /dev/null @@ -1,693 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - float64_t NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -# ---------------------------------------------------------------------- -# group_nth, group_last, group_rank -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - float64_t - float32_t - int64_t - uint64_t - object - - -cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: - if rank_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError - - elif rank_t is int64_t: - return is_datetimelike and val == NPY_NAT - else: - return val != val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. 
- raise RuntimeError("empty group with uint64_t") - -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_rank(float64_t[:, :] out, - rank_t[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - """ - Provides the rank of values within each group. 
- - Parameters - ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - is_datetimelike : bool, default False - unused in this method but provided for call compatibility with other - Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - ascending : boolean, default True - False for ranks by high (1) to low (N) - na_option : {'keep', 'top', 'bottom'}, default 'keep' - pct : boolean, default False - Compute percentage rank of data within each group - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - - -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = group_rank["int64_t"] -group_rank_uint64 = group_rank["uint64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - -# ---------------------------------------------------------------------- -# group_min, group_max -# ---------------------------------------------------------------------- - -# TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - maxx[:] = 0 - else: - maxx[:] = -np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = maxx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - minx[:] = np.iinfo(np.uint64).max - else: - minx[:] = np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
- nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative minimum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummin in. - values : array - Values to take cummin of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = _int64_max - elif groupby_t is uint64_t: - accum[:] = np.iinfo(np.uint64).max - else: - accum[:] = np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative maximum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummax in. - values : array - Values to take cummax of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. 
- """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = -_int64_max - elif groupby_t is uint64_t: - accum[:] = 0 - else: - accum[:] = -np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval diff --git a/setup.py b/setup.py index c35a0e75ecb80..2892cd0b2e294 100755 --- a/setup.py +++ b/setup.py @@ -88,7 +88,6 @@ def is_platform_mac(): "_libs/algos_take_helper.pxi.in", "_libs/algos_rank_helper.pxi.in", ], - "groupby": ["_libs/groupby_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -564,7 +563,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ext_data = { "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, - "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby"}, "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From 2617894a2f5840466dcb9a4aa8d759be3caa1ca1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 15:16:43 -0700 Subject: [PATCH 2/4] lint fixup --- pandas/_libs/groupby.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 93e790ec1e5ae..b9ca079b268f0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -904,6 +904,7 @@ def group_last(rank_t[:, :] out, # block. raise RuntimeError("empty group with uint64_t") + group_last_float64 = group_last["float64_t"] group_last_float32 = group_last["float32_t"] group_last_int64 = group_last["int64_t"] From 89a9ff379fd1cadf14a9756a05307f97aade864b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 16:34:16 -0700 Subject: [PATCH 3/4] dummy to force CI From 250eaddb72233de5dc2d000fd72e9f7e43998bff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Oct 2019 18:43:25 -0700 Subject: [PATCH 4/4] troubleshoot --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4fb0026267a72..e40487e4f08b1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -23,7 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() -cdef int64_t _int64_max = np.iinfo(np.int64).max +_int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN
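
Reviewer note: a rough, standalone NumPy sketch of the lexsort-based ranking
idea that `group_rank` above implements (average ties, na_option='keep' only;
the name `naive_group_rank` is illustrative and not part of the patch):

    import numpy as np

    def naive_group_rank(values, labels):
        # Rank within each group, averaging ties and keeping NaNs as NaN.
        values = np.asarray(values, dtype=np.float64)
        labels = np.asarray(labels, dtype=np.int64)
        mask = np.isnan(values)
        masked = np.where(mask, np.inf, values)    # push NaNs to the end
        # lexsort keys: primary key last -> sort by label, then NA, then value
        order = np.lexsort((masked, mask, labels))
        out = np.full(len(values), np.nan)
        for lab in np.unique(labels):
            idx = order[labels[order] == lab]      # group rows, value-sorted
            idx = idx[~mask[idx]]                  # NAs keep their NaN rank
            vals = values[idx]
            for v in np.unique(vals):
                pos = np.nonzero(vals == v)[0]     # 0-based tied positions
                out[idx[pos]] = pos.mean() + 1     # average rank of the ties
        return out

    # naive_group_rank([1.0, 3.0, 3.0, np.nan, 2.0], [0, 0, 0, 0, 1])
    # -> ranks 1.0, 2.5, 2.5, nan, 1.0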