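REF: remove libreduction.SeriesGrouper (#43505)

Also removes the supporting _BaseGrouper and Slider classes from
pandas/_libs/reduction.pyx and the _aggregate_series_fast method from
pandas/core/groupby/ops.py; the final fallback branch of agg_series now
calls _aggregate_series_pure_python instead. The SeriesGrouper tests are
dropped, and one expected value in test_crosstab.py::test_categoricals
changes from NaN to 0.
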
jbrockmendel · web-flow · commit f72f566df036 · 2021-09-10T20:17:41.000-04:00
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -34,166 +34,6 @@ cpdef check_result_array(object obj, object dtype):
             raise ValueError("Must produce aggregated value")
 
 
-cdef class _BaseGrouper:
-    cdef _check_dummy(self, object dummy):
-        # both values and index must be an ndarray!
-
-        values = dummy.values
-        # GH 23683: datetimetz types are equivalent to datetime types here
-        if (dummy.dtype != self.arr.dtype
-                and values.dtype != self.arr.dtype):
-            raise ValueError('Dummy array must be same dtype')
-        if is_array(values) and not values.flags.contiguous:
-            # e.g. Categorical has no `flags` attribute
-            values = values.copy()
-        index = dummy.index.values
-        if not index.flags.contiguous:
-            index = index.copy()
-
-        return values, index
-
-    cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider):
-        """
-        Create Series and Index objects that we will alter in-place while iterating.
-        """
-        cached_index = self.ityp(islider.buf, dtype=self.idtype)
-        cached_series = self.typ(
-            vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name
-        )
-        return cached_index, cached_series
-
-    cdef inline _update_cached_objs(self, object cached_series, object cached_index,
-                                    Slider islider, Slider vslider):
-        cached_index._engine.clear_mapping()
-        cached_index._cache.clear()  # e.g. inferred_freq must go
-        cached_series._mgr.set_values(vslider.buf)
-
-    cdef inline object _apply_to_group(self,
-                                       object cached_series, object cached_index,
-                                       bint initialized):
-        """
-        Call self.f on our new group, then update to the next group.
-        """
-        cdef:
-            object res
-
-        # NB: we assume that _update_cached_objs has already cleared cleared
-        #  the cache and engine mapping
-        res = self.f(cached_series)
-        res = extract_result(res)
-        if not initialized:
-            # On the first pass, we check the output shape to see
-            #  if this looks like a reduction.
-            initialized = True
-            check_result_array(res, cached_series.dtype)
-
-        return res, initialized
-
-
-cdef class SeriesGrouper(_BaseGrouper):
-    """
-    Performs generic grouping operation while avoiding ndarray construction
-    overhead
-    """
-    cdef:
-        Py_ssize_t nresults, ngroups
-
-    cdef public:
-        ndarray arr, index, dummy_arr, dummy_index
-        object f, labels, values, typ, ityp, name, idtype
-
-    def __init__(self, object series, object f, ndarray[intp_t] labels,
-                 Py_ssize_t ngroups):
-
-        if len(series) == 0:
-            # get_result would never assign `result`
-            raise ValueError("SeriesGrouper requires non-empty `series`")
-
-        self.labels = labels
-        self.f = f
-
-        values = series.values
-        if is_array(values) and not values.flags.c_contiguous:
-            # e.g. Categorical has no `flags` attribute
-            values = values.copy('C')
-        self.arr = values
-        self.typ = series._constructor
-        self.ityp = series.index._constructor
-        self.idtype = series.index.dtype
-        self.index = series.index.values
-        self.name = series.name
-
-        dummy = series.iloc[:0]
-        self.dummy_arr, self.dummy_index = self._check_dummy(dummy)
-        self.ngroups = ngroups
-
-    def get_result(self):
-        cdef:
-            # Define result to avoid UnboundLocalError
-            ndarray arr, result = None
-            ndarray[intp_t] labels
-            ndarray[int64_t] counts
-            Py_ssize_t i, n, group_size, lab, start, end
-            object res
-            bint initialized = 0
-            Slider vslider, islider
-            object cached_series = None, cached_index = None
-
-        labels = self.labels
-        counts = np.zeros(self.ngroups, dtype=np.int64)
-        group_size = 0
-        n = len(self.arr)
-
-        vslider = Slider(self.arr, self.dummy_arr)
-        islider = Slider(self.index, self.dummy_index)
-
-        result = np.empty(self.ngroups, dtype='O')
-
-        cached_index, cached_series = self._init_dummy_series_and_index(
-            islider, vslider
-        )
-
-        start = 0
-        try:
-            for i in range(n):
-                group_size += 1
-
-                lab = labels[i]
-
-                if i == n - 1 or lab != labels[i + 1]:
-                    if lab == -1:
-                        start += group_size
-                        group_size = 0
-                        continue
-
-                    end = start + group_size
-                    islider.move(start, end)
-                    vslider.move(start, end)
-
-                    self._update_cached_objs(
-                        cached_series, cached_index, islider, vslider)
-
-                    res, initialized = self._apply_to_group(cached_series, cached_index,
-                                                            initialized)
-
-                    start += group_size
-
-                    result[lab] = res
-                    counts[lab] = group_size
-                    group_size = 0
-
-        finally:
-            # so we don't free the wrong memory
-            islider.reset()
-            vslider.reset()
-
-        # We check for empty series in the constructor, so should always
-        #  have result initialized by this point.
-        assert initialized, "`result` has not been initialized."
-
-        return result, counts
-
-
 cpdef inline extract_result(object res):
     """ extract the result object, it might be a 0-dim ndarray
         or a len-1 0-dim, or a scalar """
@@ -208,40 +48,3 @@ cpdef inline extract_result(object res):
             # see test_resampler_grouper.py::test_apply
             res = res[0]
     return res
-
-
-cdef class Slider:
-    """
-    Only handles contiguous data for now
-    """
-    cdef:
-        ndarray values, buf
-        Py_ssize_t stride
-        char *orig_data
-
-    def __init__(self, ndarray values, ndarray buf):
-        assert values.ndim == 1
-        assert values.dtype == buf.dtype
-
-        if not values.flags.contiguous:
-            values = values.copy()
-
-        self.values = values
-        self.buf = buf
-
-        self.stride = values.strides[0]
-        self.orig_data = self.buf.data
-
-        self.buf.data = self.values.data
-        self.buf.strides[0] = self.stride
-
-    cdef move(self, int start, int end):
-        """
-        For slicing
-        """
-        self.buf.data = self.values.data + self.stride * start
-        self.buf.shape[0] = end - start
-
-    cdef reset(self):
-        self.buf.data = self.orig_data
-        self.buf.shape[0] = 0
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -82,7 +82,6 @@
     BaseMaskedArray,
     BaseMaskedDtype,
 )
-import pandas.core.common as com
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.groupby import grouper
@@ -933,10 +932,6 @@ def agg_series(
             result = self._aggregate_series_pure_python(obj, func)
 
         elif not isinstance(obj._values, np.ndarray):
-            # _aggregate_series_fast would raise TypeError when
-            #  calling libreduction.Slider
-            # In the datetime64tz case it would incorrectly cast to tz-naive
-            # TODO: can we get a performant workaround for EAs backed by ndarray?
             result = self._aggregate_series_pure_python(obj, func)
 
             # we can preserve a little bit more aggressively with EA dtype
@@ -945,17 +940,8 @@ def agg_series(
             #  is sufficiently strict that it casts appropriately.
             preserve_dtype = True
 
-        elif obj.index._has_complex_internals:
-            # Preempt TypeError in _aggregate_series_fast
-            result = self._aggregate_series_pure_python(obj, func)
-
-        elif isinstance(self, BinGrouper):
-            # Not yet able to remove the BaseGrouper aggregate_series_fast,
-            #  as test_crosstab.test_categorical breaks without it
-            result = self._aggregate_series_pure_python(obj, func)
-
         else:
-            result = self._aggregate_series_fast(obj, func)
+            result = self._aggregate_series_pure_python(obj, func)
 
         npvalues = lib.maybe_convert_objects(result, try_float=False)
         if preserve_dtype:
@@ -964,23 +950,6 @@ def agg_series(
             out = npvalues
         return out
 
-    def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]:
-        # At this point we have already checked that
-        #  - obj.index is not a MultiIndex
-        #  - obj is backed by an ndarray, not ExtensionArray
-        #  - len(obj) > 0
-        func = com.is_builtin_func(func)
-
-        ids, _, ngroups = self.group_info
-
-        # avoids object / Series creation overhead
-        indexer = get_group_index_sorter(ids, ngroups)
-        obj = obj.take(indexer)
-        ids = ids.take(indexer)
-        sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
-        result, _ = sgrouper.get_result()
-        return result
-
     @final
     def _aggregate_series_pure_python(
         self, obj: Series, func: F
@@ -995,9 +964,6 @@ def _aggregate_series_pure_python(
         splitter = get_splitter(obj, ids, ngroups, axis=0)
 
         for i, group in enumerate(splitter):
-
-            # Each step of this loop corresponds to
-            #  libreduction._BaseGrouper._apply_to_group
             res = func(group)
             res = libreduction.extract_result(res)
 
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
@@ -1,58 +1,13 @@
 import numpy as np
 import pytest
 
-from pandas._libs import (
-    lib,
-    reduction as libreduction,
-)
+from pandas._libs import lib
 import pandas.util._test_decorators as td
 
 import pandas as pd
-from pandas import Series
 import pandas._testing as tm
 
 
-def test_series_grouper():
-    obj = Series(np.random.randn(10))
-
-    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
-
-    grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2)
-    result, counts = grouper.get_result()
-
-    expected = np.array([obj[3:6].mean(), obj[6:].mean()], dtype=object)
-    tm.assert_almost_equal(result, expected)
-
-    exp_counts = np.array([3, 4], dtype=np.int64)
-    tm.assert_almost_equal(counts, exp_counts)
-
-
-def test_series_grouper_result_length_difference():
-    # GH 40014
-    obj = Series(np.random.randn(10), dtype="float64")
-    obj.index = obj.index.astype("O")
-    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
-
-    grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2)
-    result, counts = grouper.get_result()
-
-    expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object)
-    tm.assert_equal(result, expected)
-
-    exp_counts = np.array([3, 4], dtype=np.int64)
-    tm.assert_equal(counts, exp_counts)
-
-
-def test_series_grouper_requires_nonempty_raises():
-    # GH#29500
-    obj = Series(np.random.randn(10))
-    dummy = obj.iloc[:0]
-    labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp)
-
-    with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"):
-        libreduction.SeriesGrouper(dummy, np.mean, labels, 2)
-
-
 def assert_block_lengths(x):
     assert len(x) == len(x._mgr.blocks[0].mgr_locs)
     return 0
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
@@ -811,7 +811,7 @@ def test_categoricals(a_dtype, b_dtype):
     a_is_cat = is_categorical_dtype(a.dtype)
     assert not a_is_cat or a.value_counts().loc[1] == 0
     result = crosstab(a, b, margins=True, dropna=False)
-    values = [[18, 16, 34], [0, 0, np.nan], [34, 32, 66], [52, 48, 100]]
+    values = [[18, 16, 34], [0, 0, 0], [34, 32, 66], [52, 48, 100]]
     expected = DataFrame(values, index, columns)
     if not a_is_cat:
         expected = expected.loc[[0, 2, "All"]]