From c3ab765e07db6076c1bc691db9843ae4de3f5e94 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Sep 2021 14:45:28 -0700 Subject: [PATCH] REF: remove libreduction.SeriesGrouper --- pandas/_libs/reduction.pyx | 197 ----------------------- pandas/core/groupby/ops.py | 36 +---- pandas/tests/groupby/test_bin_groupby.py | 47 +----- pandas/tests/reshape/test_crosstab.py | 2 +- 4 files changed, 3 insertions(+), 279 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 77fd4d94d05ac..f6c404c07c7e4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -34,166 +34,6 @@ cpdef check_result_array(object obj, object dtype): raise ValueError("Must produce aggregated value") -cdef class _BaseGrouper: - cdef _check_dummy(self, object dummy): - # both values and index must be an ndarray! - - values = dummy.values - # GH 23683: datetimetz types are equivalent to datetime types here - if (dummy.dtype != self.arr.dtype - and values.dtype != self.arr.dtype): - raise ValueError('Dummy array must be same dtype') - if is_array(values) and not values.flags.contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy() - index = dummy.index.values - if not index.flags.contiguous: - index = index.copy() - - return values, index - - cdef _init_dummy_series_and_index(self, Slider islider, Slider vslider): - """ - Create Series and Index objects that we will alter in-place while iterating. - """ - cached_index = self.ityp(islider.buf, dtype=self.idtype) - cached_series = self.typ( - vslider.buf, dtype=vslider.buf.dtype, index=cached_index, name=self.name - ) - return cached_index, cached_series - - cdef inline _update_cached_objs(self, object cached_series, object cached_index, - Slider islider, Slider vslider): - cached_index._engine.clear_mapping() - cached_index._cache.clear() # e.g. inferred_freq must go - cached_series._mgr.set_values(vslider.buf) - - cdef inline object _apply_to_group(self, - object cached_series, object cached_index, - bint initialized): - """ - Call self.f on our new group, then update to the next group. - """ - cdef: - object res - - # NB: we assume that _update_cached_objs has already cleared cleared - # the cache and engine mapping - res = self.f(cached_series) - res = extract_result(res) - if not initialized: - # On the first pass, we check the output shape to see - # if this looks like a reduction. - initialized = True - check_result_array(res, cached_series.dtype) - - return res, initialized - - -cdef class SeriesGrouper(_BaseGrouper): - """ - Performs generic grouping operation while avoiding ndarray construction - overhead - """ - cdef: - Py_ssize_t nresults, ngroups - - cdef public: - ndarray arr, index, dummy_arr, dummy_index - object f, labels, values, typ, ityp, name, idtype - - def __init__(self, object series, object f, ndarray[intp_t] labels, - Py_ssize_t ngroups): - - if len(series) == 0: - # get_result would never assign `result` - raise ValueError("SeriesGrouper requires non-empty `series`") - - self.labels = labels - self.f = f - - values = series.values - if is_array(values) and not values.flags.c_contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy('C') - self.arr = values - self.typ = series._constructor - self.ityp = series.index._constructor - self.idtype = series.index.dtype - self.index = series.index.values - self.name = series.name - - dummy = series.iloc[:0] - self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - self.ngroups = ngroups - - def get_result(self): - cdef: - # Define result to avoid UnboundLocalError - ndarray arr, result = None - ndarray[intp_t] labels - ndarray[int64_t] counts - Py_ssize_t i, n, group_size, lab, start, end - object res - bint initialized = 0 - Slider vslider, islider - object cached_series = None, cached_index = None - - labels = self.labels - counts = np.zeros(self.ngroups, dtype=np.int64) - group_size = 0 - n = len(self.arr) - - vslider = Slider(self.arr, self.dummy_arr) - islider = Slider(self.index, self.dummy_index) - - result = np.empty(self.ngroups, dtype='O') - - cached_index, cached_series = self._init_dummy_series_and_index( - islider, vslider - ) - - start = 0 - try: - for i in range(n): - group_size += 1 - - lab = labels[i] - - if i == n - 1 or lab != labels[i + 1]: - if lab == -1: - start += group_size - group_size = 0 - continue - - end = start + group_size - islider.move(start, end) - vslider.move(start, end) - - self._update_cached_objs( - cached_series, cached_index, islider, vslider) - - res, initialized = self._apply_to_group(cached_series, cached_index, - initialized) - - start += group_size - - result[lab] = res - counts[lab] = group_size - group_size = 0 - - finally: - # so we don't free the wrong memory - islider.reset() - vslider.reset() - - # We check for empty series in the constructor, so should always - # have result initialized by this point. - assert initialized, "`result` has not been initialized." - - return result, counts - - cpdef inline extract_result(object res): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ @@ -208,40 +48,3 @@ cpdef inline extract_result(object res): # see test_resampler_grouper.py::test_apply res = res[0] return res - - -cdef class Slider: - """ - Only handles contiguous data for now - """ - cdef: - ndarray values, buf - Py_ssize_t stride - char *orig_data - - def __init__(self, ndarray values, ndarray buf): - assert values.ndim == 1 - assert values.dtype == buf.dtype - - if not values.flags.contiguous: - values = values.copy() - - self.values = values - self.buf = buf - - self.stride = values.strides[0] - self.orig_data = self.buf.data - - self.buf.data = self.values.data - self.buf.strides[0] = self.stride - - cdef move(self, int start, int end): - """ - For slicing - """ - self.buf.data = self.values.data + self.stride * start - self.buf.shape[0] = end - start - - cdef reset(self): - self.buf.data = self.orig_data - self.buf.shape[0] = 0 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d5569fb5f8a96..9278e598cf728 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -82,7 +82,6 @@ BaseMaskedArray, BaseMaskedDtype, ) -import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import grouper @@ -933,10 +932,6 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) elif not isinstance(obj._values, np.ndarray): - # _aggregate_series_fast would raise TypeError when - # calling libreduction.Slider - # In the datetime64tz case it would incorrectly cast to tz-naive - # TODO: can we get a performant workaround for EAs backed by ndarray? result = self._aggregate_series_pure_python(obj, func) # we can preserve a little bit more aggressively with EA dtype @@ -945,17 +940,8 @@ def agg_series( # is sufficiently strict that it casts appropriately. preserve_dtype = True - elif obj.index._has_complex_internals: - # Preempt TypeError in _aggregate_series_fast - result = self._aggregate_series_pure_python(obj, func) - - elif isinstance(self, BinGrouper): - # Not yet able to remove the BaseGrouper aggregate_series_fast, - # as test_crosstab.test_categorical breaks without it - result = self._aggregate_series_pure_python(obj, func) - else: - result = self._aggregate_series_fast(obj, func) + result = self._aggregate_series_pure_python(obj, func) npvalues = lib.maybe_convert_objects(result, try_float=False) if preserve_dtype: @@ -964,23 +950,6 @@ def agg_series( out = npvalues return out - def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]: - # At this point we have already checked that - # - obj.index is not a MultiIndex - # - obj is backed by an ndarray, not ExtensionArray - # - len(obj) > 0 - func = com.is_builtin_func(func) - - ids, _, ngroups = self.group_info - - # avoids object / Series creation overhead - indexer = get_group_index_sorter(ids, ngroups) - obj = obj.take(indexer) - ids = ids.take(indexer) - sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups) - result, _ = sgrouper.get_result() - return result - @final def _aggregate_series_pure_python( self, obj: Series, func: F @@ -995,9 +964,6 @@ def _aggregate_series_pure_python( splitter = get_splitter(obj, ids, ngroups, axis=0) for i, group in enumerate(splitter): - - # Each step of this loop corresponds to - # libreduction._BaseGrouper._apply_to_group res = func(group) res = libreduction.extract_result(res) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 789c9cf33289e..8c30836f2cf91 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -1,58 +1,13 @@ import numpy as np import pytest -from pandas._libs import ( - lib, - reduction as libreduction, -) +from pandas._libs import lib import pandas.util._test_decorators as td import pandas as pd -from pandas import Series import pandas._testing as tm -def test_series_grouper(): - obj = Series(np.random.randn(10)) - - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) - - grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2) - result, counts = grouper.get_result() - - expected = np.array([obj[3:6].mean(), obj[6:].mean()], dtype=object) - tm.assert_almost_equal(result, expected) - - exp_counts = np.array([3, 4], dtype=np.int64) - tm.assert_almost_equal(counts, exp_counts) - - -def test_series_grouper_result_length_difference(): - # GH 40014 - obj = Series(np.random.randn(10), dtype="float64") - obj.index = obj.index.astype("O") - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) - - grouper = libreduction.SeriesGrouper(obj, lambda x: all(x > 0), labels, 2) - result, counts = grouper.get_result() - - expected = np.array([all(obj[3:6] > 0), all(obj[6:] > 0)], dtype=object) - tm.assert_equal(result, expected) - - exp_counts = np.array([3, 4], dtype=np.int64) - tm.assert_equal(counts, exp_counts) - - -def test_series_grouper_requires_nonempty_raises(): - # GH#29500 - obj = Series(np.random.randn(10)) - dummy = obj.iloc[:0] - labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.intp) - - with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): - libreduction.SeriesGrouper(dummy, np.mean, labels, 2) - - def assert_block_lengths(x): assert len(x) == len(x._mgr.blocks[0].mgr_locs) return 0 diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 62fd93026d5e2..f252b5e1ceedf 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -811,7 +811,7 @@ def test_categoricals(a_dtype, b_dtype): a_is_cat = is_categorical_dtype(a.dtype) assert not a_is_cat or a.value_counts().loc[1] == 0 result = crosstab(a, b, margins=True, dropna=False) - values = [[18, 16, 34], [0, 0, np.nan], [34, 32, 66], [52, 48, 100]] + values = [[18, 16, 34], [0, 0, 0], [34, 32, 66], [52, 48, 100]] expected = DataFrame(values, index, columns) if not a_is_cat: expected = expected.loc[[0, 2, "All"]]