From 04a8abfe5187fd8d78cb0e28b23748e499ba5dc0 Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Jun 2020 17:33:59 -0700
Subject: [PATCH 1/2] CLN: remove libreduction.Reducer

---
 pandas/_libs/reduction.pyx               | 174 +----------------------
 pandas/tests/groupby/test_bin_groupby.py |  36 +----
 2 files changed, 2 insertions(+), 208 deletions(-)

diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 58de682c56d55..97c491776f831 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -1,17 +1,12 @@
 from copy import copy
 
 from cython import Py_ssize_t
 
-from cpython.ref cimport Py_INCREF
 from libc.stdlib cimport malloc, free
 
 import numpy as np
 
 cimport numpy as cnp
-from numpy cimport (ndarray,
-                    int64_t,
-                    PyArray_SETITEM,
-                    PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew,
-                    flatiter)
+from numpy cimport ndarray, int64_t
 
 cnp.import_array()
@@ -26,146 +21,6 @@ cdef _check_result_array(object obj, Py_ssize_t cnt):
         raise ValueError('Function does not reduce')
 
 
-cdef class Reducer:
-    """
-    Performs generic reduction operation on a C or Fortran-contiguous ndarray
-    while avoiding ndarray construction overhead
-    """
-    cdef:
-        Py_ssize_t increment, chunksize, nresults
-        object dummy, f, labels, typ, ityp, index
-        ndarray arr
-
-    def __init__(
-        self, ndarray arr, object f, int axis=1, object dummy=None, object labels=None
-    ):
-        cdef:
-            Py_ssize_t n, k
-
-        n, k = (<object>arr).shape
-
-        if axis == 0:
-            if not arr.flags.f_contiguous:
-                arr = arr.copy('F')
-
-            self.nresults = k
-            self.chunksize = n
-            self.increment = n * arr.dtype.itemsize
-        else:
-            if not arr.flags.c_contiguous:
-                arr = arr.copy('C')
-
-            self.nresults = n
-            self.chunksize = k
-            self.increment = k * arr.dtype.itemsize
-
-        self.f = f
-        self.arr = arr
-        self.labels = labels
-        self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
-            dummy=dummy)
-
-    cdef _check_dummy(self, object dummy=None):
-        cdef:
-            object index = None, typ = None, ityp = None
-
-        if dummy is None:
-            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
-
-            # our ref is stolen later since we are creating this array
-            # in cython, so increment first
-            Py_INCREF(dummy)
-
-        else:
-
-            # we passed a Series
-            typ = type(dummy)
-            index = dummy.index
-            dummy = dummy.values
-
-            if dummy.dtype != self.arr.dtype:
-                raise ValueError('Dummy array must be same dtype')
-            if len(dummy) != self.chunksize:
-                raise ValueError(f'Dummy array must be length {self.chunksize}')
-
-        return dummy, typ, index, ityp
-
-    def get_result(self):
-        cdef:
-            char* dummy_buf
-            ndarray arr, result, chunk
-            Py_ssize_t i
-            flatiter it
-            object res, name, labels
-            object cached_typ = None
-
-        arr = self.arr
-        chunk = self.dummy
-        dummy_buf = chunk.data
-        chunk.data = arr.data
-        labels = self.labels
-
-        result = np.empty(self.nresults, dtype='O')
-        it = <flatiter>PyArray_IterNew(result)
-        reduction_success = True
-
-        try:
-            for i in range(self.nresults):
-
-                # create the cached type
-                # each time just reassign the data
-                if i == 0:
-
-                    if self.typ is not None:
-                        # In this case, we also have self.index
-                        name = labels[i]
-                        cached_typ = self.typ(
-                            chunk, index=self.index, name=name, dtype=arr.dtype)
-
-                # use the cached_typ if possible
-                if cached_typ is not None:
-                    # In this case, we also have non-None labels
-                    name = labels[i]
-
-                    object.__setattr__(
-                        cached_typ._mgr._block, 'values', chunk)
-                    object.__setattr__(cached_typ, 'name', name)
-                    res = self.f(cached_typ)
-                else:
-                    res = self.f(chunk)
-
-                # TODO: reason for not squeezing here?
-                extracted_res = _extract_result(res, squeeze=False)
-                if i == 0:
-                    # On the first pass, we check the output shape to see
-                    #  if this looks like a reduction.
-                    # If it does not, return the computed value to be used by the
-                    #  pure python implementation,
-                    #  so the function won't be called twice on the same object,
-                    #  and side effects would occur twice
-                    try:
-                        _check_result_array(extracted_res, len(self.dummy))
-                    except ValueError as err:
-                        if "Function does not reduce" not in str(err):
-                            # catch only the specific exception
-                            raise
-
-                        reduction_success = False
-                        PyArray_SETITEM(result, PyArray_ITER_DATA(it), copy(res))
-                        break
-
-                PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res)
-                chunk.data = chunk.data + self.increment
-                PyArray_ITER_NEXT(it)
-
-        finally:
-            # so we don't free the wrong memory
-            chunk.data = dummy_buf
-
-        result = maybe_convert_objects(result)
-        return result, reduction_success
-
-
 cdef class _BaseGrouper:
     cdef _check_dummy(self, object dummy):
         # both values and index must be an ndarray!
@@ -610,30 +465,3 @@ cdef class BlockSlider:
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i]
             arr.shape[1] = 0
-
-
-def compute_reduction(arr: ndarray, f, axis: int = 0, dummy=None, labels=None):
-    """
-
-    Parameters
-    -----------
-    arr : np.ndarray
-    f : function
-    axis : integer axis
-    dummy : type of reduced output (series)
-    labels : Index or None
-    """
-
-    # We either have both dummy and labels, or neither of them
-    if (labels is None) ^ (dummy is None):
-        raise ValueError("Must pass either dummy and labels, or neither")
-
-    if labels is not None:
-        # Caller is responsible for ensuring we don't have MultiIndex
-        assert labels.nlevels == 1
-
-        # pass as an ndarray/ExtensionArray
-        labels = labels._values
-
-    reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels)
-    return reducer.get_result()
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index 9df45f7a23f55..f20eed4575e91 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -6,7 +6,7 @@
 from pandas.core.dtypes.common import ensure_int64
 
 import pandas as pd
-from pandas import Index, Series, isna
+from pandas import Series, isna
 import pandas._testing as tm
 
 
@@ -136,37 +136,3 @@ def _ohlc(group):
 
 class TestMoments:
     pass
-
-
-class TestReducer:
-    def test_int_index(self):
-        arr = np.random.randn(100, 4)
-
-        msg = "Must pass either dummy and labels, or neither"
-        # we must pass either both labels and dummy, or neither
-        with pytest.raises(ValueError, match=msg):
-            libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4)))
-
-        with pytest.raises(ValueError, match=msg):
-            libreduction.compute_reduction(
-                arr, np.sum, axis=1, labels=Index(np.arange(100))
-            )
-
-        dummy = Series(0.0, index=np.arange(100))
-        result, _ = libreduction.compute_reduction(
-            arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
-        )
-        expected = arr.sum(0)
-        tm.assert_almost_equal(result, expected)
-
-        dummy = Series(0.0, index=np.arange(4))
-        result, _ = libreduction.compute_reduction(
-            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
-        )
-        expected = arr.sum(1)
-        tm.assert_almost_equal(result, expected)
-
-        result, _ = libreduction.compute_reduction(
-            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
-        )
-        tm.assert_almost_equal(result, expected)

From afcd3d9cdf6e69733b87f2f8bd6fe30df03cee1f Mon Sep 17 00:00:00 2001
From: Brock
Date: Thu, 25 Jun 2020 18:43:37 -0700
Subject: [PATCH 2/2] remove unused import

---
 pandas/core/apply.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 102c457f94a95..9c223d66b727b 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -6,7 +6,6 @@
 
 from pandas._config import option_context
 
-from pandas._libs import reduction as libreduction
 from pandas._typing import Axis
 from pandas.util._decorators import cache_readonly
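
Note for context: the import removed in PATCH 2/2 shows that pandas/core/apply.py no longer
called into this module, so Reducer and compute_reduction were already dead code and frame
reductions take the pure-Python apply path instead. As a minimal sketch (not part of the
patch), the reductions that the removed TestReducer.test_int_index exercised can be
reproduced through the public DataFrame.apply API; the array shape and np.sum come from the
removed test, while the names arr, df, col_sums, and row_sums are illustrative:

    import numpy as np
    import pandas as pd

    arr = np.random.randn(100, 4)  # same shape as in the removed test
    df = pd.DataFrame(arr)

    # column-wise reduction: the compute_reduction(..., axis=0) case
    col_sums = df.apply(np.sum, axis=0)
    np.testing.assert_allclose(col_sums.to_numpy(), arr.sum(0))

    # row-wise reduction: the axis=1 case
    row_sums = df.apply(np.sum, axis=1)
    np.testing.assert_allclose(row_sums.to_numpy(), arr.sum(1))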