diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index b27072aa66708..fdcce6ad1df9e 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -27,126 +27,6 @@ cdef _check_result_array(object obj, Py_ssize_t cnt):
         raise ValueError('Function does not reduce')
 
 
-cdef class Reducer:
-    """
-    Performs generic reduction operation on a C or Fortran-contiguous ndarray
-    while avoiding ndarray construction overhead
-    """
-    cdef:
-        Py_ssize_t increment, chunksize, nresults
-        object dummy, f, labels, typ, ityp, index
-        ndarray arr
-
-    def __init__(self, ndarray arr, object f, axis=1, dummy=None, labels=None):
-        n, k = (<object>arr).shape
-
-        if axis == 0:
-            if not arr.flags.f_contiguous:
-                arr = arr.copy('F')
-
-            self.nresults = k
-            self.chunksize = n
-            self.increment = n * arr.dtype.itemsize
-        else:
-            if not arr.flags.c_contiguous:
-                arr = arr.copy('C')
-
-            self.nresults = n
-            self.chunksize = k
-            self.increment = k * arr.dtype.itemsize
-
-        self.f = f
-        self.arr = arr
-        self.labels = labels
-        self.dummy, self.typ, self.index, self.ityp = self._check_dummy(
-            dummy=dummy)
-
-    cdef _check_dummy(self, dummy=None):
-        cdef:
-            object index = None, typ = None, ityp = None
-
-        if dummy is None:
-            dummy = np.empty(self.chunksize, dtype=self.arr.dtype)
-
-            # our ref is stolen later since we are creating this array
-            # in cython, so increment first
-            Py_INCREF(dummy)
-
-        else:
-
-            # we passed a Series
-            typ = type(dummy)
-            index = dummy.index
-            dummy = dummy.values
-
-            if dummy.dtype != self.arr.dtype:
-                raise ValueError('Dummy array must be same dtype')
-            if len(dummy) != self.chunksize:
-                raise ValueError(f'Dummy array must be length {self.chunksize}')
-
-        return dummy, typ, index, ityp
-
-    def get_result(self):
-        cdef:
-            char* dummy_buf
-            ndarray arr, result, chunk
-            Py_ssize_t i
-            flatiter it
-            object res, name, labels
-            object cached_typ = None
-
-        arr = self.arr
-        chunk = self.dummy
-        dummy_buf = chunk.data
-        chunk.data = arr.data
-        labels = self.labels
-
-        result = np.empty(self.nresults, dtype='O')
-        it = <flatiter>PyArray_IterNew(result)
-
-        try:
-            for i in range(self.nresults):
-
-                # create the cached type
-                # each time just reassign the data
-                if i == 0:
-
-                    if self.typ is not None:
-                        # In this case, we also have self.index
-                        name = labels[i]
-                        cached_typ = self.typ(
-                            chunk, index=self.index, name=name, dtype=arr.dtype)
-
-                # use the cached_typ if possible
-                if cached_typ is not None:
-                    # In this case, we also have non-None labels
-                    name = labels[i]
-
-                    object.__setattr__(
-                        cached_typ._data._block, 'values', chunk)
-                    object.__setattr__(cached_typ, 'name', name)
-                    res = self.f(cached_typ)
-                else:
-                    res = self.f(chunk)
-
-                # TODO: reason for not squeezing here?
-                res = _extract_result(res, squeeze=False)
-                if i == 0:
-                    # On the first pass, we check the output shape to see
-                    # if this looks like a reduction.
-                    _check_result_array(res, len(self.dummy))
-
-                PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
-                chunk.data = chunk.data + self.increment
-                PyArray_ITER_NEXT(it)
-        finally:
-            # so we don't free the wrong memory
-            chunk.data = dummy_buf
-
-        result = maybe_convert_objects(result)
-        return result
-
-
 cdef class _BaseGrouper:
     cdef _check_dummy(self, dummy):
         # both values and index must be an ndarray!
@@ -588,30 +468,3 @@ cdef class BlockSlider:
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i]
             arr.shape[1] = 0
-
-
-def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None):
-    """
-
-    Parameters
-    -----------
-    arr : np.ndarray
-    f : function
-    axis : integer axis
-    dummy : type of reduced output (series)
-    labels : Index or None
-    """
-
-    # We either have both dummy and labels, or neither of them
-    if (labels is None) ^ (dummy is None):
-        raise ValueError("Must pass either dummy and labels, or neither")
-
-    if labels is not None:
-        # Caller is responsible for ensuring we don't have MultiIndex
-        assert labels.nlevels == 1
-
-        # pass as an ndarray/ExtensionArray
-        labels = labels._values
-
-    reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels)
-    return reducer.get_result()
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 70e0a129c055f..23b0056261fbb 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -4,16 +4,10 @@
 
 import numpy as np
 
-from pandas._libs import reduction as libreduction
 from pandas._typing import Axis
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.common import (
-    is_dict_like,
-    is_extension_array_dtype,
-    is_list_like,
-    is_sequence,
-)
+from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
 from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
@@ -221,15 +215,7 @@ def apply_empty_result(self):
 
     def apply_raw(self):
         """ apply to the values as a numpy array """
-        try:
-            result = libreduction.compute_reduction(self.values, self.f, axis=self.axis)
-        except ValueError as err:
-            if "Function does not reduce" not in str(err):
-                # catch only ValueError raised intentionally in libreduction
-                raise
-            # We expect np.apply_along_axis to give a two-dimensional result, or
-            # also raise.
-            result = np.apply_along_axis(self.f, self.axis, self.values)
+        result = np.apply_along_axis(self.f, self.axis, self.values)
 
         # TODO: mixed type case
         if result.ndim == 2:
@@ -265,51 +251,6 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
         return result
 
     def apply_standard(self):
-
-        # try to reduce first (by default)
-        # this only matters if the reduction in values is of different dtype
-        # e.g. if we want to apply to a SparseFrame, then can't directly reduce
-
-        # we cannot reduce using non-numpy dtypes,
-        # as demonstrated in gh-12244
-        if (
-            self.result_type in ["reduce", None]
-            and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow dtypes where setting _index_data will break
-            # ExtensionArray values, see GH#31182
-            and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any()
-            # Disallow complex_internals since libreduction shortcut raises a TypeError
-            and not self.agg_axis._has_complex_internals
-        ):
-
-            values = self.values
-            index = self.obj._get_axis(self.axis)
-            labels = self.agg_axis
-            empty_arr = np.empty(len(index), dtype=values.dtype)
-
-            # Preserve subclass for e.g. test_subclassed_apply
-            dummy = self.obj._constructor_sliced(
-                empty_arr, index=index, dtype=values.dtype
-            )
-
-            try:
-                result = libreduction.compute_reduction(
-                    values, self.f, axis=self.axis, dummy=dummy, labels=labels
-                )
-            except ValueError as err:
-                if "Function does not reduce" not in str(err):
-                    # catch only ValueError raised intentionally in libreduction
-                    raise
-            except TypeError:
-                # e.g. test_apply_ignore_failures we just ignore
-                if not self.ignore_failures:
-                    raise
-            except ZeroDivisionError:
-                # reached via numexpr; fall back to python implementation
-                pass
-            else:
-                return self.obj._constructor_sliced(result, index=labels)
-
         # compute the result using the series generator
         results, res_index = self.apply_series_generator()
 
@@ -350,9 +291,23 @@ def wrap_results(
         from pandas import Series
 
         # see if we can infer the results
-        if len(results) > 0 and 0 in results and is_sequence(results[0]):
+        if (
+            len(results) > 0
+            and 0 in results
+            and is_sequence(results[0])
+            and (not isinstance(results[0], dict) or self.result_type == "expand")
+        ):
 
-            return self.wrap_results_for_axis(results, res_index)
+            try:
+                return self.wrap_results_for_axis(results, res_index)
+            except ValueError as err:
+                # See e.g. test_agg_listlike_result
+                # FIXME: kludge for ragged array
+                if "arrays must all be same length" in str(err):
+                    # Fall back to constructing Series
+                    pass
+                else:
+                    raise
 
         # dict of scalars
@@ -440,9 +395,8 @@ def wrap_results_for_axis(
 
         # we have a non-series and don't want inference
         elif not isinstance(results[0], ABCSeries):
-            from pandas import Series
-
-            result = Series(results)
+            result = self.obj._constructor_sliced(results)
             result.index = res_index
 
         # we may want to infer results
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
index ff74d374e5e3f..b0e354050b9a4 100644
--- a/pandas/tests/groupby/test_bin_groupby.py
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -5,7 +5,7 @@
 
 from pandas.core.dtypes.common import ensure_int64
 
-from pandas import Index, Series, isna
+from pandas import Series, isna
 import pandas._testing as tm
 
@@ -111,37 +111,3 @@ def _ohlc(group):
 
 class TestMoments:
     pass
-
-
-class TestReducer:
-    def test_int_index(self):
-        arr = np.random.randn(100, 4)
-
-        msg = "Must pass either dummy and labels, or neither"
-        # we must pass either both labels and dummy, or neither
-        with pytest.raises(ValueError, match=msg):
-            libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4)))
-
-        with pytest.raises(ValueError, match=msg):
-            libreduction.compute_reduction(
-                arr, np.sum, axis=1, labels=Index(np.arange(100))
-            )
-
-        dummy = Series(0.0, index=np.arange(100))
-        result = libreduction.compute_reduction(
-            arr, np.sum, dummy=dummy, labels=Index(np.arange(4))
-        )
-        expected = arr.sum(0)
-        tm.assert_almost_equal(result, expected)
-
-        dummy = Series(0.0, index=np.arange(4))
-        result = libreduction.compute_reduction(
-            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
-        )
-        expected = arr.sum(1)
-        tm.assert_almost_equal(result, expected)
-
-        result = libreduction.compute_reduction(
-            arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100))
-        )
-        tm.assert_almost_equal(result, expected)
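
For context, a minimal sketch (not part of this patch) of the behavior the simplified apply_raw now has: with the libreduction fast path removed, DataFrame.apply with raw=True always reduces via np.apply_along_axis, as the apply.py hunk above shows. The frame below is an arbitrary example of my own, not taken from the patch.

    import numpy as np
    import pandas as pd

    # Any plain numeric DataFrame behaves the same way.
    df = pd.DataFrame(np.arange(6).reshape(3, 2), columns=["a", "b"])

    # raw=True hands each column to the function as a bare ndarray;
    # after this patch that path is always np.apply_along_axis.
    result = df.apply(np.sum, raw=True)

    # The equivalent call on the underlying values, as in the new apply_raw.
    expected = np.apply_along_axis(np.sum, 0, df.values)

    assert (result.to_numpy() == expected).all()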