diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index b27072aa66708..db897014a1061 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -454,142 +454,6 @@ cdef class Slider: self.buf.strides[0] = self.orig_stride -class InvalidApply(Exception): - pass - - -def apply_frame_axis0(object frame, object f, object names, - const int64_t[:] starts, const int64_t[:] ends): - cdef: - BlockSlider slider - Py_ssize_t i, n = len(starts) - list results - object piece - dict item_cache - - # We have already checked that we don't have a MultiIndex before calling - assert frame.index.nlevels == 1 - - results = [] - - slider = BlockSlider(frame) - - mutated = False - item_cache = slider.dummy._item_cache - try: - for i in range(n): - slider.move(starts[i], ends[i]) - - item_cache.clear() # ugh - chunk = slider.dummy - object.__setattr__(chunk, 'name', names[i]) - - try: - piece = f(chunk) - except Exception: - # We can't be more specific without knowing something about `f` - raise InvalidApply('Let this error raise above us') - - # Need to infer if low level index slider will cause segfaults - require_slow_apply = i == 0 and piece is chunk - try: - if piece.index is not chunk.index: - mutated = True - except AttributeError: - # `piece` might not have an index, could be e.g. an int - pass - - if not is_scalar(piece): - # Need to copy data to avoid appending references - try: - piece = piece.copy(deep="all") - except (TypeError, AttributeError): - piece = copy(piece) - - results.append(piece) - - # If the data was modified inplace we need to - # take the slow path to not risk segfaults - # we have already computed the first piece - if require_slow_apply: - break - finally: - slider.reset() - - return results, mutated - - -cdef class BlockSlider: - """ - Only capable of sliding on axis=0 - """ - - cdef public: - object frame, dummy, index - int nblocks - Slider idx_slider - list blocks - - cdef: - char **base_ptrs - - def __init__(self, frame): - self.frame = frame - self.dummy = frame[:0] - self.index = self.dummy.index - - self.blocks = [b.values for b in self.dummy._data.blocks] - - for x in self.blocks: - util.set_array_not_contiguous(x) - - self.nblocks = len(self.blocks) - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference to a 1-d - # ndarray like datetime / timedelta / period. - self.idx_slider = Slider( - self.frame.index._index_data, self.dummy.index._index_data) - - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) - for i, block in enumerate(self.blocks): - self.base_ptrs[i] = (block).data - - def __dealloc__(self): - free(self.base_ptrs) - - cdef move(self, int start, int end): - cdef: - ndarray arr - Py_ssize_t i - - # move blocks - for i in range(self.nblocks): - arr = self.blocks[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] + arr.strides[1] * start - arr.shape[1] = end - start - - # move and set the index - self.idx_slider.move(start, end) - - object.__setattr__(self.index, '_index_data', self.idx_slider.buf) - self.index._engine.clear_mapping() - - cdef reset(self): - cdef: - ndarray arr - Py_ssize_t i - - # reset blocks - for i in range(self.nblocks): - arr = self.blocks[i] - - # axis=1 is the frame's axis=0 - arr.data = self.base_ptrs[i] - arr.shape[1] = 0 - - def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None): """ diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7259268ac3f2b..c33748deef1c2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -43,7 +43,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base, grouper +from pandas.core.groupby import grouper from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( @@ -154,37 +154,6 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0): group_keys = self._get_group_keys() result_values = None - sdata: FrameOrSeries = splitter._get_sorted_data() - if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): - # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 - # if we pass EA instead of ndarray - # TODO: can we have a workaround for EAs backed by ndarray? - pass - - elif ( - com.get_callable_name(f) not in base.plotting_methods - and isinstance(splitter, FrameSplitter) - and axis == 0 - # fast_apply/libreduction doesn't allow non-numpy backed indexes - and not sdata.index._has_complex_internals - ): - try: - result_values, mutated = splitter.fast_apply(f, sdata, group_keys) - - except libreduction.InvalidApply as err: - # This Exception is raised if `f` triggers an exception - # but it is preferable to raise the exception in Python. - if "Let this error raise above us" not in str(err): - # TODO: can we infer anything about whether this is - # worth-retrying in pure-python? - raise - - else: - # If the fast apply path could be used we can return here. - # Otherwise we need to fall back to the slow implementation. - if len(result_values) == len(group_keys): - return group_keys, result_values, mutated - for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) @@ -925,11 +894,6 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series: class FrameSplitter(DataSplitter): - def fast_apply(self, f, sdata: FrameOrSeries, names): - # must return keys::list, values::list, mutated::bool - starts, ends = lib.generate_slices(self.slabels, self.ngroups) - return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: if self.axis == 0: return sdata.iloc[slice_obj] diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 18ad5d90b3f60..7afb449dbe37e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -82,39 +82,6 @@ def test_apply_trivial_fail(): tm.assert_frame_equal(result, expected) -def test_fast_apply(): - # make sure that fast apply is correctly called - # rather than raising any kind of error - # otherwise the python path will be callsed - # which slows things down - N = 1000 - labels = np.random.randint(0, 2000, size=N) - labels2 = np.random.randint(0, 3, size=N) - df = DataFrame( - { - "key": labels, - "key2": labels2, - "value1": np.random.randn(N), - "value2": ["foo", "bar", "baz", "qux"] * (N // 4), - } - ) - - def f(g): - return 1 - - g = df.groupby(["key", "key2"]) - - grouper = g.grouper - - splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) - group_keys = grouper._get_group_keys() - sdata = splitter._get_sorted_data() - - values, mutated = splitter.fast_apply(f, sdata, group_keys) - - assert not mutated - - @pytest.mark.parametrize( "df, group_names", [ diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 8e387e9202ef6..eaab2757097bf 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -367,7 +367,6 @@ def test_groupby_selection_with_methods(df): "ffill", "bfill", "pct_change", - "tshift", ] for m in methods: @@ -377,6 +376,11 @@ def test_groupby_selection_with_methods(df): # should always be frames! tm.assert_frame_equal(res, exp) + with pytest.raises(ValueError, match="Freq was not given"): + g.tshift() + with pytest.raises(ValueError, match="Freq was not given"): + g_exp.tshift() + # methods which aren't just .foo() tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) tm.assert_frame_equal(g.dtypes, g_exp.dtypes)