Skip to content

REF: remove fast_apply #32086

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 0 additions & 136 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -454,142 +454,6 @@ cdef class Slider:
self.buf.strides[0] = self.orig_stride


class InvalidApply(Exception):
    """Signal that the user-supplied function raised inside the cython
    fast-apply path, so the caller should re-raise from pure Python."""


def apply_frame_axis0(object frame, object f, object names,
                      const int64_t[:] starts, const int64_t[:] ends):
    """
    Apply ``f`` to consecutive row-slices of ``frame`` along axis 0.

    Parameters
    ----------
    frame : DataFrame
        Must have a single-level index (asserted below).
    f : callable
        Applied to each chunk; any exception it raises is converted to
        InvalidApply so the caller can distinguish user errors.
    names : sequence
        names[i] is set as the ``name`` attribute of chunk i before
        calling ``f``.
    starts, ends : const int64_t[:]
        Paired boundaries; chunk i covers rows [starts[i], ends[i]).

    Returns
    -------
    results : list
        ``f``'s result for each chunk processed (may be short if the
        slow path is required after the first chunk).
    mutated : bool
        True if some result's index is not the chunk's own index.
    """
    cdef:
        BlockSlider slider
        Py_ssize_t i, n = len(starts)
        list results
        object piece
        dict item_cache

    # We have already checked that we don't have a MultiIndex before calling
    assert frame.index.nlevels == 1

    results = []

    slider = BlockSlider(frame)

    mutated = False
    item_cache = slider.dummy._item_cache
    try:
        for i in range(n):
            slider.move(starts[i], ends[i])

            # The slider re-points the underlying buffers in place, so any
            # cached column objects are stale and must be dropped.
            item_cache.clear()  # ugh
            chunk = slider.dummy
            # object.__setattr__ bypasses the frame's own __setattr__
            object.__setattr__(chunk, 'name', names[i])

            try:
                piece = f(chunk)
            except Exception:
                # We can't be more specific without knowing something about `f`
                raise InvalidApply('Let this error raise above us')

            # Need to infer if low level index slider will cause segfaults
            require_slow_apply = i == 0 and piece is chunk
            try:
                if piece.index is not chunk.index:
                    mutated = True
            except AttributeError:
                # `piece` might not have an index, could be e.g. an int
                pass

            if not is_scalar(piece):
                # Need to copy data to avoid appending references
                try:
                    piece = piece.copy(deep="all")
                except (TypeError, AttributeError):
                    piece = copy(piece)

            results.append(piece)

            # If the data was modified inplace we need to
            # take the slow path to not risk segfaults
            # we have already computed the first piece
            if require_slow_apply:
                break
    finally:
        # Always restore the original buffers, even when f raised.
        slider.reset()

    return results, mutated


cdef class BlockSlider:
    """
    Only capable of sliding on axis=0.

    Presents successive row-windows of ``frame`` through the zero-length
    ``dummy`` frame by re-pointing the underlying block buffers in place
    (no copying).  ``move`` positions the window; ``reset`` must be
    called afterwards to restore the original pointers.
    """

    cdef public:
        # frame: the full source frame; dummy: zero-row slice reused as
        # the sliding window; index: dummy's index object.
        object frame, dummy, index
        int nblocks
        Slider idx_slider
        list blocks

    cdef:
        # Each block's original data pointer, saved so reset() can
        # restore it; freed in __dealloc__.
        char **base_ptrs

    def __init__(self, frame):
        self.frame = frame
        self.dummy = frame[:0]
        self.index = self.dummy.index

        self.blocks = [b.values for b in self.dummy._data.blocks]

        for x in self.blocks:
            # move() re-points .data below, so the arrays can no longer
            # be treated as contiguous.
            util.set_array_not_contiguous(x)

        self.nblocks = len(self.blocks)
        # See the comment in indexes/base.py about _index_data.
        # We need this for EA-backed indexes that have a reference to a 1-d
        # ndarray like datetime / timedelta / period.
        self.idx_slider = Slider(
            self.frame.index._index_data, self.dummy.index._index_data)

        self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
        for i, block in enumerate(self.blocks):
            self.base_ptrs[i] = (<ndarray>block).data

    def __dealloc__(self):
        free(self.base_ptrs)

    cdef move(self, int start, int end):
        """Point dummy's buffers at rows [start, end) of the frame."""
        cdef:
            ndarray arr
            Py_ssize_t i

        # move blocks
        for i in range(self.nblocks):
            arr = self.blocks[i]

            # axis=1 is the frame's axis=0
            arr.data = self.base_ptrs[i] + arr.strides[1] * start
            arr.shape[1] = end - start

        # move and set the index
        self.idx_slider.move(start, end)

        # object.__setattr__ bypasses any attribute protection on Index
        object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
        # Cached engine mappings refer to the old buffer; drop them.
        self.index._engine.clear_mapping()

    cdef reset(self):
        """Restore every block's original pointer and zero length."""
        cdef:
            ndarray arr
            Py_ssize_t i

        # reset blocks
        for i in range(self.nblocks):
            arr = self.blocks[i]

            # axis=1 is the frame's axis=0
            arr.data = self.base_ptrs[i]
            arr.shape[1] = 0


def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None):
"""

Expand Down
38 changes: 1 addition & 37 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base, grouper
from pandas.core.groupby import grouper
from pandas.core.indexes.api import Index, MultiIndex, ensure_index
from pandas.core.series import Series
from pandas.core.sorting import (
Expand Down Expand Up @@ -154,37 +154,6 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
group_keys = self._get_group_keys()
result_values = None

sdata: FrameOrSeries = splitter._get_sorted_data()
if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
# calling splitter.fast_apply will raise TypeError via apply_frame_axis0
# if we pass EA instead of ndarray
# TODO: can we have a workaround for EAs backed by ndarray?
pass

elif (
com.get_callable_name(f) not in base.plotting_methods
and isinstance(splitter, FrameSplitter)
and axis == 0
# fast_apply/libreduction doesn't allow non-numpy backed indexes
and not sdata.index._has_complex_internals
):
try:
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)

except libreduction.InvalidApply as err:
# This Exception is raised if `f` triggers an exception
# but it is preferable to raise the exception in Python.
if "Let this error raise above us" not in str(err):
# TODO: can we infer anything about whether this is
# worth-retrying in pure-python?
raise

else:
# If the fast apply path could be used we can return here.
# Otherwise we need to fall back to the slow implementation.
if len(result_values) == len(group_keys):
return group_keys, result_values, mutated

for key, (i, group) in zip(group_keys, splitter):
object.__setattr__(group, "name", key)

Expand Down Expand Up @@ -925,11 +894,6 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series:


class FrameSplitter(DataSplitter):
    def fast_apply(self, f, sdata: FrameOrSeries, names):
        """
        Apply ``f`` group-wise over the sorted data via the cython path.

        Delegates to ``libreduction.apply_frame_axis0``, which slides a
        buffer window over consecutive row slices of ``sdata`` and
        returns ``(values, mutated)``; it raises
        ``libreduction.InvalidApply`` when ``f`` itself raises.
        """
        # Compute the [start, end) row slice for each of the ngroups
        # groups from the sorted group labels.
        starts, ends = lib.generate_slices(self.slabels, self.ngroups)
        return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)

def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
if self.axis == 0:
return sdata.iloc[slice_obj]
Expand Down
33 changes: 0 additions & 33 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,39 +82,6 @@ def test_apply_trivial_fail():
tm.assert_frame_equal(result, expected)


def test_fast_apply():
    # Verify that the cython fast-apply path runs without raising;
    # otherwise groupby would fall back to the (slow) pure-python path.
    n_rows = 1000
    df = DataFrame(
        {
            "key": np.random.randint(0, 2000, size=n_rows),
            "key2": np.random.randint(0, 3, size=n_rows),
            "value1": np.random.randn(n_rows),
            "value2": ["foo", "bar", "baz", "qux"] * (n_rows // 4),
        }
    )

    def constant_one(g):
        return 1

    grouped = df.groupby(["key", "key2"])

    grouper = grouped.grouper

    splitter = grouper._get_splitter(grouped._selected_obj, axis=grouped.axis)
    group_keys = grouper._get_group_keys()
    sorted_data = splitter._get_sorted_data()

    values, mutated = splitter.fast_apply(constant_one, sorted_data, group_keys)

    assert not mutated


@pytest.mark.parametrize(
"df, group_names",
[
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/groupby/test_whitelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,6 @@ def test_groupby_selection_with_methods(df):
"ffill",
"bfill",
"pct_change",
"tshift",
]

for m in methods:
Expand All @@ -377,6 +376,11 @@ def test_groupby_selection_with_methods(df):
# should always be frames!
tm.assert_frame_equal(res, exp)

with pytest.raises(ValueError, match="Freq was not given"):
g.tshift()
with pytest.raises(ValueError, match="Freq was not given"):
g_exp.tshift()

# methods which aren't just .foo()
tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
Expand Down