Skip to content

Commit d037ff6

Browse files
authored
REF: remove libreduction.apply_frame_axis0 (#42992)
1 parent fe8276b commit d037ff6

File tree

4 files changed

+6
-256
lines changed

4 files changed

+6
-256
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ Performance improvements
178178
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
179179
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
180180
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
181+
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
181182
-
182183

183184
.. ---------------------------------------------------------------------------

pandas/_libs/reduction.pyx

+1-152
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@ from numpy cimport (
1515

1616
cnp.import_array()
1717

18-
from pandas._libs.util cimport (
19-
is_array,
20-
set_array_not_contiguous,
21-
)
18+
from pandas._libs.util cimport is_array
2219

2320
from pandas._libs.lib import is_scalar
2421

@@ -350,151 +347,3 @@ cdef class Slider:
350347
cdef reset(self):
351348
self.buf.data = self.orig_data
352349
self.buf.shape[0] = 0
353-
354-
355-
def apply_frame_axis0(object frame, object f, object names,
356-
const int64_t[:] starts, const int64_t[:] ends):
357-
cdef:
358-
BlockSlider slider
359-
Py_ssize_t i, n = len(starts)
360-
list results
361-
object piece
362-
dict item_cache
363-
364-
# We have already checked that we don't have a MultiIndex before calling
365-
assert frame.index.nlevels == 1
366-
367-
results = []
368-
369-
slider = BlockSlider(frame)
370-
371-
mutated = False
372-
item_cache = slider.dummy._item_cache
373-
try:
374-
for i in range(n):
375-
slider.move(starts[i], ends[i])
376-
377-
item_cache.clear() # ugh
378-
chunk = slider.dummy
379-
object.__setattr__(chunk, 'name', names[i])
380-
381-
piece = f(chunk)
382-
383-
# Need to infer if low level index slider will cause segfaults
384-
require_slow_apply = i == 0 and piece is chunk
385-
try:
386-
if piece.index is not chunk.index:
387-
mutated = True
388-
except AttributeError:
389-
# `piece` might not have an index, could be e.g. an int
390-
pass
391-
392-
if not is_scalar(piece):
393-
# Need to copy data to avoid appending references
394-
try:
395-
piece = piece.copy(deep="all")
396-
except (TypeError, AttributeError):
397-
pass
398-
399-
results.append(piece)
400-
401-
# If the data was modified inplace we need to
402-
# take the slow path to not risk segfaults
403-
# we have already computed the first piece
404-
if require_slow_apply:
405-
break
406-
finally:
407-
slider.reset()
408-
409-
return results, mutated
410-
411-
412-
cdef class BlockSlider:
413-
"""
414-
Only capable of sliding on axis=0
415-
"""
416-
cdef:
417-
object frame, dummy, index, block
418-
list blocks, blk_values
419-
ndarray orig_blklocs, orig_blknos
420-
ndarray values
421-
Slider idx_slider
422-
char **base_ptrs
423-
int nblocks
424-
Py_ssize_t i
425-
426-
def __init__(self, object frame):
427-
self.frame = frame
428-
self.dummy = frame[:0]
429-
self.index = self.dummy.index
430-
431-
# GH#35417 attributes we need to restore at each step in case
432-
# the function modified them.
433-
mgr = self.dummy._mgr
434-
self.orig_blklocs = mgr.blklocs
435-
self.orig_blknos = mgr.blknos
436-
self.blocks = [x for x in self.dummy._mgr.blocks]
437-
438-
self.blk_values = [block.values for block in self.dummy._mgr.blocks]
439-
440-
for values in self.blk_values:
441-
set_array_not_contiguous(values)
442-
443-
self.nblocks = len(self.blk_values)
444-
# See the comment in indexes/base.py about _index_data.
445-
# We need this for EA-backed indexes that have a reference to a 1-d
446-
# ndarray like datetime / timedelta / period.
447-
self.idx_slider = Slider(
448-
self.frame.index._index_data, self.dummy.index._index_data)
449-
450-
self.base_ptrs = <char**>malloc(sizeof(char*) * self.nblocks)
451-
for i, block in enumerate(self.blk_values):
452-
self.base_ptrs[i] = (<ndarray>block).data
453-
454-
def __dealloc__(self):
455-
free(self.base_ptrs)
456-
457-
cdef move(self, int start, int end):
458-
cdef:
459-
ndarray arr
460-
Py_ssize_t i
461-
462-
self._restore_blocks()
463-
464-
# move blocks
465-
for i in range(self.nblocks):
466-
arr = self.blk_values[i]
467-
468-
# axis=1 is the frame's axis=0
469-
arr.data = self.base_ptrs[i] + arr.strides[1] * start
470-
arr.shape[1] = end - start
471-
472-
# move and set the index
473-
self.idx_slider.move(start, end)
474-
475-
object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
476-
self.index._engine.clear_mapping()
477-
self.index._cache.clear() # e.g. inferred_freq must go
478-
479-
cdef reset(self):
480-
cdef:
481-
ndarray arr
482-
Py_ssize_t i
483-
484-
self._restore_blocks()
485-
486-
for i in range(self.nblocks):
487-
arr = self.blk_values[i]
488-
489-
# axis=1 is the frame's axis=0
490-
arr.data = self.base_ptrs[i]
491-
arr.shape[1] = 0
492-
493-
cdef _restore_blocks(self):
494-
"""
495-
Ensure that we have the original blocks, blknos, and blklocs.
496-
"""
497-
mgr = self.dummy._mgr
498-
mgr.blocks = tuple(self.blocks)
499-
mgr._blklocs = self.orig_blklocs
500-
mgr._blknos = self.orig_blknos

pandas/core/groupby/ops.py

+2-61
Original file line numberDiff line numberDiff line change
@@ -85,17 +85,13 @@
8585
import pandas.core.common as com
8686
from pandas.core.frame import DataFrame
8787
from pandas.core.generic import NDFrame
88-
from pandas.core.groupby import (
89-
base,
90-
grouper,
91-
)
88+
from pandas.core.groupby import grouper
9289
from pandas.core.indexes.api import (
9390
CategoricalIndex,
9491
Index,
9592
MultiIndex,
9693
ensure_index,
9794
)
98-
from pandas.core.internals import ArrayManager
9995
from pandas.core.series import Series
10096
from pandas.core.sorting import (
10197
compress_group_index,
@@ -718,60 +714,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
718714
mutated = self.mutated
719715
splitter = self._get_splitter(data, axis=axis)
720716
group_keys = self._get_group_keys()
721-
result_values = None
722-
723-
if data.ndim == 2 and any(
724-
isinstance(x, ExtensionArray) for x in data._iter_column_arrays()
725-
):
726-
# calling splitter.fast_apply will raise TypeError via apply_frame_axis0
727-
# if we pass EA instead of ndarray
728-
# TODO: can we have a workaround for EAs backed by ndarray?
729-
pass
730-
731-
elif isinstance(data._mgr, ArrayManager):
732-
# TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
733-
# for now -> relies on BlockManager internals
734-
pass
735-
elif (
736-
com.get_callable_name(f) not in base.plotting_methods
737-
and isinstance(splitter, FrameSplitter)
738-
and axis == 0
739-
# fast_apply/libreduction doesn't allow non-numpy backed indexes
740-
and not data.index._has_complex_internals
741-
):
742-
try:
743-
sdata = splitter.sorted_data
744-
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
745-
746-
except IndexError:
747-
# This is a rare case in which re-running in python-space may
748-
# make a difference, see test_apply_mutate.test_mutate_groups
749-
pass
750-
751-
else:
752-
# If the fast apply path could be used we can return here.
753-
# Otherwise we need to fall back to the slow implementation.
754-
if len(result_values) == len(group_keys):
755-
return group_keys, result_values, mutated
756-
757-
if result_values is None:
758-
# result_values is None if fast apply path wasn't taken
759-
# or fast apply aborted with an unexpected exception.
760-
# In either case, initialize the result list and perform
761-
# the slow iteration.
762-
result_values = []
763-
skip_first = False
764-
else:
765-
# If result_values is not None we're in the case that the
766-
# fast apply loop was broken prematurely but we have
767-
# already the result for the first group which we can reuse.
768-
skip_first = True
717+
result_values = []
769718

770719
# This calls DataSplitter.__iter__
771720
zipped = zip(group_keys, splitter)
772-
if skip_first:
773-
# pop the first item from the front of the iterator
774-
next(zipped)
775721

776722
for key, group in zipped:
777723
object.__setattr__(group, "name", key)
@@ -1290,11 +1236,6 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series:
12901236

12911237

12921238
class FrameSplitter(DataSplitter):
1293-
def fast_apply(self, f: F, sdata: FrameOrSeries, names):
1294-
# must return keys::list, values::list, mutated::bool
1295-
starts, ends = lib.generate_slices(self.slabels, self.ngroups)
1296-
return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
1297-
12981239
def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
12991240
# Fastpath equivalent to:
13001241
# if self.axis == 0:

pandas/tests/groupby/test_apply.py

+2-43
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
import pandas.util._test_decorators as td
11-
1210
import pandas as pd
1311
from pandas import (
1412
DataFrame,
@@ -86,40 +84,6 @@ def test_apply_trivial_fail():
8684
tm.assert_frame_equal(result, expected)
8785

8886

89-
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
90-
def test_fast_apply():
91-
# make sure that fast apply is correctly called
92-
# rather than raising any kind of error
93-
# otherwise the python path will be called
94-
# which slows things down
95-
N = 1000
96-
labels = np.random.randint(0, 2000, size=N)
97-
labels2 = np.random.randint(0, 3, size=N)
98-
df = DataFrame(
99-
{
100-
"key": labels,
101-
"key2": labels2,
102-
"value1": np.random.randn(N),
103-
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
104-
}
105-
)
106-
107-
def f(g):
108-
return 1
109-
110-
g = df.groupby(["key", "key2"])
111-
112-
grouper = g.grouper
113-
114-
splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
115-
group_keys = grouper._get_group_keys()
116-
sdata = splitter.sorted_data
117-
118-
values, mutated = splitter.fast_apply(f, sdata, group_keys)
119-
120-
assert not mutated
121-
122-
12387
@pytest.mark.parametrize(
12488
"df, group_names",
12589
[
@@ -216,8 +180,6 @@ def test_group_apply_once_per_group2(capsys):
216180
assert result == expected
217181

218182

219-
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
220-
@pytest.mark.xfail(reason="GH-34998")
221183
def test_apply_fast_slow_identical():
222184
# GH 31613
223185

@@ -237,16 +199,13 @@ def fast(group):
237199
tm.assert_frame_equal(fast_df, slow_df)
238200

239201

240-
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used
241202
@pytest.mark.parametrize(
242203
"func",
243204
[
244205
lambda x: x,
245-
pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")),
206+
lambda x: x[:],
246207
lambda x: x.copy(deep=False),
247-
pytest.param(
248-
lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998")
249-
),
208+
lambda x: x.copy(deep=True),
250209
],
251210
)
252211
def test_groupby_apply_identity_maybecopy_index_identical(func):

0 commit comments

Comments
 (0)