Skip to content

[ArrayManager] Add libreduction frame Slider for ArrayManager #40171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 98 additions & 4 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ cdef class Slider:
def apply_frame_axis0(object frame, object f, object names,
const int64_t[:] starts, const int64_t[:] ends):
cdef:
BlockSlider slider
FrameSlider slider
Py_ssize_t i, n = len(starts)
list results
object piece
Expand All @@ -359,7 +359,10 @@ def apply_frame_axis0(object frame, object f, object names,

results = []

slider = BlockSlider(frame)
if hasattr(frame._mgr, "blocks"):
slider = <FrameSlider>BlockSlider(frame)
else:
slider = <FrameSlider>ArraySlider(frame)

mutated = False
item_cache = slider.dummy._item_cache
Expand Down Expand Up @@ -402,12 +405,23 @@ def apply_frame_axis0(object frame, object f, object names,
return results, mutated


cdef class BlockSlider:
# Common base type for the frame sliders: apply_frame_axis0 declares its
# slider as a FrameSlider and reads `dummy` from it; BlockSlider and
# ArraySlider both subclass it.
cdef class FrameSlider:
cdef:
# Declared here so it is reachable through a FrameSlider-typed reference;
# ArraySlider assigns it in its __init__.
object dummy

# No-op placeholder; ArraySlider defines its own move(start, end).
cdef move(self, int start, int end):
pass

# No-op placeholder; ArraySlider defines its own reset().
cdef reset(self):
pass


cdef class BlockSlider(FrameSlider):
"""
Only capable of sliding on axis=0
"""
cdef:
object frame, dummy, index, block
object frame, index, block
list blocks, blk_values
ndarray orig_blklocs, orig_blknos
ndarray values
Expand Down Expand Up @@ -491,3 +505,83 @@ cdef class BlockSlider:
mgr.blocks = self.blocks
mgr._blklocs = self.orig_blklocs
mgr._blknos = self.orig_blknos


cdef class ArraySlider(FrameSlider):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I doubt it will be big, but there might be some speedups available by applying @cython.final to both BlockSlider and ArraySlider.

"""
Only capable of sliding on axis=0
"""
cdef:
object frame, index
list arrays, orig_arrays
# ndarray values
Slider idx_slider
char **base_ptrs
int narrays
Py_ssize_t i

def __init__(self, object frame):
self.frame = frame
self.dummy = frame[:0]
self.index = self.dummy.index

# GH#35417 attributes we need to restore at each step in case
# the function modified them.
self.orig_arrays = self.dummy._mgr.arrays
self.arrays = list(self.dummy._mgr.arrays)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BlockSlider defines `self.blk_values = [block.values for block in self.dummy._mgr.blocks]`, which I think can now use the `arrays` property to match `self.arrays` here.


# for values in self.arrays:
# set_array_not_contiguous(values)

self.narrays = len(self.arrays)
# See the comment in indexes/base.py about _index_data.
# We need this for EA-backed indexes that have a reference to a 1-d
# ndarray like datetime / timedelta / period.
self.idx_slider = Slider(
self.frame.index._index_data, self.dummy.index._index_data)

self.base_ptrs = <char**>malloc(sizeof(char*) * self.narrays)
for i, arr in enumerate(self.arrays):
self.base_ptrs[i] = (<ndarray>arr).data

def __dealloc__(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this is the same in both classes; put it in the base class?

free(self.base_ptrs)

# Re-point every array in self.arrays, and the dummy's index, at the
# window [start, end) measured from the saved base pointers.
cdef move(self, int start, int end):
cdef:
ndarray arr
Py_ssize_t i

# Put the originally captured arrays back on the dummy's manager before
# touching any pointers.
self._restore_arrays()

# move arrays
for i in range(self.narrays):
arr = self.arrays[i]
# Offset the saved base pointer by `start` steps of the axis-0 stride ...
arr.data = self.base_ptrs[i] + arr.strides[0] * start
# ... and set the axis-0 length to the window size.
arr.shape[0] = end - start

# move and set the index
self.idx_slider.move(start, end)

# Hand the index slider's buffer to the dummy index, then clear the engine
# mapping and the index's _cache.
object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
self.index._engine.clear_mapping()
self.index._cache.clear() # e.g. inferred_freq must go

# Return every array in self.arrays to its saved base pointer with zero
# length along axis 0.
cdef reset(self):
cdef:
ndarray arr
Py_ssize_t i

# Put the originally captured arrays back on the dummy's manager first.
self._restore_arrays()

for i in range(self.narrays):
arr = self.arrays[i]
# Undo any offset applied by move().
arr.data = self.base_ptrs[i]
arr.shape[0] = 0

cdef _restore_arrays(self):
"""
Ensure that we have the original arrays.
"""
# Reassign the arrays object captured in __init__ onto the dummy's manager
# (GH#35417: the applied function may have modified it).
mgr = self.dummy._mgr
mgr.arrays = self.orig_arrays
13 changes: 9 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
)

import pandas.core.algorithms as algorithms
from pandas.core.arrays import ExtensionArray
from pandas.core.base import SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -214,11 +215,15 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
# if we pass EA instead of ndarray
# TODO: can we have a workaround for EAs backed by ndarray?
pass

elif isinstance(sdata._mgr, ArrayManager):
# TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
# for now -> relies on BlockManager internals
elif (
sdata.ndim == 2
and isinstance(sdata._mgr, ArrayManager)
and any(isinstance(arr, ExtensionArray) for arr in sdata._mgr.arrays)
):
# For ArrayManager, we also store datetime-like data as EAs, although
# their dtype is not an extension array dtype (so the above check passes)
pass

elif (
com.get_callable_name(f) not in base.plotting_methods
and isinstance(splitter, FrameSplitter)
Expand Down