Skip to content

BLD: remove blockslider #34014 #37006

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 40 additions & 131 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
@@ -1,24 +1,20 @@
from copy import copy

from cython import Py_ssize_t

from libc.stdlib cimport free, malloc

import numpy as np

cimport numpy as cnp
from numpy cimport int64_t, ndarray

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.util cimport is_array

from pandas._libs.lib import is_scalar, maybe_convert_objects


cpdef check_result_array(object obj, Py_ssize_t cnt):

if (util.is_array(obj) or
if (is_array(obj) or
(isinstance(obj, list) and len(obj) == cnt) or
getattr(obj, 'shape', None) == (cnt,)):
raise ValueError('Must produce aggregated value')
Expand All @@ -33,7 +29,7 @@ cdef class _BaseGrouper:
if (dummy.dtype != self.arr.dtype
and values.dtype != self.arr.dtype):
raise ValueError('Dummy array must be same dtype')
if util.is_array(values) and not values.flags.contiguous:
if is_array(values) and not values.flags.contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy()
index = dummy.index.values
Expand Down Expand Up @@ -106,7 +102,7 @@ cdef class SeriesBinGrouper(_BaseGrouper):
self.f = f

values = series.values
if util.is_array(values) and not values.flags.c_contiguous:
if is_array(values) and not values.flags.c_contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy('C')
self.arr = values
Expand Down Expand Up @@ -204,7 +200,7 @@ cdef class SeriesGrouper(_BaseGrouper):
self.f = f

values = series.values
if util.is_array(values) and not values.flags.c_contiguous:
if is_array(values) and not values.flags.c_contiguous:
# e.g. Categorical has no `flags` attribute
values = values.copy('C')
self.arr = values
Expand Down Expand Up @@ -288,9 +284,9 @@ cpdef inline extract_result(object res, bint squeeze=True):
res = res._values
if squeeze and res.ndim == 1 and len(res) == 1:
res = res[0]
if hasattr(res, 'values') and util.is_array(res.values):
if hasattr(res, 'values') and is_array(res.values):
res = res.values
if util.is_array(res):
if is_array(res):
if res.ndim == 0:
res = res.item()
elif squeeze and res.ndim == 1 and len(res) == 1:
Expand All @@ -304,7 +300,7 @@ cdef class Slider:
"""
cdef:
ndarray values, buf
Py_ssize_t stride, orig_len, orig_stride
Py_ssize_t stride
char *orig_data

def __init__(self, ndarray values, ndarray buf):
Expand All @@ -319,8 +315,6 @@ cdef class Slider:
self.stride = values.strides[0]

self.orig_data = self.buf.data
self.orig_len = self.buf.shape[0]
self.orig_stride = self.buf.strides[0]

self.buf.data = self.values.data
self.buf.strides[0] = self.stride
Expand All @@ -333,10 +327,8 @@ cdef class Slider:
self.buf.shape[0] = end - start

cdef reset(self):

self.buf.shape[0] = self.orig_len
self.buf.data = self.orig_data
self.buf.strides[0] = self.orig_stride
self.buf.shape[0] = 0


class InvalidApply(Exception):
Expand All @@ -346,7 +338,6 @@ class InvalidApply(Exception):
def apply_frame_axis0(object frame, object f, object names,
const int64_t[:] starts, const int64_t[:] ends):
cdef:
BlockSlider slider
Py_ssize_t i, n = len(starts)
list results
object piece
Expand All @@ -357,124 +348,42 @@ def apply_frame_axis0(object frame, object f, object names,

results = []

slider = BlockSlider(frame)

mutated = False
item_cache = slider.dummy._item_cache
try:
for i in range(n):
slider.move(starts[i], ends[i])
item_cache = frame._item_cache

item_cache.clear() # ugh
chunk = slider.dummy
object.__setattr__(chunk, 'name', names[i])
for i in range(n):
item_cache.clear() # ugh
chunk = frame[starts[i]:ends[i]]
object.__setattr__(chunk, 'name', names[i])

try:
piece = f(chunk)
except Exception:
# We can't be more specific without knowing something about `f`
raise InvalidApply('Let this error raise above us')
try:
piece = f(chunk)
except Exception:
# We can't be more specific without knowing something about `f`
raise InvalidApply('Let this error raise above us')

# Need to infer if low level index slider will cause segfaults
require_slow_apply = i == 0 and piece is chunk
# Need to infer if low level index slider will cause segfaults
require_slow_apply = i == 0 and piece is chunk
try:
if not piece.index is chunk.index:
mutated = True
except AttributeError:
# `piece` might not have an index, could be e.g. an int
pass

if not is_scalar(piece):
# Need to copy data to avoid appending references
try:
if not piece.index is chunk.index:
mutated = True
except AttributeError:
# `piece` might not have an index, could be e.g. an int
pass

if not is_scalar(piece):
# Need to copy data to avoid appending references
try:
piece = piece.copy(deep="all")
except (TypeError, AttributeError):
piece = copy(piece)

results.append(piece)

# If the data was modified inplace we need to
# take the slow path to not risk segfaults
# we have already computed the first piece
if require_slow_apply:
break
finally:
slider.reset()

return results, mutated
piece = piece.copy(deep="all")
except (TypeError, AttributeError):
piece = copy(piece)

results.append(piece)

cdef class BlockSlider:
"""
Only capable of sliding on axis=0
"""

cdef public:
object frame, dummy, index
int nblocks
Slider idx_slider
list blocks

cdef:
char **base_ptrs

def __init__(self, object frame):
cdef:
Py_ssize_t i
object b

self.frame = frame
self.dummy = frame[:0]
self.index = self.dummy.index
# If the data was modified inplace we need to
# take the slow path to not risk segfaults
# we have already computed the first piece
if require_slow_apply:
break

self.blocks = [b.values for b in self.dummy._mgr.blocks]

for x in self.blocks:
util.set_array_not_contiguous(x)

self.nblocks = len(self.blocks)
# See the comment in indexes/base.py about _index_data.
# We need this for EA-backed indexes that have a reference to a 1-d
# ndarray like datetime / timedelta / period.
self.idx_slider = Slider(
self.frame.index._index_data, self.dummy.index._index_data)

self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
for i, block in enumerate(self.blocks):
self.base_ptrs[i] = (<ndarray>block).data

def __dealloc__(self):
free(self.base_ptrs)

cdef move(self, int start, int end):
cdef:
ndarray arr
Py_ssize_t i

# move blocks
for i in range(self.nblocks):
arr = self.blocks[i]

# axis=1 is the frame's axis=0
arr.data = self.base_ptrs[i] + arr.strides[1] * start
arr.shape[1] = end - start

# move and set the index
self.idx_slider.move(start, end)

object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
self.index._engine.clear_mapping()
self.index._cache.clear() # e.g. inferred_freq must go

cdef reset(self):
cdef:
ndarray arr
Py_ssize_t i

# reset blocks
for i in range(self.nblocks):
arr = self.blocks[i]

# axis=1 is the frame's axis=0
arr.data = self.base_ptrs[i]
arr.shape[1] = 0
return results, mutated