Skip to content

Commit 9773ae2

Browse files
authored
PERF: implement NDArrayBackedBlock (#41175)
1 parent fd3e205 commit 9773ae2

File tree

7 files changed

+49
-7
lines changed

7 files changed

+49
-7
lines changed

pandas/_libs/arrays.pxd

+11
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
2+
from numpy cimport ndarray
3+
4+
5+
cdef class NDArrayBacked:
6+
cdef:
7+
readonly ndarray _ndarray
8+
readonly object _dtype
9+
10+
cpdef NDArrayBacked _from_backing_data(self, ndarray values)
11+
cpdef __setstate__(self, state)

pandas/_libs/arrays.pyx

+3-3
Original file line number | Diff line number | Diff line change
@@ -45,9 +45,9 @@ cdef class NDArrayBacked:
4545
# TODO: implement take in terms of cnp.PyArray_TakeFrom
4646
# TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate
4747

48-
cdef:
49-
readonly ndarray _ndarray
50-
readonly object _dtype
48+
# cdef:
49+
# readonly ndarray _ndarray
50+
# readonly object _dtype
5151

5252
def __init__(self, ndarray values, object dtype):
5353
self._ndarray = values

pandas/_libs/internals.pyi

+5
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ from pandas._typing import (
1212
)
1313

1414
from pandas import Index
15+
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
1516
from pandas.core.internals.blocks import Block as B
1617

1718
def slice_len(slc: slice, objlen: int = ...) -> int: ...
@@ -67,6 +68,10 @@ class NumpyBlock(SharedBlock):
6768
values: np.ndarray
6869
def getitem_block_index(self: T, slicer: slice) -> T: ...
6970

71+
class NDArrayBackedBlock(SharedBlock):
72+
values: NDArrayBackedExtensionArray
73+
def getitem_block_index(self: T, slicer: slice) -> T: ...
74+
7075
class Block(SharedBlock):
7176
...
7277

pandas/_libs/internals.pyx

+24
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ cnp.import_array()
2323

2424
from pandas._libs.algos import ensure_int64
2525

26+
from pandas._libs.arrays cimport NDArrayBacked
2627
from pandas._libs.util cimport is_integer_object
2728

2829

@@ -527,6 +528,29 @@ cdef class NumpyBlock(SharedBlock):
527528
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
528529

529530

531+
cdef class NDArrayBackedBlock(SharedBlock):
532+
"""
533+
Block backed by NDArrayBackedExtensionArray
534+
"""
535+
cdef public:
536+
NDArrayBacked values
537+
538+
def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
539+
# set values here; the (implicit) call to SharedBlock.__cinit__ will
540+
# set placement and ndim
541+
self.values = values
542+
543+
# @final # not useful in cython, but we _would_ annotate with @final
544+
cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
545+
"""
546+
Perform a __getitem__-like operation specialized to slicing along the index.
547+
548+
Assumes self.ndim == 2
549+
"""
550+
new_values = self.values[..., slicer]
551+
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)
552+
553+
530554
cdef class Block(SharedBlock):
531555
cdef:
532556
public object values

pandas/core/apply.py

+2
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@
5151
from pandas.core.construction import (
5252
array as pd_array,
5353
create_series_with_explicit_dtype,
54+
ensure_wrapped_if_datetimelike,
5455
)
5556

5657
if TYPE_CHECKING:
@@ -908,6 +909,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
908909
@property
909910
def series_generator(self):
910911
values = self.values
912+
values = ensure_wrapped_if_datetimelike(values)
911913
assert len(values) > 0
912914

913915
# We create one Series object, and will swap out the data inside

pandas/core/groupby/ops.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,6 @@
4949
is_categorical_dtype,
5050
is_complex_dtype,
5151
is_datetime64_any_dtype,
52-
is_extension_array_dtype,
5352
is_integer_dtype,
5453
is_numeric_dtype,
5554
is_sparse,
@@ -978,7 +977,7 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]:
978977
# SeriesGrouper would raise if we were to call _aggregate_series_fast
979978
result, counts = self._aggregate_series_pure_python(obj, func)
980979

981-
elif is_extension_array_dtype(obj.dtype):
980+
elif not isinstance(obj._values, np.ndarray):
982981
# _aggregate_series_fast would raise TypeError when
983982
# calling libreduction.Slider
984983
# In the datetime64tz case it would incorrectly cast to tz-naive

pandas/core/internals/blocks.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -263,7 +263,7 @@ def make_block_same_class(
263263
if placement is None:
264264
placement = self._mgr_locs
265265

266-
if values.dtype.kind == "m":
266+
if values.dtype.kind in ["m", "M"]:
267267
# TODO: remove this once fastparquet has stopped relying on it
268268
values = ensure_wrapped_if_datetimelike(values)
269269

@@ -1663,12 +1663,13 @@ class NumericBlock(NumpyBlock):
16631663
is_numeric = True
16641664

16651665

1666-
class NDArrayBackedExtensionBlock(libinternals.Block, EABackedBlock):
1666+
class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock):
16671667
"""
16681668
Block backed by an NDArrayBackedExtensionArray
16691669
"""
16701670

16711671
values: NDArrayBackedExtensionArray
1672+
getitem_block_index = libinternals.NDArrayBackedBlock.getitem_block_index
16721673

16731674
@property
16741675
def is_view(self) -> bool:

0 commit comments

Comments
 (0)