Skip to content

PERF: implement NDArrayBackedBlock #41175

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pandas/_libs/arrays.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

from numpy cimport ndarray


# Cython declaration (.pxd) of NDArrayBacked, so that other Cython modules
# can `cimport` the class and get typed access to its attributes and methods.
cdef class NDArrayBacked:
cdef:
# `readonly`: readable from Python code, but not assignable from Python.
readonly ndarray _ndarray
readonly object _dtype

# Signatures only; the bodies are presumably in the matching arrays.pyx.
cpdef NDArrayBacked _from_backing_data(self, ndarray values)
cpdef __setstate__(self, state)
6 changes: 3 additions & 3 deletions pandas/_libs/arrays.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ cdef class NDArrayBacked:
# TODO: implement take in terms of cnp.PyArray_TakeFrom
# TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate

cdef:
readonly ndarray _ndarray
readonly object _dtype
# cdef:
# readonly ndarray _ndarray
# readonly object _dtype

def __init__(self, ndarray values, object dtype):
self._ndarray = values
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/internals.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ from pandas._typing import (
)

from pandas import Index
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.internals.blocks import Block as B

# Type stub only: `objlen` has a default value (elided here as `...`).
def slice_len(slc: slice, objlen: int = ...) -> int: ...
Expand Down Expand Up @@ -67,6 +68,10 @@ class NumpyBlock(SharedBlock):
values: np.ndarray
def getitem_block_index(self: T, slicer: slice) -> T: ...

# Type stub for the Cython class NDArrayBackedBlock (see internals.pyx).
class NDArrayBackedBlock(SharedBlock):
# The block's data is typed as an NDArrayBackedExtensionArray.
values: NDArrayBackedExtensionArray
# Takes a slice and is annotated to return the same type as `self`.
def getitem_block_index(self: T, slicer: slice) -> T: ...

class Block(SharedBlock):
...

Expand Down
24 changes: 24 additions & 0 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ cnp.import_array()

from pandas._libs.algos import ensure_int64

from pandas._libs.arrays cimport NDArrayBacked
from pandas._libs.util cimport is_integer_object


Expand Down Expand Up @@ -527,6 +528,29 @@ cdef class NumpyBlock(SharedBlock):
return type(self)(new_values, self._mgr_locs, ndim=self.ndim)


cdef class NDArrayBackedBlock(SharedBlock):
    """
    Block whose ``values`` attribute is an NDArrayBacked instance
    (i.e. an NDArrayBackedExtensionArray).
    """
    cdef public:
        NDArrayBacked values

    def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
        # Only `values` is stored here; the (implicit) call to
        # SharedBlock.__cinit__ takes care of setting placement and ndim.
        self.values = values

    # @final  # not useful in cython, but we _would_ annotate with @final
    cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
        """
        Slice this block along the index, returning a new block of the same type.

        The slice is applied to the last axis of ``values``; the new block
        reuses this block's ``_mgr_locs`` and ``ndim``.

        Assumes self.ndim == 2
        """
        return type(self)(self.values[..., slicer], self._mgr_locs, ndim=self.ndim)


cdef class Block(SharedBlock):
cdef:
public object values
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from pandas.core.construction import (
array as pd_array,
create_series_with_explicit_dtype,
ensure_wrapped_if_datetimelike,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -908,6 +909,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
@property
def series_generator(self):
values = self.values
values = ensure_wrapped_if_datetimelike(values)
assert len(values) > 0

# We create one Series object, and will swap out the data inside
Expand Down
3 changes: 1 addition & 2 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
is_categorical_dtype,
is_complex_dtype,
is_datetime64_any_dtype,
is_extension_array_dtype,
is_integer_dtype,
is_numeric_dtype,
is_sparse,
Expand Down Expand Up @@ -978,7 +977,7 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]:
# SeriesGrouper would raise if we were to call _aggregate_series_fast
result, counts = self._aggregate_series_pure_python(obj, func)

elif is_extension_array_dtype(obj.dtype):
elif not isinstance(obj._values, np.ndarray):
# _aggregate_series_fast would raise TypeError when
# calling libreduction.Slider
# In the datetime64tz case it would incorrectly cast to tz-naive
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def make_block_same_class(
if placement is None:
placement = self._mgr_locs

if values.dtype.kind == "m":
if values.dtype.kind in ["m", "M"]:
# TODO: remove this once fastparquet has stopped relying on it
values = ensure_wrapped_if_datetimelike(values)

Expand Down Expand Up @@ -1663,12 +1663,13 @@ class NumericBlock(NumpyBlock):
is_numeric = True


class NDArrayBackedExtensionBlock(libinternals.Block, EABackedBlock):
class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock):
"""
Block backed by an NDArrayBackedExtensionArray
"""

values: NDArrayBackedExtensionArray
getitem_block_index = libinternals.NDArrayBackedBlock.getitem_block_index

@property
def is_view(self) -> bool:
Expand Down