From e51169191f5ee17ed6a28c05fe300aab5cb43f84 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Apr 2021 17:46:49 -0700 Subject: [PATCH 1/4] PERF: implement NDArrayBackedBlock --- pandas/_libs/arrays.pxd | 11 +++++++++++ pandas/_libs/arrays.pyx | 6 +++--- pandas/_libs/internals.pyx | 24 ++++++++++++++++++++++++ pandas/core/apply.py | 2 ++ pandas/core/internals/blocks.py | 4 ++-- 5 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 pandas/_libs/arrays.pxd diff --git a/pandas/_libs/arrays.pxd b/pandas/_libs/arrays.pxd new file mode 100644 index 0000000000000..737da29da46a4 --- /dev/null +++ b/pandas/_libs/arrays.pxd @@ -0,0 +1,11 @@ + +from numpy cimport ndarray + + +cdef class NDArrayBacked: + cdef: + readonly ndarray _ndarray + readonly object _dtype + + cpdef NDArrayBacked _from_backing_data(self, ndarray values) + cpdef __setstate__(self, state) diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 1f4a47c4e252a..a2d4cf3000ee1 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -45,9 +45,9 @@ cdef class NDArrayBacked: # TODO: implement take in terms of cnp.PyArray_TakeFrom # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate - cdef: - readonly ndarray _ndarray - readonly object _dtype + # cdef: + # readonly ndarray _ndarray + # readonly object _dtype def __init__(self, ndarray values, object dtype): self._ndarray = values diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index f3bc70ad8a26b..7cc068f735556 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -23,6 +23,7 @@ cnp.import_array() from pandas._libs.algos import ensure_int64 +from pandas._libs.arrays cimport NDArrayBacked from pandas._libs.util cimport is_integer_object @@ -525,6 +526,29 @@ cdef class NumpyBlock(SharedBlock): return type(self)(new_values, self._mgr_locs, ndim=self.ndim) +cdef class NDArrayBackedBlock(SharedBlock): + """ + Block backed by NDArrayBackedExtensionArray + """ + cdef public: + NDArrayBacked values + + def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim): + # set values here the (implicit) call to SharedBlock.__cinit__ will + # set placement and ndim + self.values = values + + # @final # not useful in cython, but we _would_ annotate with @final + cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): + """ + Perform __getitem__-like specialized to slicing along index. + + Assumes self.ndim == 2 + """ + new_values = self.values[..., slicer] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + + cdef class Block(SharedBlock): cdef: public object values diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 86cde647cc798..bd2485f0c0221 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -51,6 +51,7 @@ from pandas.core.construction import ( array as pd_array, create_series_with_explicit_dtype, + ensure_wrapped_if_datetimelike, ) if TYPE_CHECKING: @@ -902,6 +903,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: @property def series_generator(self): values = self.values + values = ensure_wrapped_if_datetimelike(values) assert len(values) > 0 # We create one Series object, and will swap out the data inside diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 61396fdf372d5..18594ed9de854 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -263,7 +263,7 @@ def make_block_same_class( if placement is None: placement = self._mgr_locs - if values.dtype.kind == "m": + if values.dtype.kind in ["m", "M"]: # TODO: remove this once fastparquet has stopped relying on it values = ensure_wrapped_if_datetimelike(values) @@ -1668,7 +1668,7 @@ class NumericBlock(NumpyBlock): is_numeric = True -class NDArrayBackedExtensionBlock(libinternals.Block, EABackedBlock): +class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray """ From 5a265712bd8928ae008e6e4cafcc6a59d5297a6d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Apr 2021 18:47:41 -0700 Subject: [PATCH 2/4] mypy fixup --- pandas/_libs/internals.pyi | 5 +++++ pandas/core/internals/blocks.py | 1 + 2 files changed, 6 insertions(+) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index f3436e9c7afba..4d5e03eb4d733 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -6,6 +6,7 @@ from typing import ( import numpy as np +from pandas._libs.arrays import NDArrayBacked from pandas._typing import ( ArrayLike, T, @@ -67,6 +68,10 @@ class NumpyBlock(SharedBlock): values: np.ndarray def getitem_block_index(self: T, slicer: slice) -> T: ... +class NDArrayBackedBlock(SharedBlock): + values: NDArrayBacked + def getitem_block_index(self: T, slicer: slice) -> T: ... + class Block(SharedBlock): ... diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 18594ed9de854..77ee47a0f9a72 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1674,6 +1674,7 @@ class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock """ values: NDArrayBackedExtensionArray + getitem_block_index = libinternals.NDArrayBackedBlock.getitem_block_index @property def is_view(self) -> bool: From bb590893de2d0804592294f115162e942d45822c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 14:15:21 -0700 Subject: [PATCH 3/4] fix suppressed TypeError --- pandas/core/groupby/ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 26812a07b4be3..3f9854ec95bfc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -48,7 +48,6 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, - is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, @@ -975,7 +974,7 @@ def agg_series(self, obj: Series, func: F) -> tuple[ArrayLike, np.ndarray]: # SeriesGrouper would raise if we were to call _aggregate_series_fast result, counts = self._aggregate_series_pure_python(obj, func) - elif is_extension_array_dtype(obj.dtype): + elif not isinstance(obj._values, np.ndarray): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # In the datetime64tz case it would incorrectly cast to tz-naive From ed9f412ce2ca286eb37bf3bc81bb3649183a365e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 15:06:45 -0700 Subject: [PATCH 4/4] mypy fixup --- pandas/_libs/internals.pyi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 4d5e03eb4d733..74ca311b35ed7 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -6,13 +6,13 @@ from typing import ( import numpy as np -from pandas._libs.arrays import NDArrayBacked from pandas._typing import ( ArrayLike, T, ) from pandas import Index +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.internals.blocks import Block as B def slice_len(slc: slice, objlen: int = ...) -> int: ... @@ -69,7 +69,7 @@ class NumpyBlock(SharedBlock): def getitem_block_index(self: T, slicer: slice) -> T: ... class NDArrayBackedBlock(SharedBlock): - values: NDArrayBacked + values: NDArrayBackedExtensionArray def getitem_block_index(self: T, slicer: slice) -> T: ... class Block(SharedBlock):