From 5781f9fd2698c4a0eb44917407ed363ee322fc64 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 May 2023 16:40:57 -0700 Subject: [PATCH 1/4] REF: tighter typing, better names for manager indexing methods --- pandas/_libs/internals.pyi | 8 +++--- pandas/_libs/internals.pyx | 10 ++++---- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/blocks.py | 32 ++++++++++++++++-------- pandas/core/internals/concat.py | 5 +++- pandas/core/internals/managers.py | 4 +-- pandas/core/series.py | 6 ++--- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- 9 files changed, 42 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 9996fbfc346df..ce112413f8a64 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -10,7 +10,7 @@ import numpy as np from pandas._typing import ( ArrayLike, - T, + Self, npt, ) @@ -76,12 +76,12 @@ class SharedBlock: class NumpyBlock(SharedBlock): values: np.ndarray @final - def getitem_block_index(self: T, slicer: slice) -> T: ... + def slice_block_rows(self, slicer: slice) -> Self: ... class NDArrayBackedBlock(SharedBlock): values: NDArrayBackedExtensionArray @final - def getitem_block_index(self: T, slicer: slice) -> T: ... + def slice_block_rows(self, slicer: slice) -> Self: ... class Block(SharedBlock): ... @@ -95,7 +95,7 @@ class BlockManager: def __init__( self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=... ) -> None: ... - def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... + def get_slice(self, slobj: slice, axis: int = ...) -> Self: ... def _rebuild_blknos_and_blklocs(self) -> None: ... class BlockValuesRefs: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index cfc43521cf606..adf4e8c926fa3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -715,7 +715,7 @@ cdef class NumpyBlock(SharedBlock): # set placement, ndim and refs self.values = values - cpdef NumpyBlock getitem_block_index(self, slice slicer): + cpdef NumpyBlock slice_block_rows(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. @@ -743,7 +743,7 @@ cdef class NDArrayBackedBlock(SharedBlock): # set placement, ndim and refs self.values = values - cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): + cpdef NDArrayBackedBlock slice_block_rows(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. @@ -899,7 +899,7 @@ cdef class BlockManager: # ------------------------------------------------------------------- # Indexing - cdef BlockManager _get_index_slice(self, slice slobj): + cdef BlockManager _slice_mgr_rows(self, slice slobj): cdef: SharedBlock blk, nb BlockManager mgr @@ -907,7 +907,7 @@ cdef class BlockManager: nbs = [] for blk in self.blocks: - nb = blk.getitem_block_index(slobj) + nb = blk.slice_block_rows(slobj) nbs.append(nb) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] @@ -926,7 +926,7 @@ cdef class BlockManager: if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: - return self._get_index_slice(slobj) + return self._slice_mgr_rows(slobj) else: raise IndexError("Requested axis not found in manager") diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 15f3630b2d735..f04ac76998adc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1275,7 +1275,7 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager: new_index = self.index._getitem_slice(slobj) return type(self)([new_array], [new_index], verify_integrity=False) - def getitem_mgr(self, indexer) -> SingleArrayManager: + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> SingleArrayManager: new_array = self.array[indexer] new_index = self.index[indexer] return type(self)([new_array], [new_index]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 911cd4ae255ce..077388fc1ace1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -36,6 +36,7 @@ FillnaOptions, IgnoreRaise, QuantileInterpolation, + Self, Shape, npt, ) @@ -259,26 +260,35 @@ def __len__(self) -> int: return len(self.values) @final - def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block: + def slice_block_columns(self, slc: slice) -> Self: + """ + Perform __getitem__-like, return result as block. + """ + new_mgr_locs = self._mgr_locs[slc] + + new_values = self._slice(slc) + refs = self.refs + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) + + @final + def take_block_columns(self, indices: npt.NDArray[np.intp]) -> Self: """ Perform __getitem__-like, return result as block. Only supports slices that preserve dimensionality. """ - # Note: the only place where we are called with ndarray[intp] - # is from internals.concat, and we can verify that never happens - # with 1-column blocks, i.e. never for ExtensionBlock. + # Note: only called from is from internals.concat, and we can verify + # that never happens with 1-column blocks, i.e. never for ExtensionBlock. - new_mgr_locs = self._mgr_locs[slicer] + new_mgr_locs = self._mgr_locs[indices] - new_values = self._slice(slicer) - refs = self.refs if isinstance(slicer, slice) else None - return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) + new_values = self._slice(indices) + return type(self)(new_values, new_mgr_locs, self.ndim, refs=None) @final def getitem_block_columns( self, slicer: slice, new_mgr_locs: BlockPlacement - ) -> Block: + ) -> Self: """ Perform __getitem__-like, return result as block. @@ -1997,7 +2007,7 @@ def _slice( ------- ExtensionArray """ - # Notes: ndarray[bool] is only reachable when via getitem_mgr, which + # Notes: ndarray[bool] is only reachable when via get_rows_with_mask, which # is only for Series, i.e. self.ndim == 1. # return same dims as we currently have @@ -2023,7 +2033,7 @@ def _slice( return self.values[slicer] @final - def getitem_block_index(self, slicer: slice) -> ExtensionBlock: + def slice_block_rows(self, slicer: slice) -> Self: """ Perform __getitem__-like specialized to slicing along index. """ diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index cc46977cee6dc..1d22ed3fe8897 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -328,7 +328,10 @@ def _get_block_for_concat_plan( slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len) # TODO: in all extant test cases 2023-04-08 we have a slice here. # Will this always be the case? - nb = blk.getitem_block(slc) + if isinstance(slc, slice): + nb = blk.slice_block_columns(slc) + else: + nb = blk.take_block_columns(slc) # assert nb.shape == (len(bp), mgr.shape[1]) return nb diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 31827a6bcf6d0..c1fe52518f4cd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1872,7 +1872,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: # We need to do getitem_block here otherwise we would be altering # blk.mgr_locs in place, which would render it invalid. This is only # relevant in the copy=False case. - nb = blk.getitem_block(slice(None)) + nb = blk.slice_block_columns(slice(None)) nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) @@ -2018,7 +2018,7 @@ def _blklocs(self): """compat with BlockManager""" return None - def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager: + def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: # similar to get_slice, but not restricted to slice indexer blk = self._block if ( diff --git a/pandas/core/series.py b/pandas/core/series.py index 540a770b1c897..6106a1680fc7d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -998,7 +998,7 @@ def __getitem__(self, key): if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) key = np.asarray(key, dtype=bool) - return self._get_values(key) + return self._get_rows_with_mask(key) return self._get_with(key) @@ -1054,8 +1054,8 @@ def _get_values_tuple(self, key: tuple): new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] return new_ser.__finalize__(self) - def _get_values(self, indexer: slice | npt.NDArray[np.bool_]) -> Series: - new_mgr = self._mgr.getitem_mgr(indexer) + def _get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Series: + new_mgr = self._mgr.get_rows_with_mask(indexer) return self._constructor(new_mgr, fastpath=True).__finalize__(self) def _get_value(self, label, takeable: bool = False): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index cf51d9d693155..890dc0a1a43a3 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -285,7 +285,7 @@ def test_getitem_slice(self, data): assert isinstance(result, type(data)) def test_getitem_ellipsis_and_slice(self, data): - # GH#40353 this is called from getitem_block_index + # GH#40353 this is called from slice_block_rows result = data[..., :] self.assert_extension_array_equal(result, data) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 79eb4110cfba9..727ff5aca43ae 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -943,7 +943,7 @@ def assert_slice_ok(mgr, axis, slobj): if isinstance(slobj, slice): sliced = mgr.get_slice(slobj, axis=axis) elif mgr.ndim == 1 and axis == 0: - sliced = mgr.getitem_mgr(slobj) + sliced = mgr.get_rows_with_mask(slobj) else: # BlockManager doesn't support non-slice, SingleBlockManager # doesn't support axis > 0 From 07291388998cd7ca2f7432663a7f4c32a8943433 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 May 2023 18:32:14 -0700 Subject: [PATCH 2/4] REF: dont need _slice in SingleManager calls --- pandas/core/internals/blocks.py | 4 ---- pandas/core/internals/managers.py | 16 +++------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 077388fc1ace1..a08a44a11866a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -295,10 +295,6 @@ def getitem_block_columns( Only supports slices that preserve dimensionality. """ new_values = self._slice(slicer) - - if new_values.ndim != self.values.ndim: - raise ValueError("Only same dim slicing is allowed") - return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) @final diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c1fe52518f4cd..8e60bf02fed75 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -54,7 +54,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -2021,18 +2020,9 @@ def _blklocs(self): def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: # similar to get_slice, but not restricted to slice indexer blk = self._block - if ( - using_copy_on_write() - and isinstance(indexer, np.ndarray) - and len(indexer) > 0 - and com.is_bool_indexer(indexer) - and indexer.all() - ): + if using_copy_on_write() and len(indexer) > 0 and indexer.all(): return type(self)(blk.copy(deep=False), self.index) - array = blk._slice(indexer) - if array.ndim > 1: - # This will be caught by Series._get_values - raise ValueError("dimension-expanding indexing not allowed") + array = blk.values[indexer] bp = BlockPlacement(slice(0, len(array))) # TODO(CoW) in theory only need to track reference if new_array is a view @@ -2048,7 +2038,7 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: raise IndexError("Requested axis not found in manager") blk = self._block - array = blk._slice(slobj) + array = blk.values[slobj] bp = BlockPlacement(slice(0, len(array))) # TODO this method is only used in groupby SeriesSplitter at the moment, # so passing refs is not yet covered by the tests From 24210276a72f2b397d224a8df930701873a1f95d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 May 2023 10:36:46 -0700 Subject: [PATCH 3/4] fix CoW test --- pandas/tests/internals/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 727ff5aca43ae..390efe9e06c70 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -980,7 +980,7 @@ def assert_slice_ok(mgr, axis, slobj): # fancy indexer assert_slice_ok(mgr, ax, []) - assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) + assert_slice_ok(mgr, ax, np.array(list(range(mgr.shape[ax])))) if mgr.shape[ax] >= 3: assert_slice_ok(mgr, ax, [0, 1, 2]) From b4c37f80503299c7cb84ff9a0ceaf12b9fad7da1 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 May 2023 13:52:37 -0700 Subject: [PATCH 4/4] fix test --- pandas/tests/internals/test_internals.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 390efe9e06c70..47e7092743b00 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -942,12 +942,17 @@ def assert_slice_ok(mgr, axis, slobj): if isinstance(slobj, slice): sliced = mgr.get_slice(slobj, axis=axis) - elif mgr.ndim == 1 and axis == 0: + elif ( + mgr.ndim == 1 + and axis == 0 + and isinstance(slobj, np.ndarray) + and slobj.dtype == bool + ): sliced = mgr.get_rows_with_mask(slobj) else: # BlockManager doesn't support non-slice, SingleBlockManager # doesn't support axis > 0 - return + raise TypeError(slobj) mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( @@ -978,14 +983,6 @@ def assert_slice_ok(mgr, axis, slobj): mgr, ax, np.array([True, True, False], dtype=np.bool_) ) - # fancy indexer - assert_slice_ok(mgr, ax, []) - assert_slice_ok(mgr, ax, np.array(list(range(mgr.shape[ax])))) - - if mgr.shape[ax] >= 3: - assert_slice_ok(mgr, ax, [0, 1, 2]) - assert_slice_ok(mgr, ax, [-1, -2, -3]) - @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): def assert_take_ok(mgr, axis, indexer):