From 60a8649cd9e3c442b7922476f9de5990f9da617c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 16:08:36 -0800 Subject: [PATCH 1/3] CLN: BlockManager.get_slice require only slice arg --- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/managers.py | 32 +++++++++++++++++------- pandas/core/series.py | 3 ++- pandas/tests/internals/test_internals.py | 11 +++++++- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 597023cb5b000..1d65c45910875 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -362,7 +362,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: """ Perform __getitem__-like, return result as block. - As of now, only supports slices that preserve dimensionality. + Only supports slices that preserve dimensionality. """ if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 744d3453c8a96..f271866dcf0e3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -788,6 +788,7 @@ def _combine( return type(self).from_blocks(new_blocks, axes) def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + assert isinstance(slobj, slice), type(slobj) if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) @@ -1188,7 +1189,9 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): + def insert( + self, loc: int, item: Hashable, value: ArrayLike, allow_duplicates: bool = False + ): """ Insert item at selected position. @@ -1196,7 +1199,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False ---------- loc : int item : hashable - value : array_like + value : np.ndarray or ExtensionArray allow_duplicates: bool If False, trying to insert non-unique item will raise @@ -1213,11 +1216,9 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): - # TODO(EA2D): special case not needed with 2D EAs - value = ensure_block_shape(value, ndim=2) + else: + value = ensure_block_shape(value, ndim=self.ndim) - # TODO: type value as ArrayLike block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): @@ -1322,7 +1323,7 @@ def reindex_indexer( def _slice_take_blocks_ax0( self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False - ): + ) -> List[Block]: """ Slice/take blocks along axis=0. @@ -1354,6 +1355,7 @@ def _slice_take_blocks_ax0( # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: return [] + # TODO: tests all have isinstance(slobj, slice), other possibilities? return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: @@ -1363,9 +1365,11 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ - blk.getitem_block([ml], new_mgr_locs=i) + blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) for i, ml in enumerate(slobj) ] + # We have + # all(np.shares_memory(nb.values, blk.values) for nb in blocks) return blocks else: return [ @@ -1427,7 +1431,8 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies for i, ml in zip(taker, mgr_locs): - nb = blk.getitem_block([i], new_mgr_locs=ml) + nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml) + # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) @@ -1591,7 +1596,15 @@ def _blklocs(self): """ compat with BlockManager """ return None + def getitem_mgr(self, indexer) -> SingleBlockManager: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + array = blk._slice(indexer) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) + return type(self)(block, self.index[indexer]) + def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: + assert isinstance(slobj, slice), type(slobj) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1962,6 +1975,7 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool ): return "mask", slice_or_indexer, slice_or_indexer.sum() else: + # TODO: np.intp? indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) if not allow_fill: indexer = maybe_convert_indices(indexer, length) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a5d1c44b312c..f1cc38d8c96a7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -983,7 +983,8 @@ def _get_values_tuple(self, key): def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) + new_mgr = self._mgr.getitem_mgr(indexer) + return self._constructor(new_mgr).__finalize__(self) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 683006d9b3b9c..a7f318498a8ac 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -823,7 +823,16 @@ def assert_slice_ok(mgr, axis, slobj): slobj = np.concatenate( [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] ) - sliced = mgr.get_slice(slobj, axis=axis) + + if isinstance(slobj, slice): + sliced = mgr.get_slice(slobj, axis=axis) + elif mgr.ndim == 1 and axis == 0: + sliced = mgr.getitem_mgr(slobj) + else: + # BlockManager doesnt support non-slice, SingleBlockManager + # doesnt support axis > 0 + return + mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( mat[mat_slobj], sliced.as_array(), check_dtype=False From b17ad656657859b89cc3e051dd7636976b6e53c0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 17:48:38 -0800 Subject: [PATCH 2/3] mypy fixup --- pandas/core/internals/array_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 905fa448ff033..67f4cb4511389 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -779,6 +779,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: return type(self)(arrays, new_axes, verify_integrity=False) + getitem_mgr = get_slice + def fast_xs(self, loc: int) -> ArrayLike: """ Return the array corresponding to `frame.iloc[loc]`. From 5afce0407028adc012c2f2d168f26748c587e783 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 6 Mar 2021 11:57:50 -0800 Subject: [PATCH 3/3] PERF: implement Index._getitem_slice --- pandas/core/indexes/base.py | 7 +++++++ pandas/core/indexes/multi.py | 18 ++++++++++++++++++ pandas/core/indexes/range.py | 7 +++++++ pandas/core/internals/managers.py | 5 +++-- 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 09c143468bc31..a81a085dc6ae6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4560,6 +4560,13 @@ def __getitem__(self, key): else: return result + def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._data[slobj] + return type(self)._simple_new(res, name=self._name) + @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd0e0ef5fa799..990cc0907f46a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2097,6 +2097,24 @@ def __getitem__(self, key): verify_integrity=False, ) + def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + sortorder = None + if slobj.step is None or slobj.step > 0: + sortorder = self.sortorder + + new_codes = [level_codes[slobj] for level_codes in self.codes] + + return type(self)( + levels=self.levels, + codes=new_codes, + names=self._names, + sortorder=sortorder, + verify_integrity=False, + ) + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take( self: MultiIndex, indices, axis=0, allow_fill=True, fill_value=None, **kwargs diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index bf5a9825f04d0..c12fbb21c313b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -816,6 +816,13 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + def _getitem_slice(self: RangeIndex, slobj: slice) -> RangeIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._range[slobj] + return type(self)._simple_new(res, name=self._name) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cc8730c31f65e..2daa1ce8dc9a4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -813,7 +813,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) return type(self)._simple_new(tuple(new_blocks), new_axes) @@ -1624,7 +1624,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) block = blk.make_block_same_class(array, placement=slice(0, len(array))) - return type(self)(block, self.index[slobj]) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) @property def index(self) -> Index: