From 60a8649cd9e3c442b7922476f9de5990f9da617c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 16:08:36 -0800 Subject: [PATCH 01/13] CLN: BlockManager.get_slice require only slice arg --- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/managers.py | 32 +++++++++++++++++------- pandas/core/series.py | 3 ++- pandas/tests/internals/test_internals.py | 11 +++++++- 4 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 597023cb5b000..1d65c45910875 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -362,7 +362,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: """ Perform __getitem__-like, return result as block. - As of now, only supports slices that preserve dimensionality. + Only supports slices that preserve dimensionality. """ if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 744d3453c8a96..f271866dcf0e3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -788,6 +788,7 @@ def _combine( return type(self).from_blocks(new_blocks, axes) def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + assert isinstance(slobj, slice), type(slobj) if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) @@ -1188,7 +1189,9 @@ def value_getitem(placement): # Newly created block's dtype may already be present. self._known_consolidated = False - def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): + def insert( + self, loc: int, item: Hashable, value: ArrayLike, allow_duplicates: bool = False + ): """ Insert item at selected position. @@ -1196,7 +1199,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False ---------- loc : int item : hashable - value : array_like + value : np.ndarray or ExtensionArray allow_duplicates: bool If False, trying to insert non-unique item will raise @@ -1213,11 +1216,9 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): - # TODO(EA2D): special case not needed with 2D EAs - value = ensure_block_shape(value, ndim=2) + else: + value = ensure_block_shape(value, ndim=self.ndim) - # TODO: type value as ArrayLike block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self.blknos[loc:]): @@ -1322,7 +1323,7 @@ def reindex_indexer( def _slice_take_blocks_ax0( self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False - ): + ) -> List[Block]: """ Slice/take blocks along axis=0. @@ -1354,6 +1355,7 @@ def _slice_take_blocks_ax0( # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: return [] + # TODO: tests all have isinstance(slobj, slice), other possibilities? return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: @@ -1363,9 +1365,11 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ - blk.getitem_block([ml], new_mgr_locs=i) + blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) for i, ml in enumerate(slobj) ] + # We have + # all(np.shares_memory(nb.values, blk.values) for nb in blocks) return blocks else: return [ @@ -1427,7 +1431,8 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies for i, ml in zip(taker, mgr_locs): - nb = blk.getitem_block([i], new_mgr_locs=ml) + nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml) + # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) @@ -1591,7 +1596,15 @@ def _blklocs(self): """ compat with BlockManager """ return None + def getitem_mgr(self, indexer) -> SingleBlockManager: + # similar to get_slice, but not restricted to slice indexer + blk = self._block + array = blk._slice(indexer) + block = blk.make_block_same_class(array, placement=slice(0, len(array))) + return type(self)(block, self.index[indexer]) + def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: + assert isinstance(slobj, slice), type(slobj) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1962,6 +1975,7 @@ def _preprocess_slice_or_indexer(slice_or_indexer, length: int, allow_fill: bool ): return "mask", slice_or_indexer, slice_or_indexer.sum() else: + # TODO: np.intp? indexer = np.asanyarray(slice_or_indexer, dtype=np.int64) if not allow_fill: indexer = maybe_convert_indices(indexer, length) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a5d1c44b312c..f1cc38d8c96a7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -983,7 +983,8 @@ def _get_values_tuple(self, key): def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) + new_mgr = self._mgr.getitem_mgr(indexer) + return self._constructor(new_mgr).__finalize__(self) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 683006d9b3b9c..a7f318498a8ac 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -823,7 +823,16 @@ def assert_slice_ok(mgr, axis, slobj): slobj = np.concatenate( [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] ) - sliced = mgr.get_slice(slobj, axis=axis) + + if isinstance(slobj, slice): + sliced = mgr.get_slice(slobj, axis=axis) + elif mgr.ndim == 1 and axis == 0: + sliced = mgr.getitem_mgr(slobj) + else: + # BlockManager doesnt support non-slice, SingleBlockManager + # doesnt support axis > 0 + return + mat_slobj = (slice(None),) * axis + (slobj,) tm.assert_numpy_array_equal( mat[mat_slobj], sliced.as_array(), check_dtype=False From b17ad656657859b89cc3e051dd7636976b6e53c0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 17:48:38 -0800 Subject: [PATCH 02/13] mypy fixup --- pandas/core/internals/array_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 905fa448ff033..67f4cb4511389 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -779,6 +779,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: return type(self)(arrays, new_axes, verify_integrity=False) + getitem_mgr = get_slice + def fast_xs(self, loc: int) -> ArrayLike: """ Return the array corresponding to `frame.iloc[loc]`. From 5afce0407028adc012c2f2d168f26748c587e783 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 6 Mar 2021 11:57:50 -0800 Subject: [PATCH 03/13] PERF: implement Index._getitem_slice --- pandas/core/indexes/base.py | 7 +++++++ pandas/core/indexes/multi.py | 18 ++++++++++++++++++ pandas/core/indexes/range.py | 7 +++++++ pandas/core/internals/managers.py | 5 +++-- 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 09c143468bc31..a81a085dc6ae6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4560,6 +4560,13 @@ def __getitem__(self, key): else: return result + def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._data[slobj] + return type(self)._simple_new(res, name=self._name) + @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd0e0ef5fa799..990cc0907f46a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2097,6 +2097,24 @@ def __getitem__(self, key): verify_integrity=False, ) + def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + sortorder = None + if slobj.step is None or slobj.step > 0: + sortorder = self.sortorder + + new_codes = [level_codes[slobj] for level_codes in self.codes] + + return type(self)( + levels=self.levels, + codes=new_codes, + names=self._names, + sortorder=sortorder, + verify_integrity=False, + ) + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take( self: MultiIndex, indices, axis=0, allow_fill=True, fill_value=None, **kwargs diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index bf5a9825f04d0..c12fbb21c313b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -816,6 +816,13 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + def _getitem_slice(self: RangeIndex, slobj: slice) -> RangeIndex: + """ + Fastpath for __getitem__ when we know we have a slice. + """ + res = self._range[slobj] + return type(self)._simple_new(res, name=self._name) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cc8730c31f65e..2daa1ce8dc9a4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -813,7 +813,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] + new_axes[axis] = new_axes[axis]._getitem_slice(slobj) return type(self)._simple_new(tuple(new_blocks), new_axes) @@ -1624,7 +1624,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) block = blk.make_block_same_class(array, placement=slice(0, len(array))) - return type(self)(block, self.index[slobj]) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) @property def index(self) -> Index: From 36a8530baac875de3c60fffa1a7b0125d8109bfe Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Mar 2021 18:37:17 -0800 Subject: [PATCH 04/13] PERF: implement getitem_block_columns --- pandas/core/groupby/ops.py | 40 +++++++++++++++++------- pandas/core/internals/blocks.py | 25 ++++++++++----- pandas/core/internals/managers.py | 17 +++++++--- pandas/tests/internals/test_internals.py | 33 ++++++++++--------- 4 files changed, 79 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a61e8872a7ce7..79021309b3f10 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -997,19 +997,27 @@ def __iter__(self): starts, ends = lib.generate_slices(self.slabels, self.ngroups) - for i, (start, end) in enumerate(zip(starts, ends)): - yield i, self._chop(sdata, slice(start, end)) + if self.axis == 0: + for i, (start, end) in enumerate(zip(starts, ends)): + yield i, self._chop_index(sdata, slice(start, end)) + + else: + for i, (start, end) in enumerate(zip(starts, ends)): + yield i, self._chop_columns(sdata, slice(start, end)) @cache_readonly def sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj: slice) -> NDFrame: + def _chop_columns(self, sdata, slice_obj: slice) -> NDFrame: + raise AbstractMethodError(self) + + def _chop_index(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop(self, sdata: Series, slice_obj: slice) -> Series: + def _chop_index(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) # __finalize__ not called here, must be applied by caller if applicable @@ -1028,13 +1036,23 @@ def fast_apply(self, f: F, sdata: FrameOrSeries, names): starts, ends = lib.generate_slices(self.slabels, self.ngroups) return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: - # Fastpath equivalent to: - # if self.axis == 0: - # return sdata.iloc[slice_obj] - # else: - # return sdata.iloc[:, slice_obj] - mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) + def _chop_index(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + """ + Fastpath equivalent to `sdata.iloc[slice_obj]` + """ + mgr = sdata._mgr.get_slice(slice_obj, axis=1) + # __finalize__ not called here, must be applied by caller if applicable + + # fastpath equivalent to `return sdata._constructor(mgr)` + obj = type(sdata)._from_mgr(mgr) + object.__setattr__(obj, "_flags", sdata._flags) + return obj + + def _chop_columns(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + """ + Fastpath equivalent to `sdata.iloc[:, slice_obj]` + """ + mgr = sdata._mgr.get_slice(slice_obj, axis=0) # __finalize__ not called here, must be applied by caller if applicable # fastpath equivalent to `return sdata._constructor(mgr)` diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d4f3b3aff8168..0c4962b46f81b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -358,21 +358,32 @@ def _slice(self, slicer): return self.values[slicer] @final - def getitem_block(self, slicer, new_mgr_locs=None) -> Block: + def getitem_block(self, slicer) -> Block: """ Perform __getitem__-like, return result as block. Only supports slices that preserve dimensionality. """ - if new_mgr_locs is None: - axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer - new_mgr_locs = self.mgr_locs[axis0_slicer] - elif not isinstance(new_mgr_locs, BlockPlacement): - new_mgr_locs = BlockPlacement(new_mgr_locs) + axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if new_values.ndim != self.values.ndim: + raise ValueError("Only same dim slicing is allowed") + + return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + @final + def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: + """ + Perform __getitem__-like, return result as block. + + Only supports slices that preserve dimensionality. + """ new_values = self._slice(slicer) - if self._validate_ndim and new_values.ndim != self.ndim: + if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2daa1ce8dc9a4..a32c05f66cc61 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -23,6 +23,7 @@ internals as libinternals, lib, ) +from pandas._libs.internals import BlockPlacement from pandas._typing import ( ArrayLike, Dtype, @@ -1369,7 +1370,11 @@ def _slice_take_blocks_ax0( if sllen == 0: return [] # TODO: tests all have isinstance(slobj, slice), other possibilities? - return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] + return [ + blk.getitem_block_columns( + slobj, new_mgr_locs=BlockPlacement(slice(0, sllen)) + ) + ] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -1378,7 +1383,9 @@ def _slice_take_blocks_ax0( # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ - blk.getitem_block(slice(ml, ml + 1), new_mgr_locs=i) + blk.getitem_block_columns( + slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i) + ) for i, ml in enumerate(slobj) ] # We have @@ -1438,13 +1445,15 @@ def _slice_take_blocks_ax0( taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): - nb = blk.getitem_block(taker, new_mgr_locs=mgr_locs) + nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies for i, ml in zip(taker, mgr_locs): - nb = blk.getitem_block(slice(i, i + 1), new_mgr_locs=ml) + slc = slice(i, i + 1) + bp = BlockPlacement(ml) + nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) else: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a7f318498a8ac..f55ccfb6b904e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -848,22 +848,27 @@ def assert_slice_ok(mgr, axis, slobj): assert_slice_ok(mgr, ax, slice(1, 4)) assert_slice_ok(mgr, ax, slice(3, 0, -2)) - # boolean mask - assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) - assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) - assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) - - if mgr.shape[ax] >= 3: - assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) - assert_slice_ok(mgr, ax, np.array([True, True, False], dtype=np.bool_)) + if mgr.ndim < 2: + # 2D only support slice objects + + # boolean mask + assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) + + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) + assert_slice_ok( + mgr, ax, np.array([True, True, False], dtype=np.bool_) + ) - # fancy indexer - assert_slice_ok(mgr, ax, []) - assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) + # fancy indexer + assert_slice_ok(mgr, ax, []) + assert_slice_ok(mgr, ax, list(range(mgr.shape[ax]))) - if mgr.shape[ax] >= 3: - assert_slice_ok(mgr, ax, [0, 1, 2]) - assert_slice_ok(mgr, ax, [-1, -2, -3]) + if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, [0, 1, 2]) + assert_slice_ok(mgr, ax, [-1, -2, -3]) @pytest.mark.parametrize("mgr", MANAGERS) def test_take(self, mgr): From 14ee96511281aa1be92db2523a49805c9a7dd04f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 08:43:21 -0800 Subject: [PATCH 05/13] PERF: repeated slicing along index in groupby --- pandas/core/arrays/string_arrow.py | 9 +++++++++ pandas/core/groupby/ops.py | 2 +- pandas/core/internals/base.py | 6 ++++++ pandas/core/internals/blocks.py | 10 ++++++++++ pandas/core/internals/managers.py | 9 +++++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e003efeabcb66..290c65e4c99bf 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -349,6 +349,15 @@ def __getitem__(self, item: Any) -> Any: "Only integers, slices and integer or " "boolean arrays are valid indices." ) + elif isinstance(item, tuple): + # possibly unpack arr[:, n] to arr[n] + if len(item) == 1: + item = item[0] + elif len(item) == 2: + if item[0] is Ellipsis: + item = item[1] + elif item[1] is Ellipsis: + item = item[0] # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 348f128ac4e1e..8150476a98296 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1040,7 +1040,7 @@ def _chop_index(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: """ Fastpath equivalent to `sdata.iloc[slice_obj]` """ - mgr = sdata._mgr.get_slice(slice_obj, axis=1) + mgr = sdata._mgr.get_slice_index(slice_obj) # __finalize__ not called here, must be applied by caller if applicable # fastpath equivalent to `return sdata._constructor(mgr)` diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 0e4b5ce2e7452..7deace0429962 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -99,6 +99,12 @@ def equals(self, other: object) -> bool: return self._equal_values(other) + def get_slice_index(self: T, slobj: slice) -> T: + """ + get_slice specialized to axis=self.ndim-1 + """ + return self.get_slice(slobj, axis=self.ndim - 1) + class SingleDataManager(DataManager): ndim = 1 diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ef21b399a3efe..80ff8732fbbbc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -380,6 +380,16 @@ def getitem_block(self, slicer) -> Block: return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + @final + def getitem_block_index(self, slicer: slice) -> Block: + """ + Perform __getitem__-like specialized to slicing along index. + + Assumes self.ndim == 2 + """ + new_values = self.values[..., slicer] + return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim) + @final def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3b00083bc5062..58a482a707fad 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -818,6 +818,15 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: return type(self)._simple_new(tuple(new_blocks), new_axes) + def get_slice_index(self, slobj: slice) -> BlockManager: + # get_slice specialized to axis = 1 and ndim == 2 + new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] + + axes = self.axes + new_axes = [axes[0], axes[1]._getitem_slice(slobj)] + + return type(self)._simple_new(tuple(new_blocks), new_axes) + @property def nblocks(self) -> int: return len(self.blocks) From f6655ad857e670754913028c90832c9ecaf47fd1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 09:46:36 -0800 Subject: [PATCH 06/13] mypy fixup --- pandas/core/internals/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 7deace0429962..14c41d47891cc 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -99,6 +99,9 @@ def equals(self, other: object) -> bool: return self._equal_values(other) + def get_slice(self: T, slobj: slice, axis: int = 0) -> T: + raise AbstractMethodError(self) + def get_slice_index(self: T, slobj: slice) -> T: """ get_slice specialized to axis=self.ndim-1 From 8ff516733b6d049c4201d9cd2cc8c0f30419607d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 15:34:59 -0800 Subject: [PATCH 07/13] mypy fixup --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 99838602eeb63..803d6d1dd00b2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -283,7 +283,7 @@ def _from_factorized(cls, values, original): # ------------------------------------------------------------------------ def __getitem__( - self, item: Union[int, slice, np.ndarray] + self, item: Union[int, slice, np.ndarray, Ellipsis] ) -> Union[ExtensionArray, Any]: """ Select a subset of self. From 2ade2bbdd4814179f84efba33dc1b53ad8f9fa26 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 18:03:55 -0800 Subject: [PATCH 08/13] revert --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 803d6d1dd00b2..99838602eeb63 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -283,7 +283,7 @@ def _from_factorized(cls, values, original): # ------------------------------------------------------------------------ def __getitem__( - self, item: Union[int, slice, np.ndarray, Ellipsis] + self, item: Union[int, slice, np.ndarray] ) -> Union[ExtensionArray, Any]: """ Select a subset of self. From c594fd09d2a435cef993de931c378bb264d8d5f6 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 18:05:39 -0800 Subject: [PATCH 09/13] comment typo fixup --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0f86ec69538d2..57a57cbd80d8c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -355,7 +355,7 @@ def __getitem__(self, item: Any) -> Any: "boolean arrays are valid indices." ) elif isinstance(item, tuple): - # possibly unpack arr[:, n] to arr[n] + # possibly unpack arr[..., n] to arr[n] if len(item) == 1: item = item[0] elif len(item) == 2: From e9d0a92b285b60ea02e0551a9edce6c5801ea73b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Mar 2021 19:24:08 -0800 Subject: [PATCH 10/13] type:ignore --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 916b64df21a0f..ca9cd54f0fcd2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -345,7 +345,9 @@ def getitem_block_index(self, slicer: slice) -> Block: Assumes self.ndim == 2 """ - new_values = self.values[..., slicer] + # error: Invalid index type "Tuple[ellipsis, slice]" for + # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" + new_values = self.values[..., slicer] # type: ignore[index] return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim) @final From 14497dfe0a36645a1a68baee68f93f4df32ad5b8 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 11:26:35 -0800 Subject: [PATCH 11/13] TST: EA[..., slc] --- pandas/core/arrays/sparse/array.py | 2 ++ pandas/tests/extension/base/getitem.py | 11 +++++++++++ pandas/tests/extension/json/array.py | 7 +++++++ 3 files changed, 20 insertions(+) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c798870e4126a..f4c2894c05e41 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -817,6 +817,8 @@ def value_counts(self, dropna: bool = True): def __getitem__(self, key): if isinstance(key, tuple): + if len(key) and key[0] is Ellipsis: + key = key[1:] if len(key) > 1: raise IndexError("too many indices for array.") key = key[0] diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index a7b99c2e09e88..27feb86070f5e 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -245,6 +245,17 @@ def test_getitem_slice(self, data): result = data[slice(1)] # scalar assert isinstance(result, type(data)) + def test_getitem_ellipsis_and_slice(self, data): + # GH#40353 this is called from getitem_block_index + result = data[..., :] + self.assert_extension_array_equal(result, data) + + result = data[..., :3] + self.assert_extension_array_equal(result, data[:3]) + + result = data[..., ::2] + self.assert_extension_array_equal(result, data[::2]) + def test_get(self, data): # GH 20882 s = pd.Series(data, index=[2 * i for i in range(len(data))]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ca593da6d97bc..370770b7e5e19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -83,6 +83,13 @@ def _from_factorized(cls, values, original): return cls([UserDict(x) for x in values if x != ()]) def __getitem__(self, item): + if isinstance(item, tuple): + if len(item) and item[0] is Ellipsis: + item = item[1:] + if len(item) > 1: + raise IndexError("too many indices for array.") + item = item[0] + if isinstance(item, numbers.Integral): return self.data[item] elif isinstance(item, slice) and item == slice(None): From 23cf40d26bfe9b36fcf58b536ca099b5a8bffa29 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 16:57:09 -0700 Subject: [PATCH 12/13] recert get_slice_index --- pandas/core/groupby/ops.py | 40 +++++++++---------------------- pandas/core/internals/base.py | 9 ------- pandas/core/internals/managers.py | 12 +--------- 3 files changed, 12 insertions(+), 49 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ffb21592c6db3..e505359987eb3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1005,27 +1005,19 @@ def __iter__(self): starts, ends = lib.generate_slices(self.slabels, self.ngroups) - if self.axis == 0: - for i, (start, end) in enumerate(zip(starts, ends)): - yield i, self._chop_index(sdata, slice(start, end)) - - else: - for i, (start, end) in enumerate(zip(starts, ends)): - yield i, self._chop_columns(sdata, slice(start, end)) + for i, (start, end) in enumerate(zip(starts, ends)): + yield i, self._chop(sdata, slice(start, end)) @cache_readonly def sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop_columns(self, sdata, slice_obj: slice) -> NDFrame: - raise AbstractMethodError(self) - - def _chop_index(self, sdata, slice_obj: slice) -> NDFrame: + def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop_index(self, sdata: Series, slice_obj: slice) -> Series: + def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) # __finalize__ not called here, must be applied by caller if applicable @@ -1044,23 +1036,13 @@ def fast_apply(self, f: F, sdata: FrameOrSeries, names): starts, ends = lib.generate_slices(self.slabels, self.ngroups) return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop_index(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: - """ - Fastpath equivalent to `sdata.iloc[slice_obj]` - """ - mgr = sdata._mgr.get_slice_index(slice_obj) - # __finalize__ not called here, must be applied by caller if applicable - - # fastpath equivalent to `return sdata._constructor(mgr)` - obj = type(sdata)._from_mgr(mgr) - object.__setattr__(obj, "_flags", sdata._flags) - return obj - - def _chop_columns(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: - """ - Fastpath equivalent to `sdata.iloc[:, slice_obj]` - """ - mgr = sdata._mgr.get_slice(slice_obj, axis=0) + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: + # Fastpath equivalent to: + # if self.axis == 0: + # return sdata.iloc[slice_obj] + # else: + # return sdata.iloc[:, slice_obj] + mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) # __finalize__ not called here, must be applied by caller if applicable # fastpath equivalent to `return sdata._constructor(mgr)` diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 120044831dc7d..7afb6e5d7e544 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -122,15 +122,6 @@ def apply( def isna(self: T, func) -> T: return self.apply("apply", func=func) - def get_slice(self: T, slobj: slice, axis: int = 0) -> T: - raise AbstractMethodError(self) - - def get_slice_index(self: T, slobj: slice) -> T: - """ - get_slice specialized to axis=self.ndim-1 - """ - return self.get_slice(slobj, axis=self.ndim - 1) - class SingleDataManager(DataManager): ndim = 1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 067eb48556d47..84289c46ab501 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -800,8 +800,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: - slicer = (slice(None), slobj) - new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] + new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] else: raise IndexError("Requested axis not found in manager") @@ -810,15 +809,6 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: return type(self)._simple_new(tuple(new_blocks), new_axes) - def get_slice_index(self, slobj: slice) -> BlockManager: - # get_slice specialized to axis = 1 and ndim == 2 - new_blocks = [blk.getitem_block_index(slobj) for blk in self.blocks] - - axes = self.axes - new_axes = [axes[0], axes[1]._getitem_slice(slobj)] - - return type(self)._simple_new(tuple(new_blocks), new_axes) - @property def nblocks(self) -> int: return len(self.blocks) From e36b7b087ba66e16447bce2f4d92dbeb8daaeed0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Mar 2021 07:31:19 -0700 Subject: [PATCH 13/13] TST: arr[foo, ...] --- pandas/core/arrays/sparse/array.py | 7 +++++-- pandas/tests/extension/base/getitem.py | 9 +++++++++ pandas/tests/extension/json/array.py | 7 +++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 04737619e247e..aa16dc9a22f4e 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -817,8 +817,11 @@ def value_counts(self, dropna: bool = True): def __getitem__(self, key): if isinstance(key, tuple): - if len(key) and key[0] is Ellipsis: - key = key[1:] + if len(key) > 1: + if key[0] is Ellipsis: + key = key[1:] + elif key[-1] is Ellipsis: + key = key[:-1] if len(key) > 1: raise IndexError("too many indices for array.") key = key[0] diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 27feb86070f5e..971da37c105bd 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -250,12 +250,21 @@ def test_getitem_ellipsis_and_slice(self, data): result = data[..., :] self.assert_extension_array_equal(result, data) + result = data[:, ...] + self.assert_extension_array_equal(result, data) + result = data[..., :3] self.assert_extension_array_equal(result, data[:3]) + result = data[:3, ...] + self.assert_extension_array_equal(result, data[:3]) + result = data[..., ::2] self.assert_extension_array_equal(result, data[::2]) + result = data[::2, ...] + self.assert_extension_array_equal(result, data[::2]) + def test_get(self, data): # GH 20882 s = pd.Series(data, index=[2 * i for i in range(len(data))]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 370770b7e5e19..a4fedd9a4c5da 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -84,8 +84,11 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, tuple): - if len(item) and item[0] is Ellipsis: - item = item[1:] + if len(item) > 1: + if item[0] is Ellipsis: + item = item[1:] + elif item[-1] is Ellipsis: + item = item[:-1] if len(item) > 1: raise IndexError("too many indices for array.") item = item[0]