diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 1791cbb85c355..5da32409c317f 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -52,6 +52,7 @@ class BlockPlacement: def delete(self, loc) -> BlockPlacement: ... def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... def tile_for_unstack(self, factor: int) -> np.ndarray: ... + def to_slices(self) -> list[BlockPlacement]: ... class SharedBlock: _mgr_locs: BlockPlacement diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 559359bdf3353..dd72bf364cc68 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -264,6 +264,20 @@ cdef class BlockPlacement: new_placement = np.concatenate(mapped) return new_placement + def to_slices(self): + """ + Decompose a BlockPlacement into a list of BlockPlacements, each of + which is slice-like. + + Returns + ------- + List[BlockPlacement] + """ + if self.is_slice_like: + return [self] + slices = indexer_as_slices(self.indexer) + return [BlockPlacement(x) for x in slices] + cdef slice slice_canonize(slice s): """ @@ -303,7 +317,7 @@ cdef slice slice_canonize(slice s): stop = start if start < 0 or (stop < 0 and s.stop is not None and step > 0): - raise ValueError("unbounded slice") + raise ValueError("unbounded slice", start, stop, step, s) if stop < 0: return slice(start, None, step) @@ -386,6 +400,50 @@ cdef slice_getitem(slice slc, ind): return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind] +@cython.boundscheck(False) +@cython.wraparound(False) +cdef list indexer_as_slices(int64_t[:] vals): + """ + Convert an indexer to a list of slices. + """ + # TODO: there may be more efficient ways to decompose an indexer into slices + cdef: + Py_ssize_t i, n, start, stop + int64_t d + + if vals is None: + raise TypeError("vals must be ndarray") + + n = vals.shape[0] + + if n == 0: + return [] + + if n == 1: + return [slice(vals[0], vals[0] + 1, 1)] + + # n >= 2 + d = vals[1] - vals[0] + + if d == 0: + # i guess we have duplicate values (TODO: how?) + return [slice(vals[0], vals[0] + 1, 1)] + indexer_as_slices(vals[1:]) + + for i in range(2, n): + if vals[i] - vals[i - 1] != d: + # we need to start a new slice + start = vals[0] + stop = vals[i - 1] + d + return [slice(start, stop, d)] + indexer_as_slices(vals[i:]) + + start = vals[0] + stop = start + n * d + if stop < 0 and d < 0: + return [slice(start, None, d)] + else: + return [slice(start, stop, d)] + + @cython.boundscheck(False) @cython.wraparound(False) cdef slice indexer_as_slice(intp_t[:] vals): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e02a88aafcf34..266b9c728ddd6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4589,6 +4589,9 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) columns, method, copy, level, fill_value, limit, tolerance ) + # If we have made a copy, no need to make another one + copy = False + index = axes["index"] if index is not None: frame = frame._reindex_index( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b355ae2426884..ec84c378deb0e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -661,6 +661,8 @@ def reindex_indexer( new_blocks = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice ) + if copy: + new_blocks = [x.copy() for x in new_blocks] else: new_blocks = [ blk.take_nd( @@ -696,6 +698,7 @@ def _slice_take_blocks_ax0( only_slice : bool, default False If True, we always return views on existing arrays, never copies. This is used when called from ops.blockwise.operate_blockwise. + Ignored; TODO: remove argument. Returns ------- @@ -721,7 +724,7 @@ def _slice_take_blocks_ax0( if allow_fill and fill_value is None: fill_value = blk.fill_value - if not allow_fill and only_slice: + if not allow_fill: # GH#33597 slice instead of take, so we get # views instead of copies blocks = [ @@ -758,8 +761,7 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] - group = not only_slice - for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=False): if blkno == -1: # If we've got here, fill_value was not lib.no_default @@ -785,24 +787,31 @@ def _slice_take_blocks_ax0( # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice: - taker = lib.maybe_indices_to_slice(taker, max_len) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): + bp = libinternals.BlockPlacement(taker) + bps = bp.to_slices() + assert len(bps) == 1 # just checking nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) - elif only_slice: + else: # GH#33597 slice instead of take, so we get # views instead of copies - for i, ml in zip(taker, mgr_locs): - slc = slice(i, i + 1) - bp = BlockPlacement(ml) - nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) + bp = libinternals.BlockPlacement(taker) + bps = bp.to_slices() + left, right = 0, 0 + + for sub in bps: + right += len(sub) + new_locs = mgr_locs[left:right] + # we have isinstance(sub.indexer, slice) + nb = blk.getitem_block_columns( + sub.indexer, new_mgr_locs=new_locs + ) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) - else: - nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) - blocks.append(nb) + left += len(sub) return blocks diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index d2704876c31c5..8c7f05d40552d 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -374,12 +374,11 @@ def test_xs_droplevel_false_view(self, using_array_manager): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - # with mixed dataframe, modifying the parent doesn't modify result - # TODO the "split" path behaves differently here as with single block + # with mixed dataframe, we still get a view on the parent df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - expected = DataFrame({"a": [1]}) + expected = DataFrame({"a": [2]}) tm.assert_frame_equal(result, expected) def test_xs_list_indexer_droplevel_false(self): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 362252e1a6b72..ad5f21e3f7ee5 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -717,7 +717,8 @@ def test_reindex_items(self): mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2") reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) - assert reindexed.nblocks == 2 + assert reindexed.nblocks == 3 + assert all(b.mgr_locs.is_slice_like for b in reindexed.blocks) tm.assert_index_equal(reindexed.items, Index(["g", "c", "a", "d"])) tm.assert_almost_equal( mgr.iget(6).internal_values(), reindexed.iget(0).internal_values()