diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 65ea14da08c79..a5bba8e5592c7 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -476,6 +476,19 @@ def time_assign_list_of_columns_concat(self): concat([self.df, df], axis=1) +class Setitem: + def setup(self): + N = 500_000 + cols = 500 + self.df = DataFrame(np.random.rand(N, cols)) + + def time_setitem(self): + self.df[100] = 100 + + def time_setitem_list(self): + self.df[[100, 200, 300]] = 100 + + class ChainIndexing: params = [None, "warn"] diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5b725eb4d2a98..de3fc3c1968a5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -778,6 +778,7 @@ Performance improvements - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) +- Performance improvement in :meth:`DataFrame.__setitem__` (:issue:`46267`) - Performance improvement in ``var`` and ``std`` for nullable dtypes (:issue:`48379`). - Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f65722ac9685b..aa859fac04921 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1315,11 +1315,48 @@ def quantile( # --------------------------------------------------------------------- # Abstract Methods Overridden By EABackedBlock and NumpyBlock - def delete(self, loc) -> Block: - """ - Return a new Block with the given loc(s) deleted. + def delete(self, loc) -> list[Block]: + """Deletes the locs from the block. + + We split the block to avoid copying the underlying data. We create new + blocks for every connected segment of the initial block that is not deleted. + The new blocks point to the initial array. """ - raise AbstractMethodError(self) + if not is_list_like(loc): + loc = [loc] + + if self.ndim == 1: + values = cast(np.ndarray, self.values) + values = np.delete(values, loc) + mgr_locs = self._mgr_locs.delete(loc) + return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + + if np.max(loc) >= self.values.shape[0]: + raise IndexError + + # Add one out-of-bounds indexer as maximum to collect + # all columns after our last indexer if any + loc = np.concatenate([loc, [self.values.shape[0]]]) + mgr_locs_arr = self._mgr_locs.as_array + new_blocks: list[Block] = [] + + previous_loc = -1 + for idx in loc: + + if idx == previous_loc + 1: + # There is no column between current and last idx + pass + else: + # No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[slice, slice]" + values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa + locs = mgr_locs_arr[previous_loc + 1 : idx] + nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim) + new_blocks.append(nb) + + previous_loc = idx + + return new_blocks @property def is_view(self) -> bool: @@ -1532,11 +1569,13 @@ def putmask(self, mask, new) -> list[Block]: return [self] - def delete(self, loc) -> Block: + def delete(self, loc) -> list[Block]: # This will be unnecessary if/when __array_function__ is implemented - values = self.values.delete(loc) - mgr_locs = self._mgr_locs.delete(loc) - return type(self)(values, placement=mgr_locs, ndim=self.ndim) + if self.ndim == 1: + values = self.values.delete(loc) + mgr_locs = self._mgr_locs.delete(loc) + return [type(self)(values, placement=mgr_locs, ndim=self.ndim)] + return super().delete(loc) @cache_readonly def array_values(self) -> ExtensionArray: @@ -1851,11 +1890,6 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: def values_for_json(self) -> np.ndarray: return self.values - def delete(self, loc) -> Block: - values = np.delete(self.values, loc, 0) - mgr_locs = self._mgr_locs.delete(loc) - return type(self)(values, placement=mgr_locs, ndim=self.ndim) - class NumericBlock(NumpyBlock): __slots__ = () diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c4e869a3f6a45..c90787ee24277 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1235,14 +1235,24 @@ def value_getitem(placement): if len(val_locs) == len(blk.mgr_locs): removed_blknos.append(blkno_l) else: - nb = blk.delete(blk_locs) + nbs = blk.delete(blk_locs) + # Add first block where old block was and remaining blocks at + # the end to avoid updating all block numbers + first_nb = nbs[0] + nbs_tup = tuple(nbs[1:]) + nr_blocks = len(self.blocks) blocks_tup = ( - self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :] + self.blocks[:blkno_l] + + (first_nb,) + + self.blocks[blkno_l + 1 :] + + nbs_tup ) self.blocks = blocks_tup - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - # blk.delete gives a copy, so we can remove a possible reference - self._clear_reference_block(blkno_l) + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks if len(removed_blknos): # Remove blocks & update blknos and refs accordingly @@ -2034,7 +2044,7 @@ def idelete(self, indexer) -> SingleBlockManager: Ensures that self.blocks doesn't become empty. """ - nb = self._block.delete(indexer) + nb = self._block.delete(indexer)[0] self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) self._cache.clear() diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2be2a052401ed..5034a8473fe63 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1325,8 +1325,15 @@ def test_unstack_non_slice_like_blocks(using_array_manager): # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like mi = MultiIndex.from_product([range(5), ["A", "B", "C"]]) - df = DataFrame(np.random.randn(15, 4), index=mi) - df[1] = df[1].astype(np.int64) + df = DataFrame( + { + 0: np.random.randn(15), + 1: np.random.randn(15).astype(np.int64), + 2: np.random.randn(15), + 3: np.random.randn(15), + }, + index=mi, + ) if not using_array_manager: assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3044aecc26b4f..c253c4bd1ca53 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -285,7 +285,7 @@ def test_copy(self, fblock): def test_delete(self, fblock): newb = fblock.copy() locs = newb.mgr_locs - nb = newb.delete(0) + nb = newb.delete(0)[0] assert newb.mgr_locs is locs assert nb is not newb @@ -299,21 +299,25 @@ def test_delete(self, fblock): newb = fblock.copy() locs = newb.mgr_locs nb = newb.delete(1) + assert len(nb) == 2 assert newb.mgr_locs is locs tm.assert_numpy_array_equal( - nb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp) + nb[0].mgr_locs.as_array, np.array([0], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + nb[1].mgr_locs.as_array, np.array([4], dtype=np.intp) ) assert not (newb.values[1] == 2).all() - assert (nb.values[1] == 2).all() + assert (nb[1].values[0] == 2).all() newb = fblock.copy() - locs = newb.mgr_locs nb = newb.delete(2) + assert len(nb) == 1 tm.assert_numpy_array_equal( - nb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp) + nb[0].mgr_locs.as_array, np.array([0, 2], dtype=np.intp) ) - assert (nb.values[1] == 1).all() + assert (nb[0].values[1] == 1).all() newb = fblock.copy() @@ -328,14 +332,18 @@ def test_delete_datetimelike(self): assert isinstance(blk.values, TimedeltaArray) nb = blk.delete(1) - assert isinstance(nb.values, TimedeltaArray) + assert len(nb) == 2 + assert isinstance(nb[0].values, TimedeltaArray) + assert isinstance(nb[1].values, TimedeltaArray) df = DataFrame(arr.view("M8[ns]")) blk = df._mgr.blocks[0] assert isinstance(blk.values, DatetimeArray) nb = blk.delete([1, 3]) - assert isinstance(nb.values, DatetimeArray) + assert len(nb) == 2 + assert isinstance(nb[0].values, DatetimeArray) + assert isinstance(nb[1].values, DatetimeArray) def test_split(self): # GH#37799