Skip to content

PERF: Split blocks in blk.delete #50148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 9, 2023
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,19 @@ def time_assign_list_of_columns_concat(self):
concat([self.df, df], axis=1)


class Setitem:
    """ASV benchmarks for ``DataFrame.__setitem__`` on a wide frame."""

    def setup(self):
        # Use a wide frame (many columns) so that per-column block
        # manipulation cost dominates the timing.
        n_rows, n_cols = 500_000, 500
        self.df = DataFrame(np.random.rand(n_rows, n_cols))

    def time_setitem(self):
        # Overwrite a single existing integer-labeled column.
        self.df[100] = 100

    def time_setitem_list(self):
        # Overwrite several existing columns via a list of labels.
        self.df[[100, 200, 300]] = 100


class ChainIndexing:

params = [None, "warn"]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,7 @@ Performance improvements
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in :meth:`DataFrame.__setitem__` (:issue:`46267`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`)
- Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`)
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
Expand Down
43 changes: 36 additions & 7 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1314,7 +1314,7 @@ def quantile(
# ---------------------------------------------------------------------
# Abstract Methods Overridden By EABackedBlock and NumpyBlock

def delete(self, loc) -> Block:
def delete(self, loc) -> list[Block]:
"""
Return a new Block with the given loc(s) deleted.
"""
Expand Down Expand Up @@ -1531,11 +1531,11 @@ def putmask(self, mask, new) -> list[Block]:

return [self]

def delete(self, loc) -> Block:
def delete(self, loc) -> list[Block]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for 2D EAs we probably want to do the same thing as we do for NumpyBlock?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any 2D EAs? I agree in general, but thought all of them were 1D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see NDArrayBackedExtensionBlock

# This will be unnecessary if/when __array_function__ is implemented
values = self.values.delete(loc)
mgr_locs = self._mgr_locs.delete(loc)
return type(self)(values, placement=mgr_locs, ndim=self.ndim)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]

@cache_readonly
def array_values(self) -> ExtensionArray:
Expand Down Expand Up @@ -1850,10 +1850,39 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
def values_for_json(self) -> np.ndarray:
return self.values

def delete(self, loc) -> Block:
values = np.delete(self.values, loc, 0)
mgr_locs = self._mgr_locs.delete(loc)
return type(self)(values, placement=mgr_locs, ndim=self.ndim)
def delete(self, loc) -> list[Block]:
if not is_list_like(loc):
loc = [loc]

if self.ndim == 1:
values = np.delete(self.values, loc)
mgr_locs = self._mgr_locs.delete(loc)
return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]

if np.max(loc) >= self.values.shape[0]:
raise IndexError

# Add one out-of-bounds indexer as maximum to collect
# all columns after our last indexer if any
loc = np.concatenate([loc, [self.values.shape[0]]])
mgr_locs_arr = self._mgr_locs.as_array
new_blocks: list[Block] = []

previous_loc = -1
for idx in loc:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this reminds me of BlockManager._slice_take_blocks_ax0 with only_slice=True if we passed e.g. slice_or_indexer = np.delete(np.arange(length), loc)). is it worth trying to de-duplicate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure, this seemed to do something different than what we need. Tried to get it to work a while back, but didn't get it to work


if idx == previous_loc + 1:
# There is no column between the current and previous idx
pass
else:
values = self.values[previous_loc + 1 : idx, :]
locs = mgr_locs_arr[previous_loc + 1 : idx]
nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim)
new_blocks.append(nb)

previous_loc = idx

return new_blocks


class NumericBlock(NumpyBlock):
Expand Down
20 changes: 16 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1235,12 +1235,24 @@ def value_getitem(placement):
if len(val_locs) == len(blk.mgr_locs):
removed_blknos.append(blkno_l)
else:
nb = blk.delete(blk_locs)
nbs = blk.delete(blk_locs)
# Add first block where old block was and remaining blocks at
# the end to avoid updating all block numbers
first_nb = nbs[0]
nbs_tup = tuple(nb for nb in nbs[1:])
nr_blocks = len(self.blocks)
blocks_tup = (
self.blocks[:blkno_l] + (nb,) + self.blocks[blkno_l + 1 :]
self.blocks[:blkno_l]
+ (first_nb,)
+ self.blocks[blkno_l + 1 :]
+ nbs_tup
)
self.blocks = blocks_tup
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

for i, nb in enumerate(nbs_tup):
self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
# blk.delete gives a copy, so we can remove a possible reference
self._clear_reference_block(blkno_l)

Expand Down Expand Up @@ -2034,7 +2046,7 @@ def idelete(self, indexer) -> SingleBlockManager:

Ensures that self.blocks doesn't become empty.
"""
nb = self._block.delete(indexer)
nb = self._block.delete(indexer)[0]
self.blocks = (nb,)
self.axes[0] = self.axes[0].delete(indexer)
self._cache.clear()
Expand Down
19 changes: 17 additions & 2 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,8 +1323,23 @@ def test_unstack_non_slice_like_blocks(using_array_manager):
# Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like

mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
df = DataFrame(np.random.randn(15, 4), index=mi)
df[1] = df[1].astype(np.int64)
df = DataFrame(
{
0: np.random.randn(
15,
),
1: np.random.randn(
15,
).astype(np.int64),
2: np.random.randn(
15,
),
3: np.random.randn(
15,
),
},
index=mi,
)
if not using_array_manager:
assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)

Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def test_copy(self, fblock):
def test_delete(self, fblock):
newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(0)
nb = newb.delete(0)[0]
assert newb.mgr_locs is locs

assert nb is not newb
Expand All @@ -299,21 +299,25 @@ def test_delete(self, fblock):
newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(1)
assert len(nb) == 2
assert newb.mgr_locs is locs

tm.assert_numpy_array_equal(
nb.mgr_locs.as_array, np.array([0, 4], dtype=np.intp)
nb[0].mgr_locs.as_array, np.array([0], dtype=np.intp)
)
tm.assert_numpy_array_equal(
nb[1].mgr_locs.as_array, np.array([4], dtype=np.intp)
)
assert not (newb.values[1] == 2).all()
assert (nb.values[1] == 2).all()
assert (nb[1].values[0] == 2).all()

newb = fblock.copy()
locs = newb.mgr_locs
nb = newb.delete(2)
assert len(nb) == 1
tm.assert_numpy_array_equal(
nb.mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
nb[0].mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
)
assert (nb.values[1] == 1).all()
assert (nb[0].values[1] == 1).all()

newb = fblock.copy()

Expand All @@ -328,14 +332,16 @@ def test_delete_datetimelike(self):
assert isinstance(blk.values, TimedeltaArray)

nb = blk.delete(1)
assert isinstance(nb.values, TimedeltaArray)
assert len(nb) == 1
assert isinstance(nb[0].values, TimedeltaArray)

df = DataFrame(arr.view("M8[ns]"))
blk = df._mgr.blocks[0]
assert isinstance(blk.values, DatetimeArray)

nb = blk.delete([1, 3])
assert isinstance(nb.values, DatetimeArray)
assert len(nb) == 1
assert isinstance(nb[0].values, DatetimeArray)

def test_split(self):
# GH#37799
Expand Down