BUG: item_cache invalidation on DataFrame.insert (pandas-dev#38380)

jbrockmendel · luckyvs1 · commit efb2091ba7a8 · 2021-01-19T23:18:34.000-08:00
* BUG: item_cache invalidation on DataFrame.insert

* Whatsnew
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -127,7 +127,7 @@ Interval
 
 Indexing
 ^^^^^^^^
-
+- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
 -
 -
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -18,6 +18,7 @@
 
 from pandas._libs import internals as libinternals, lib
 from pandas._typing import ArrayLike, DtypeObj, Label, Shape
+from pandas.errors import PerformanceWarning
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -1222,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False):
         self._known_consolidated = False
 
         if len(self.blocks) > 100:
-            self._consolidate_inplace()
+            warnings.warn(
+                "DataFrame is highly fragmented.  This is usually the result "
+                "of calling `frame.insert` many times, which has poor performance.  "
+                "Consider using pd.concat instead.  To get a de-fragmented frame, "
+                "use `newframe = frame.copy()`",
+                PerformanceWarning,
+                stacklevel=5,
+            )
 
     def reindex_axis(
         self,
diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas.errors import PerformanceWarning
+
 from pandas import DataFrame, Index
 import pandas._testing as tm
 
@@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self):
             [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
         )
         tm.assert_frame_equal(df, exp)
+
+    def test_insert_item_cache(self):
+        df = DataFrame(np.random.randn(4, 3))
+        ser = df[0]
+
+        with tm.assert_produces_warning(PerformanceWarning):
+            for n in range(100):
+                df[n + 3] = df[1] * n
+
+        ser.values[0] = 99
+
+        assert df.iloc[0, 0] == df[0][0]
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+from pandas.errors import PerformanceWarning
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -329,12 +331,13 @@ def test_strange_column_corruption_issue(self):
         df[0] = np.nan
         wasCol = {}
 
-        for i, dt in enumerate(df.index):
-            for col in range(100, 200):
-                if col not in wasCol:
-                    wasCol[col] = 1
-                    df[col] = np.nan
-                df[col][dt] = i
+        with tm.assert_produces_warning(PerformanceWarning):
+            for i, dt in enumerate(df.index):
+                for col in range(100, 200):
+                    if col not in wasCol:
+                        wasCol[col] = 1
+                        df[col] = np.nan
+                    df[col][dt] = i
 
         myid = 100
 
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -7,7 +7,7 @@
 import numpy as np
 import pytest
 
-from pandas.errors import EmptyDataError
+from pandas.errors import EmptyDataError, PerformanceWarning
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -194,7 +194,10 @@ def test_compact_numerical_values(datapath):
 def test_many_columns(datapath):
     # Test for looking for column information in more places (PR #22628)
     fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
-    df = pd.read_sas(fname, encoding="latin-1")
+    with tm.assert_produces_warning(PerformanceWarning):
+        # Many DataFrame.insert calls
+        df = pd.read_sas(fname, encoding="latin-1")
+
     fname = datapath("io", "sas", "data", "many_columns.csv")
     df0 = pd.read_csv(fname, encoding="latin-1")
     tm.assert_frame_equal(df, df0)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -127,7 +127,7 @@ Interval` |
| `127` | `127` |  |
| `128` | `128` | `Indexing` |
| `129` | `129` | `^^^^^^^^` |
| `130` |  | `-` |
|  | `130` | `` +- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) `` |
| `131` | `131` | `-` |
| `132` | `132` | `-` |
| `133` | `133` |  |