Skip to content

Commit efb2091

Browse files
jbrockmendel authored and luckyvs1 committed
BUG: item_cache invalidation on DataFrame.insert (pandas-dev#38380)
* BUG: item_cache invalidation on DataFrame.insert
* Whatsnew
1 parent a88a756 commit efb2091

File tree

5 files changed

+38
-10
lines changed

5 files changed

+38
-10
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ Interval
127127

128128
Indexing
129129
^^^^^^^^
130-
130+
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
131131
-
132132
-
133133

pandas/core/internals/managers.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from pandas._libs import internals as libinternals, lib
2020
from pandas._typing import ArrayLike, DtypeObj, Label, Shape
21+
from pandas.errors import PerformanceWarning
2122
from pandas.util._validators import validate_bool_kwarg
2223

2324
from pandas.core.dtypes.cast import (
@@ -1222,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False):
12221223
self._known_consolidated = False
12231224

12241225
if len(self.blocks) > 100:
1225-
self._consolidate_inplace()
1226+
warnings.warn(
1227+
"DataFrame is highly fragmented. This is usually the result "
1228+
"of calling `frame.insert` many times, which has poor performance. "
1229+
"Consider using pd.concat instead. To get a de-fragmented frame, "
1230+
"use `newframe = frame.copy()`",
1231+
PerformanceWarning,
1232+
stacklevel=5,
1233+
)
12261234

12271235
def reindex_axis(
12281236
self,

pandas/tests/frame/indexing/test_insert.py

+14
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
import pytest
88

9+
from pandas.errors import PerformanceWarning
10+
911
from pandas import DataFrame, Index
1012
import pandas._testing as tm
1113

@@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self):
6668
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
6769
)
6870
tm.assert_frame_equal(df, exp)
71+
72+
def test_insert_item_cache(self):
73+
df = DataFrame(np.random.randn(4, 3))
74+
ser = df[0]
75+
76+
with tm.assert_produces_warning(PerformanceWarning):
77+
for n in range(100):
78+
df[n + 3] = df[1] * n
79+
80+
ser.values[0] = 99
81+
82+
assert df.iloc[0, 0] == df[0][0]

pandas/tests/frame/test_block_internals.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import numpy as np
66
import pytest
77

8+
from pandas.errors import PerformanceWarning
9+
810
import pandas as pd
911
from pandas import (
1012
Categorical,
@@ -329,12 +331,13 @@ def test_strange_column_corruption_issue(self):
329331
df[0] = np.nan
330332
wasCol = {}
331333

332-
for i, dt in enumerate(df.index):
333-
for col in range(100, 200):
334-
if col not in wasCol:
335-
wasCol[col] = 1
336-
df[col] = np.nan
337-
df[col][dt] = i
334+
with tm.assert_produces_warning(PerformanceWarning):
335+
for i, dt in enumerate(df.index):
336+
for col in range(100, 200):
337+
if col not in wasCol:
338+
wasCol[col] = 1
339+
df[col] = np.nan
340+
df[col][dt] = i
338341

339342
myid = 100
340343

pandas/tests/io/sas/test_sas7bdat.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88
import pytest
99

10-
from pandas.errors import EmptyDataError
10+
from pandas.errors import EmptyDataError, PerformanceWarning
1111
import pandas.util._test_decorators as td
1212

1313
import pandas as pd
@@ -194,7 +194,10 @@ def test_compact_numerical_values(datapath):
194194
def test_many_columns(datapath):
195195
# Test for looking for column information in more places (PR #22628)
196196
fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
197-
df = pd.read_sas(fname, encoding="latin-1")
197+
with tm.assert_produces_warning(PerformanceWarning):
198+
# Many DataFrame.insert calls
199+
df = pd.read_sas(fname, encoding="latin-1")
200+
198201
fname = datapath("io", "sas", "data", "many_columns.csv")
199202
df0 = pd.read_csv(fname, encoding="latin-1")
200203
tm.assert_frame_equal(df, df0)

0 commit comments

Comments
 (0)