From 8c2e6bab012473b4e14c4631d33edba1abdb489e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Dec 2020 17:10:32 -0800 Subject: [PATCH 1/3] BUG: item_cache invalidation --- pandas/core/internals/managers.py | 21 ++++++++++++------- pandas/tests/frame/indexing/test_insert.py | 14 +++++++++++++ pandas/tests/frame/methods/test_quantile.py | 12 +++++++++++ .../tests/frame/methods/test_sort_values.py | 12 +++++++++++ 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 93ab207d8ce12..f4ceb64433f5f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -18,6 +18,7 @@ from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape +from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -442,7 +443,6 @@ def apply( def quantile( self, axis: int = 0, - consolidate: bool = True, transposed: bool = False, interpolation="linear", qs=None, @@ -472,9 +472,6 @@ def quantile( # simplify some of the code here and in the blocks assert self.ndim >= 2 - if consolidate: - self._consolidate_inplace() - def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 @@ -1226,7 +1223,14 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self._known_consolidated = False if len(self.blocks) > 100: - self._consolidate_inplace() + warnings.warn( + "DataFrame is highly fragmented. This is usually the result " + "of calling `frame.insert` many times, which has poor performance. " + "Consider using pd.concat instead. To get a de-fragmented frame, " + "use `newframe = frame.copy()`", + PerformanceWarning, + stacklevel=3, + ) def reindex_axis( self, @@ -1455,7 +1459,6 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True """ Take items along any axis. """ - self._consolidate_inplace() indexer = ( np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") if isinstance(indexer, slice) @@ -1472,7 +1475,11 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_labels = self.axes[axis].take(indexer) return self.reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + new_axis=new_labels, + indexer=indexer, + axis=axis, + allow_dups=True, + consolidate=False, ) def equals(self, other: object) -> bool: diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 622c93d1c2fdc..6e4deb5469777 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + from pandas import DataFrame, Index import pandas._testing as tm @@ -66,3 +68,15 @@ def test_insert_with_columns_dups(self): [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) + + def test_insert_item_cache(self): + df = DataFrame(np.random.randn(4, 3)) + ser = df[0] + + with tm.assert_produces_warning(PerformanceWarning): + for n in range(100): + df[n + 3] = df[1] * n + + ser.values[0] = 99 + + assert df.iloc[0, 0] == df[0][0] diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 13e00c97d6f71..6ddba8b5e7064 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -517,3 +517,15 @@ def test_quantile_empty_no_columns(self): expected = DataFrame([], index=[0.5], columns=[]) expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) + + def test_quantile_item_cache(self): + # previous behavior incorrect retained an invalid _item_cache entry + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) + df["D"] = df["A"] * 2 + ser = df["A"] + assert len(df._mgr.blocks) == 2 + + df.quantile(numeric_only=False) + ser.values[0] = 99 + + assert df.iloc[0, 0] == df["A"][0] diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index b94f54a4819c0..1bb969956e074 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -544,6 +544,18 @@ def test_sort_values_nat_na_position_default(self): result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) + def test_sort_values_item_cache(self): + # previous behavior incorrect retained an invalid _item_cache entry + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) + df["D"] = df["A"] * 2 + ser = df["A"] + assert len(df._mgr.blocks) == 2 + + df.sort_values(by="A") + ser.values[0] = 99 + + assert df.iloc[0, 0] == df["A"][0] + class TestDataFrameSortKey: # test key sorting (issue 27237) def test_sort_values_inplace_key(self, sort_by_key): From 8029aa18287526f3f8061504c7c706a1700b877f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Dec 2020 18:38:31 -0800 Subject: [PATCH 2/3] Revert broken --- pandas/core/internals/managers.py | 10 +--------- pandas/tests/frame/indexing/test_insert.py | 14 -------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f4ceb64433f5f..0b3f1079cdb16 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -18,7 +18,6 @@ from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape -from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -1223,14 +1222,7 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): self._known_consolidated = False if len(self.blocks) > 100: - warnings.warn( - "DataFrame is highly fragmented. This is usually the result " - "of calling `frame.insert` many times, which has poor performance. " - "Consider using pd.concat instead. To get a de-fragmented frame, " - "use `newframe = frame.copy()`", - PerformanceWarning, - stacklevel=3, - ) + self._consolidate_inplace() def reindex_axis( self, diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 6e4deb5469777..622c93d1c2fdc 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas.errors import PerformanceWarning - from pandas import DataFrame, Index import pandas._testing as tm @@ -68,15 +66,3 @@ def test_insert_with_columns_dups(self): [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] ) tm.assert_frame_equal(df, exp) - - def test_insert_item_cache(self): - df = DataFrame(np.random.randn(4, 3)) - ser = df[0] - - with tm.assert_produces_warning(PerformanceWarning): - for n in range(100): - df[n + 3] = df[1] * n - - ser.values[0] = 99 - - assert df.iloc[0, 0] == df[0][0] From a3aa331cfb1b5af3f5a35b9147746272bacbb028 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Dec 2020 08:38:02 -0800 Subject: [PATCH 3/3] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b40f012f034b6..90f611c55e710 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -104,7 +104,7 @@ Timezones Numeric ^^^^^^^ - +- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`) - -