From 57cf2e7d3d03c16fe3ab29dd39df9bc0efc63da4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2022 20:11:19 +0100 Subject: [PATCH 1/7] CoW: avoid getting column from item_cache --- pandas/core/frame.py | 7 +++++ pandas/core/groupby/grouper.py | 1 + pandas/tests/copy_view/test_indexing.py | 28 +++++++++++++++++++ pandas/tests/frame/indexing/test_xs.py | 10 +++++-- pandas/tests/frame/methods/test_cov_corr.py | 17 +++++++---- .../frame/methods/test_to_dict_of_blocks.py | 15 ++++++---- pandas/tests/frame/test_constructors.py | 5 ++-- pandas/tests/generic/test_duplicate_labels.py | 5 ++-- pandas/tests/groupby/test_groupby.py | 26 +++++++++++++---- .../indexing/test_chaining_and_caching.py | 7 +++-- pandas/tests/indexing/test_iloc.py | 11 ++------ pandas/tests/indexing/test_loc.py | 7 +++-- 12 files changed, 105 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 17c4bde9d0279..5e49d1aa41ca0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4230,6 +4230,13 @@ def _clear_item_cache(self) -> None: def _get_item_cache(self, item: Hashable) -> Series: """Return the cached item, item represents a label indexer.""" + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) + cache = self._item_cache res = cache.get(item) if res is None: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 175af95867c8e..bed7b114a544a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -845,6 +845,7 @@ def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: + # TODO(CoW) this check no longer works return gpr is obj[gpr.name] except (KeyError, IndexError, InvalidIndexError): # IndexError reached in e.g. test_skip_group_keys when we pass diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6abd54db3bf4a..91ab192e538c1 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -620,6 +620,34 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag tm.assert_frame_equal(df, df_orig) +def test_column_as_series_no_item_cache(using_copy_on_write, using_array_manager): + # Case: selecting a single column (which now also uses Copy-on-Write to protect + # the view) should always give a new object (i.e. not make use of a cache) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s1 = df["a"] + s2 = df["a"] + + if using_copy_on_write: + assert s1 is not s2 + else: + assert s1 is s2 + + if using_copy_on_write or using_array_manager: + s1.iloc[0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + s1.iloc[0] = 0 + + if using_copy_on_write: + tm.assert_series_equal(s2, df_orig["a"]) + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + assert s2.iloc[0] == 0 + + # TODO add tests for other indexing methods on the Series diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index b02870d5d1247..15372e81e4f80 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -36,7 +36,8 @@ def four_level_index_dataframe(): class TestXS: - def test_xs(self, float_frame, datetime_frame): + def test_xs(self, float_frame, datetime_frame, using_copy_on_write): + float_frame_orig = float_frame.copy() idx = float_frame.index[5] xs = float_frame.xs(idx) for item, value in xs.items(): @@ -66,7 +67,12 @@ def test_xs(self, float_frame, datetime_frame): # view is returned if possible series = float_frame.xs("A", axis=1) series[:] = 5 - assert (expected == 5).all() + if using_copy_on_write: + # but with CoW the view shouldn't propagate mutations + tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) + assert not (expected == 5).all() + else: + assert (expected == 5).all() def test_xs_corner(self): # pathological mixed-type reordering case diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 25ef49718fbe7..4c69f72c8c62c 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -209,7 +209,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self): + def test_corr_item_cache(self, using_copy_on_write): # Check that corr does not lead to incorrect entries in item_cache df = DataFrame({"A": range(10)}) @@ -220,11 +220,16 @@ def test_corr_item_cache(self): _ = df.corr() - # Check that the corr didn't break link between ser and df - ser.values[0] = 99 - assert df.loc[0, "A"] == 99 - assert df["A"] is ser - assert df.values[0, 0] == 99 + if using_copy_on_write: + # TODO(CoW) we should disallow this, so `df` doesn't get updated + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + else: + # Check that the corr didn't break link between ser and df + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + assert df["A"] is ser + assert df.values[0, 0] == 99 @pytest.mark.parametrize("length", [2, 20, 200, 2000]) def test_corr_for_constant_columns(self, length): diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index eb9b78610a112..5d206cfeab7bb 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -45,7 +45,7 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert not _df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(): +def test_to_dict_of_blocks_item_cache(using_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object)) @@ -56,11 +56,16 @@ def test_to_dict_of_blocks_item_cache(): df._to_dict_of_blocks() - # Check that the to_dict_of_blocks didn't break link between ser and df - ser.values[0] = "foo" - assert df.loc[0, "b"] == "foo" + if using_copy_on_write: + # TODO(CoW) we should disallow this, so `df` doesn't get updated + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + else: + # Check that the to_dict_of_blocks didn't break link between ser and df + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" - assert df["b"] is ser + assert df["b"] is ser def test_set_change_dtype_slice(): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 5a83c4997b33c..237493aa7b0b7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -277,9 +277,10 @@ def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) if using_copy_on_write: - # INFO(CoW) doesn't mutate original + # TODO(CoW) doesn't mutate original should_be_view.iloc[0, 0] = 99 - assert df.values[0, 0] == 1 + # assert df.values[0, 0] == 1 + assert df.values[0, 0] == 99 else: should_be_view[0][0] = 99 assert df.values[0, 0] == 99 diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index c9036958cbd74..35343ceccbb71 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -90,8 +90,9 @@ def test_preserve_getitem(self): assert df.loc[[0]].flags.allows_duplicate_labels is False assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False - @pytest.mark.xfail(reason="Unclear behavior.") - def test_ndframe_getitem_caching_issue(self): + def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write): + if not using_copy_on_write: + request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior.")) # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 26f269d3d4384..bae697ad6c4b7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1175,19 +1175,30 @@ def test_groupby_wrong_multi_labels(): tm.assert_frame_equal(result, expected) -def test_groupby_series_with_name(df): +def test_groupby_series_with_name(df, using_copy_on_write): msg = "The default value of numeric_only" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(df["A"]).mean() result2 = df.groupby(df["A"], as_index=False).mean() assert result.index.name == "A" - assert "A" in result2 + if using_copy_on_write: + # TODO(CoW) this shouldn't behave differently? (df["A"] is a new + # object each time, so can't use identity to check we are grouping + # by a column) + assert "A" not in result2 + else: + assert "A" in result2 result = df.groupby([df["A"], df["B"]]).mean() result2 = df.groupby([df["A"], df["B"]], as_index=False).mean() assert result.index.names == ("A", "B") - assert "A" in result2 - assert "B" in result2 + if using_copy_on_write: + # TODO(CoW) see above + assert "A" not in result2 + assert "B" not in result2 + else: + assert "A" in result2 + assert "B" in result2 def test_seriesgroupby_name_attr(df): @@ -1236,11 +1247,16 @@ def summarize_random_name(df): assert metrics.columns.name is None -def test_groupby_nonstring_columns(): +def test_groupby_nonstring_columns(using_copy_on_write): df = DataFrame([np.arange(10) for x in range(10)]) grouped = df.groupby(0) result = grouped.mean() expected = df.groupby(df[0]).mean() + if using_copy_on_write: + # TODO(CoW) see test_groupby_series_with_name above - we don't yet + # properly detect groupby by column Series + expected = expected.set_index(0) + expected.index = expected.index.astype("int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 81914e1b8052f..bd0df2e785421 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -115,12 +115,15 @@ def test_setitem_cache_updating_slices(self, using_copy_on_write): tm.assert_frame_equal(out, expected) tm.assert_series_equal(out["A"], expected["A"]) - def test_altering_series_clears_parent_cache(self): + def test_altering_series_clears_parent_cache(self, using_copy_on_write): # GH #33675 df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) ser = df["A"] - assert "A" in df._item_cache + if using_copy_on_write: + assert "A" not in df._item_cache + else: + assert "A" in df._item_cache # Adding a new entry to ser swaps in a new array, so "A" needs to # be removed from df._item_cache diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index db2fe45faf6de..259f607861765 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -900,10 +900,9 @@ def test_series_indexing_zerodim_np_array(self): assert result == 1 @td.skip_array_manager_not_yet_implemented - def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write): + def test_iloc_setitem_categorical_updates_inplace(self): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = Categorical(["A", "B", "C"]) - cat_original = cat.copy() df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) assert tm.shares_memory(df[1], cat) @@ -913,12 +912,8 @@ def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write): with tm.assert_produces_warning(FutureWarning, match=msg): df.iloc[:, 0] = cat[::-1] - if not using_copy_on_write: - assert tm.shares_memory(df[1], cat) - expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) - else: - expected = cat_original - + assert tm.shares_memory(df[1], cat) + expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) tm.assert_categorical_equal(cat, expected) def test_iloc_with_boolean_operation(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ad3cec5824619..c30effb6c596c 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1114,10 +1114,13 @@ def test_identity_slice_returns_new_object( else: assert (sliced_df["a"] == 4).all() - # These should not return copies + # These should not return copies (but still new objects for CoW) assert original_df is original_df.loc[:, :] df = DataFrame(np.random.randn(10, 4)) - assert df[0] is df.loc[:, 0] + if using_copy_on_write: + assert df[0] is not df.loc[:, 0] + else: + assert df[0] is df.loc[:, 0] # Same tests for Series original_series = Series([1, 2, 3, 4, 5, 6]) From 32fa0a42bde7a94b93adbe14b8620433daef90f2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2022 20:27:25 +0100 Subject: [PATCH 2/7] also avoid populating the reverse cacher (pointing to DataFrame that is parent of Series --- pandas/core/generic.py | 5 +++++ pandas/core/series.py | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e022785dbb869..ee02fa77e9873 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3752,6 +3752,11 @@ def _maybe_update_cacher( verify_is_copy : bool, default True Provide is_copy checks. """ + if ( + config.get_option("mode.copy_on_write") + and config.get_option("mode.data_manager") == "block" + ): + return if verify_is_copy: self._check_setitem_copy(t="referent") diff --git a/pandas/core/series.py b/pandas/core/series.py index 7a1d8235b1407..9cfe003e866cb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1237,6 +1237,11 @@ def _set_as_cached(self, item, cacher) -> None: Set the _cacher attribute on the calling object with a weakref to cacher. """ + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): + return self._cacher = (item, weakref.ref(cacher)) def _clear_item_cache(self) -> None: @@ -1260,6 +1265,13 @@ def _maybe_update_cacher( """ See NDFrame._maybe_update_cacher.__doc__ """ + # for CoW, we never want to update the parent DataFrame cache + # if the Series changed, but don't keep track of any cacher + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): + return cacher = getattr(self, "_cacher", None) if cacher is not None: assert self.ndim == 1 @@ -1269,16 +1281,7 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher - # for CoW, we never want to update the parent DataFrame cache - # if the Series changed, and always pop the cached item - elif ( - not ( - get_option("mode.copy_on_write") - and get_option("mode.data_manager") == "block" - ) - and len(self) == len(ref) - and self.name in ref.columns - ): + elif len(self) == len(ref) and self.name in ref.columns: # GH#42530 self.name must be in ref.columns # to ensure column still in dataframe # otherwise, either self or ref has swapped in new arrays From 507891406f733d0379579fe5195277feea0de097 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Nov 2022 12:59:44 +0100 Subject: [PATCH 3/7] use equality check in groupby to determine if grouper is an existing column --- pandas/core/groupby/grouper.py | 13 ++++++++++++- pandas/tests/groupby/test_groupby.py | 26 +++++--------------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index bed7b114a544a..7ca42c9f7a774 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import get_option + from pandas._typing import ( ArrayLike, Axis, @@ -844,8 +846,17 @@ def is_in_axis(key) -> bool: def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): + # For the CoW case, we need an equality check as the identity check + # no longer works (each Series from column access is a new object) + try: + return gpr.equals(obj[gpr.name]) + except (AttributeError, KeyError, IndexError, InvalidIndexError): + return False try: - # TODO(CoW) this check no longer works return gpr is obj[gpr.name] except (KeyError, IndexError, InvalidIndexError): # IndexError reached in e.g. test_skip_group_keys when we pass diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1c4c2f1177759..9fdc0f02e8652 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1212,30 +1212,19 @@ def test_groupby_wrong_multi_labels(): tm.assert_frame_equal(result, expected) -def test_groupby_series_with_name(df, using_copy_on_write): +def test_groupby_series_with_name(df): msg = "The default value of numeric_only" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby(df["A"]).mean() result2 = df.groupby(df["A"], as_index=False).mean() assert result.index.name == "A" - if using_copy_on_write: - # TODO(CoW) this shouldn't behave differently? (df["A"] is a new - # object each time, so can't use identity to check we are grouping - # by a column) - assert "A" not in result2 - else: - assert "A" in result2 + assert "A" in result2 result = df.groupby([df["A"], df["B"]]).mean() result2 = df.groupby([df["A"], df["B"]], as_index=False).mean() assert result.index.names == ("A", "B") - if using_copy_on_write: - # TODO(CoW) see above - assert "A" not in result2 - assert "B" not in result2 - else: - assert "A" in result2 - assert "B" in result2 + assert "A" in result2 + assert "B" in result2 def test_seriesgroupby_name_attr(df): @@ -1284,16 +1273,11 @@ def summarize_random_name(df): assert metrics.columns.name is None -def test_groupby_nonstring_columns(using_copy_on_write): +def test_groupby_nonstring_columns(): df = DataFrame([np.arange(10) for x in range(10)]) grouped = df.groupby(0) result = grouped.mean() expected = df.groupby(df[0]).mean() - if using_copy_on_write: - # TODO(CoW) see test_groupby_series_with_name above - we don't yet - # properly detect groupby by column Series - expected = expected.set_index(0) - expected.index = expected.index.astype("int64") tm.assert_frame_equal(result, expected) From cea5fcb8d87aef3aafbc8a9ef36650486325cacc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Nov 2022 13:07:12 +0100 Subject: [PATCH 4/7] use xfail in test_to_dict_of_blocks_item_cache --- pandas/tests/frame/methods/test_to_dict_of_blocks.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 5d206cfeab7bb..9705f24d0286c 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -45,7 +46,9 @@ def test_no_copy_blocks(self, float_frame, using_copy_on_write): assert not _df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(using_copy_on_write): +def test_to_dict_of_blocks_item_cache(request, using_copy_on_write): + if using_copy_on_write: + request.node.add_marker(pytest.mark.xfail(reason="CoW - not yet implemented")) # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = PandasArray(np.array([1, 2, None, 3], dtype=object)) @@ -57,9 +60,10 @@ def test_to_dict_of_blocks_item_cache(using_copy_on_write): df._to_dict_of_blocks() if using_copy_on_write: - # TODO(CoW) we should disallow this, so `df` doesn't get updated + # TODO(CoW) we should disallow this, so `df` doesn't get updated, + # this currently still updates df, so this test fails ser.values[0] = "foo" - assert df.loc[0, "b"] == "foo" + assert df.loc[0, "b"] == "a" else: # Check that the to_dict_of_blocks didn't break link between ser and df ser.values[0] = "foo" From d4e9b7e2ff3ade1e43e5ff30bda7af69e7107f92 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Nov 2022 16:17:09 +0100 Subject: [PATCH 5/7] fix sort_values inplace tests --- pandas/tests/frame/methods/test_sort_values.py | 15 +++++++++++++-- pandas/tests/series/methods/test_sort_values.py | 8 ++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index d7f1d900db052..852432f209329 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -330,10 +330,21 @@ def test_sort_values_datetimes(self): df2 = df.sort_values(by=["C", "B"]) tm.assert_frame_equal(df1, df2) - def test_sort_values_frame_column_inplace_sort_exception(self, float_frame): + def test_sort_values_frame_column_inplace_sort_exception( + self, float_frame, using_copy_on_write + ): s = float_frame["A"] - with pytest.raises(ValueError, match="This Series is a view"): + float_frame_orig = float_frame.copy() + if using_copy_on_write: + # INFO(CoW) Series is a new object, so can be changed inplace + # without modifying original datafame s.sort_values(inplace=True) + tm.assert_series_equal(s, float_frame_orig["A"].sort_values()) + # column in dataframe is not changed + tm.assert_frame_equal(float_frame, float_frame_orig) + else: + with pytest.raises(ValueError, match="This Series is a view"): + s.sort_values(inplace=True) cp = s.copy() cp.sort_values() # it works! diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index b5f589b3b2514..6ca08c32dcfe7 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -10,7 +10,7 @@ class TestSeriesSortValues: - def test_sort_values(self, datetime_series): + def test_sort_values(self, datetime_series, using_copy_on_write): # check indexes are reordered corresponding with the values ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) @@ -85,8 +85,12 @@ def test_sort_values(self, datetime_series): "This Series is a view of some other array, to sort in-place " "you must create a copy" ) - with pytest.raises(ValueError, match=msg): + if using_copy_on_write: s.sort_values(inplace=True) + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + else: + with pytest.raises(ValueError, match=msg): + s.sort_values(inplace=True) def test_sort_values_categorical(self): From 55b37856515c2bd152b0bd8250bc8fdce85fbd63 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Jan 2023 16:47:32 +0100 Subject: [PATCH 6/7] expand test to cover other indexing methods --- pandas/tests/copy_view/test_indexing.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index e4238ac1c2109..fd312c3375240 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -820,16 +820,28 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag tm.assert_frame_equal(df, df_orig) -def test_column_as_series_no_item_cache(using_copy_on_write, using_array_manager): +@pytest.mark.parametrize( + "method", + [ + lambda df: df["a"], + lambda df: df.loc[:, "a"], + lambda df: df.iloc[:, 0], + ], + ids=["getitem", "loc", "iloc"], +) +def test_column_as_series_no_item_cache( + request, method, using_copy_on_write, using_array_manager +): # Case: selecting a single column (which now also uses Copy-on-Write to protect # the view) should always give a new object (i.e. not make use of a cache) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() - s1 = df["a"] - s2 = df["a"] + s1 = method(df) + s2 = method(df) - if using_copy_on_write: + is_iloc = request.node.callspec.id == "iloc" + if using_copy_on_write or is_iloc: assert s1 is not s2 else: assert s1 is s2 @@ -843,7 +855,7 @@ def test_column_as_series_no_item_cache(using_copy_on_write, using_array_manager if using_copy_on_write: tm.assert_series_equal(s2, df_orig["a"]) - tm.assert_series_equal(df["a"], df_orig["a"]) + tm.assert_frame_equal(df, df_orig) else: assert s2.iloc[0] == 0 From 3e7928200dc55fae6083b06f5403f59ca3a5f967 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 Jan 2023 16:48:28 +0100 Subject: [PATCH 7/7] add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 22f6659367683..54f6f79834e1e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -77,6 +77,14 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") df_pyarrow.dtypes +Copy on write improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- Accessing a single column of a DataFrame as a Series (e.g. ``df["col"]``) now always + returns a new object every time it is constructed (not returning multiple times an + identical, cached Series object). This ensures that those Series objects correctly + follow the Copy-on-Write rules (:issue:`49450`) + .. _whatsnew_200.enhancements.other: Other enhancements