From 096d67924a5f7cb4edf7923e624b118331c20beb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 20 May 2022 11:02:12 +0200
Subject: [PATCH 1/3] TST: add copy/view test for setting columns with an array/series

---
 pandas/conftest.py                     |  8 +++
 pandas/tests/copy_view/test_setitem.py | 95 ++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 pandas/tests/copy_view/test_setitem.py

diff --git a/pandas/conftest.py b/pandas/conftest.py
index d330c2de9d23f..c0ae92c1778bc 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1797,3 +1797,11 @@ def using_array_manager():
     Fixture to check if the array manager is being used.
     """
     return pd.options.mode.data_manager == "array"
+
+
+@pytest.fixture
+def using_copy_on_write():
+    """
+    Fixture to check if Copy-on-Write is enabled.
+    """
+    return False
diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
new file mode 100644
index 0000000000000..1c0aa8b38bb5d
--- /dev/null
+++ b/pandas/tests/copy_view/test_setitem.py
@@ -0,0 +1,95 @@
+import numpy as np
+
+from pandas import (
+    DataFrame,
+    Index,
+    RangeIndex,
+    Series,
+)
+import pandas._testing as tm
+
+# -----------------------------------------------------------------------------
+# Copy/view behaviour for the values that are set in a DataFrame
+
+
+def test_set_column_with_array():
+    # Case: setting an array as a new column (df[col] = arr) copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    arr = np.array([1, 2, 3])
+
+    df["c"] = arr
+
+    # the array data is copied
+    assert not np.shares_memory(df["c"].values, arr)
+    # and thus modifying the array does not modify the DataFrame
+    arr[0] = 0
+    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+
+
+def test_set_column_with_series(using_copy_on_write):
+    # Case: setting a series as a new column (df[col] = s) copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    s = Series([1, 2, 3])
+
+    df["c"] = s
+
+    if using_copy_on_write:
+        # with CoW we can delay the copy
+        assert np.shares_memory(df["c"].values, s.values)
+    else:
+        # the series data is copied
+        assert not np.shares_memory(df["c"].values, s.values)
+
+    # and modifying the series does not modify the DataFrame
+    s.iloc[0] = 0
+    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+
+
+def test_set_column_with_index(using_copy_on_write):
+    # Case: setting an index as a new column (df[col] = idx) copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    idx = Index([1, 2, 3])
+
+    df["c"] = idx
+
+    # the index data is copied
+    assert not np.shares_memory(df["c"].values, idx.values)
+
+    # and thus modifying the index does not modify the DataFrame
+    idx.values[0] = 0
+    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+
+    # however, in case of a RangeIndex, we currently don't copy the cached
+    # "materialized" values
+    idx = RangeIndex(1, 4)
+    arr = idx.values
+
+    df["d"] = idx
+
+    if using_copy_on_write:
+        assert not np.shares_memory(df["d"].values, arr)
+        arr[0] = 0
+        tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d"))
+    else:
+        assert np.shares_memory(df["d"].values, arr)
+        arr[0] = 0
+        tm.assert_series_equal(df["d"], Series([0, 2, 3], name="d"))
+
+
+def test_set_columns_with_dataframe(using_copy_on_write):
+    # Case: setting a DataFrame as new columns copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]})
+
+    df[["c", "d"]] = df2
+
+    if using_copy_on_write:
+        # with CoW we can delay the copy
+        assert np.shares_memory(df["c"].values, df2["c"].values)
+    else:
+        # the data is copied
+        assert not np.shares_memory(df["c"].values, df2["c"].values)
+
+    # and modifying the set DataFrame does not modify the original DataFrame
+    df2.iloc[0, 0] = 0
+    tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c"))

From 5c0a8b4dab1d16b65c0ff82284e2a1dbb69bd7f2 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 20 May 2022 14:58:40 +0200
Subject: [PATCH 2/3] Update pandas/tests/copy_view/test_setitem.py

---
 pandas/tests/copy_view/test_setitem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
index 1c0aa8b38bb5d..82ea680278a7f 100644
--- a/pandas/tests/copy_view/test_setitem.py
+++ b/pandas/tests/copy_view/test_setitem.py
@@ -15,7 +15,7 @@
 def test_set_column_with_array():
     # Case: setting an array as a new column (df[col] = arr) copies that data
     df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
-    arr = np.array([1, 2, 3])
+    arr = np.array([1, 2, 3], dtype="int64")
 
     df["c"] = arr
 

From 91386a1df55bf7e6929031d95ad268068756ac09 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 22 May 2022 14:57:20 +0200
Subject: [PATCH 3/3] address feedback

---
 pandas/tests/copy_view/test_setitem.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
index 82ea680278a7f..37714e346eee7 100644
--- a/pandas/tests/copy_view/test_setitem.py
+++ b/pandas/tests/copy_view/test_setitem.py
@@ -29,19 +29,20 @@ def test_set_column_with_array():
 def test_set_column_with_series(using_copy_on_write):
     # Case: setting a series as a new column (df[col] = s) copies that data
     df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
-    s = Series([1, 2, 3])
+    ser = Series([1, 2, 3])
 
-    df["c"] = s
+    df["c"] = ser
 
     if using_copy_on_write:
         # with CoW we can delay the copy
-        assert np.shares_memory(df["c"].values, s.values)
+        assert np.shares_memory(df["c"].values, ser.values)
     else:
         # the series data is copied
-        assert not np.shares_memory(df["c"].values, s.values)
+        assert not np.shares_memory(df["c"].values, ser.values)
 
     # and modifying the series does not modify the DataFrame
-    s.iloc[0] = 0
+    ser.iloc[0] = 0
+    assert ser.iloc[0] == 0
     tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
 
 