From ed2e322a23421702ad295d57268159c1bef8b43a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 17 Jan 2023 22:03:11 +0100 Subject: [PATCH 01/23] ENH: Add lazy copy to astype --- pandas/core/generic.py | 10 ++++-- pandas/core/internals/blocks.py | 7 ++++ pandas/core/internals/managers.py | 16 +++++++-- pandas/tests/copy_view/test_methods.py | 46 ++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 700601cb4f024..57067eae577c2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6011,7 +6011,7 @@ def dtypes(self): return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) def astype( - self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise" + self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise" ) -> NDFrameT: """ Cast a pandas object to a specified dtype ``dtype``. @@ -6159,7 +6159,11 @@ def astype( for i, (col_name, col) in enumerate(self.items()): cdt = dtype_ser.iat[i] if isna(cdt): - res_col = col.copy() if copy else col + if using_copy_on_write(): + # Make a shallow copy even if copy=False for CoW + res_col = col.copy(deep=copy) + else: + res_col = col if copy is False else col.copy() else: try: res_col = col.astype(dtype=cdt, copy=copy, errors=errors) @@ -6186,7 +6190,7 @@ def astype( # GH 33113: handle empty frame or series if not results: - return self.copy() + return self.copy(deep=None) # GH 19920: retain column metadata after concat result = concat(results, axis=1, copy=False) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4bb4882574228..2af52ddf82f62 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,9 +11,12 @@ cast, final, ) +import weakref import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import ( Timestamp, internals as libinternals, @@ -152,6 +155,7 @@ class Block(PandasObject): is_extension = False _can_consolidate = True _validate_ndim = True + _ref = None @final @cache_readonly @@ -496,6 +500,9 @@ def astype( f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) + if using_copy_on_write(): + if not copy and values is new_values: + newb._ref = weakref.ref(self) return newb @final diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..943028e0c27b9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -435,8 +435,20 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast ) - def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: - return self.apply("astype", dtype=dtype, copy=copy, errors=errors) + def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + + result = self.apply("astype", dtype=dtype, copy=copy, errors=errors) + if using_copy_on_write() and not copy: + refs = [blk._ref for blk in result.blocks] + if any(ref is not None for ref in refs): + result.refs = refs + result.parent = self + return result def convert(self: T, copy: bool) -> T: return self.apply( diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 8fd9d5c5126c1..5f4ed77b08d2e 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -417,6 +417,52 @@ def test_to_frame(using_copy_on_write): tm.assert_frame_equal(df, expected) +def test_astype_single_dtype(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) + df_orig = df.copy() + df2 = df.astype("float64") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_astype_dict_dtypes(using_copy_on_write): + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} + ) + df_orig = df.copy() + df2 = df.astype({"a": "float64", "c": "float64"}) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + + df2.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize("ax", ["index", "columns"]) def test_swapaxes_noop(using_copy_on_write, ax): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) From 1967a2a47486ab51d950fbb79b9c9a6dedacdefc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 17 Jan 2023 23:09:01 +0100 Subject: [PATCH 02/23] Fix --- pandas/core/internals/blocks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2af52ddf82f62..aaa745a380d06 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -501,7 +501,8 @@ def astype( f"({newb.dtype.name} [{newb.shape}])" ) if using_copy_on_write(): - if not copy and values is new_values: + if not copy: + # This tracks more references than necessary. newb._ref = weakref.ref(self) return newb From 45ea9892a56fb1c5ce7f41a0a0cc50944a07be5a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 17 Jan 2023 23:21:50 +0100 Subject: [PATCH 03/23] Fix test --- pandas/tests/series/test_constructors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 05e40e20f1226..8bf35621a1632 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -874,13 +874,16 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) - def test_constructor_dtype_no_cast(self): + def test_constructor_dtype_no_cast(self, using_copy_on_write): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 - assert s[1] == 5 + if using_copy_on_write: + assert s[1] == 2 + else: + assert s[1] == 5 def test_constructor_datelike_coercion(self): From a2e1e537d37030dfb93534e0936107ab7019823e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 18 Jan 2023 12:18:15 +0100 Subject: [PATCH 04/23] Fix array manager --- pandas/core/internals/array_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b8ef925362e7b..f30b7c4c28859 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -371,7 +371,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast ) - def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: + def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: + if copy is None: + copy = True + return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors) def convert(self: T, copy: bool) -> T: From 8f29fdaa364a71bb04503609f30372b2cfe910d6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 19 Jan 2023 14:24:20 +0100 Subject: [PATCH 05/23] Narrow down --- pandas/core/internals/blocks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index aaa745a380d06..fc43d35001e9e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -501,7 +501,15 @@ def astype( f"({newb.dtype.name} [{newb.shape}])" ) if using_copy_on_write(): - if not copy: + if ( + not copy + and isinstance(values.dtype, np.dtype) + and isinstance(new_values.dtype, np.dtype) + and values is not new_values + ): + # We made a copy + pass + elif not copy: # This tracks more references than necessary. newb._ref = weakref.ref(self) return newb From a1847e8f0a29011102e28ba7813da6076247adb7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 20:29:53 -0500 Subject: [PATCH 06/23] Update astype --- pandas/core/dtypes/astype.py | 33 +++++++++++ pandas/core/internals/blocks.py | 16 ++---- pandas/tests/copy_view/test_methods.py | 78 ++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index e5b0b5658534f..6dde94748efc6 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -26,6 +26,7 @@ is_dtype_equal, is_integer_dtype, is_object_dtype, + is_string_dtype, is_timedelta64_dtype, pandas_dtype, ) @@ -246,3 +247,35 @@ def astype_array_safe( raise return new_values + + +def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: + """Checks if astype avoided copying the data. + + Parameters + ---------- + dtype : Original dtype + new_dtype : target dtype + + Returns + ------- + True if new data is a view, False otherwise + """ + if dtype == new_dtype: + return True + + if isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype): + # Only equal numpy dtypes avoid a copy + return False + + if is_string_dtype(dtype) and is_string_dtype(new_dtype): + return True + elif is_string_dtype(dtype) or is_string_dtype(new_dtype): + return False + + if getattr(dtype, "numpy_dtype", dtype) == getattr( + new_dtype, "numpy_dtype", new_dtype + ): + return True + + return False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 437631bc232e6..1663a491f41e7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -41,7 +41,10 @@ from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.astype import astype_array_safe +from pandas.core.dtypes.astype import ( + astype_array_safe, + astype_is_view, +) from pandas.core.dtypes.cast import ( LossySetitemError, can_hold_element, @@ -501,16 +504,7 @@ def astype( f"({newb.dtype.name} [{newb.shape}])" ) if using_copy_on_write(): - if ( - not copy - and isinstance(values.dtype, np.dtype) - and isinstance(new_values.dtype, np.dtype) - and values is not new_values - ): - # We made a copy - pass - elif not copy: - # This tracks more references than necessary. + if astype_is_view(values.dtype, new_values.dtype): newb._ref = weakref.ref(self) return newb diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6a64dde97bfc5..b45907bac16c4 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat import pa_version_under6p0 + from pandas import ( DataFrame, Index, @@ -509,6 +511,82 @@ def test_astype_single_dtype(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) +def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): + if dtype == "int32[pyarrow]" and pa_version_under6p0: + pytest.skip("pyarrow not installed") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + if new_dtype == "int64[pyarrow]": + pytest.skip("Does not make a copy") + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) +def test_astype_different_target_dtype(using_copy_on_write, dtype): + if dtype == "int32[pyarrow]" and pa_version_under6p0: + pytest.skip("pyarrow not installed") + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.astype(dtype) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 5 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = "x" + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object_update_original( + using_copy_on_write, dtype, new_dtype +): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df2 = df.astype(new_dtype) + df_orig = df2.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df.iloc[0, 0] = "x" + tm.assert_frame_equal(df2, df_orig) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} From 75bdf1d2c3b2b141297213a72d84b5903d00ac26 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 20:39:22 -0500 Subject: [PATCH 07/23] Fix datetime case --- pandas/core/dtypes/astype.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 6dde94748efc6..88d0bee46c8db 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -264,16 +264,20 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: if dtype == new_dtype: return True - if isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype): + elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype): # Only equal numpy dtypes avoid a copy return False if is_string_dtype(dtype) and is_string_dtype(new_dtype): return True + elif is_string_dtype(dtype) or is_string_dtype(new_dtype): return False - if getattr(dtype, "numpy_dtype", dtype) == getattr( + elif dtype.kind in "mM" and new_dtype.kind in "mM": + return True + + elif getattr(dtype, "numpy_dtype", dtype) == getattr( new_dtype, "numpy_dtype", new_dtype ): return True From 528b0fb24597a280291c72583163638ee098406c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 25 Jan 2023 21:41:17 -0500 Subject: [PATCH 08/23] Skip --- pandas/tests/copy_view/test_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b45907bac16c4..d8ccc34087b6f 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -514,7 +514,7 @@ def test_astype_single_dtype(using_copy_on_write): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): - if dtype == "int32[pyarrow]" and pa_version_under6p0: + if dtype == "int64[pyarrow]" and pa_version_under6p0: pytest.skip("pyarrow not installed") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) df_orig = df.copy() From f29a0be285520fbe6b86299660486bf96d08279f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 26 Jan 2023 19:21:34 -0500 Subject: [PATCH 09/23] Fix test --- pandas/tests/copy_view/test_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index d8ccc34087b6f..ce58364fbca76 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -514,7 +514,7 @@ def test_astype_single_dtype(using_copy_on_write): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): - if dtype == "int64[pyarrow]" and pa_version_under6p0: + if new_dtype == "int64[pyarrow]" and pa_version_under6p0: pytest.skip("pyarrow not installed") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) df_orig = df.copy() From 35b9fa4941645c9d25fa352d71074586ca79d492 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 26 Jan 2023 19:36:18 -0500 Subject: [PATCH 10/23] Refactor --- pandas/core/internals/managers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 943028e0c27b9..c8ebb7c46e1f6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -436,13 +436,19 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: ) def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: + return self._apply_and_handle_refs( + "astype", copy=copy, dtype=dtype, errors=errors + ) + + def _apply_and_handle_refs(self, func: str, copy: bool | None, **kwargs): if copy is None: if using_copy_on_write(): copy = False else: copy = True - result = self.apply("astype", dtype=dtype, copy=copy, errors=errors) + result = self.apply(func, copy=copy, **kwargs) + if using_copy_on_write() and not copy: refs = [blk._ref for blk in result.blocks] if any(ref is not None for ref in refs): From 664c31ba660746622bdc88b58a3b154350f7efdf Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 30 Jan 2023 22:13:40 +0100 Subject: [PATCH 11/23] Address review --- pandas/core/dtypes/astype.py | 2 ++ pandas/core/generic.py | 6 +----- pandas/tests/copy_view/test_constructors.py | 8 +++++--- pandas/tests/copy_view/test_methods.py | 22 +++++++++++++++------ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 88d0bee46c8db..ebb0c4f189b7a 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -280,6 +280,8 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: elif getattr(dtype, "numpy_dtype", dtype) == getattr( new_dtype, "numpy_dtype", new_dtype ): + # If underlying numpy dtype is the same, no copy is made, e.g. + # int64 -> Int64 or int64[pyarrow] return True return False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3e100779f0111..2848bee655716 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6290,11 +6290,7 @@ def astype( for i, (col_name, col) in enumerate(self.items()): cdt = dtype_ser.iat[i] if isna(cdt): - if using_copy_on_write(): - # Make a shallow copy even if copy=False for CoW - res_col = col.copy(deep=copy) - else: - res_col = col if copy is False else col.copy() + res_col = col.copy(deep=copy) else: try: res_col = col.astype(dtype=cdt, copy=copy, errors=errors) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index c04c733e5ee1d..f8a57b67f29fb 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import Series @@ -6,13 +7,14 @@ # Copy/view behaviour for Series / DataFrame constructors -def test_series_from_series(using_copy_on_write): +@pytest.mark.parametrize("dtype", [None, "int64"]) +def test_series_from_series(dtype, using_copy_on_write): # Case: constructing a Series from another Series object follows CoW rules: # a new object is returned and thus mutations are not propagated ser = Series([1, 2, 3], name="name") # default is copy=False -> new Series is a shallow copy / view of original - result = Series(ser) + result = Series(ser, dtype=dtype) # the shallow copy still shares memory assert np.shares_memory(ser.values, result.values) @@ -34,7 +36,7 @@ def test_series_from_series(using_copy_on_write): assert np.shares_memory(ser.values, result.values) # the same when modifying the parent - result = Series(ser) + result = Series(ser, dtype=dtype) if using_copy_on_write: # mutating original doesn't mutate new series diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index ce58364fbca76..cb36ded7f80e9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -510,6 +510,11 @@ def test_astype_single_dtype(using_copy_on_write): assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) + # mutating parent also doesn't update result + df2 = df.astype("float64") + df.iloc[0, 2] = 5.5 + tm.assert_frame_equal(df2, df_orig.astype("float64")) + @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) @@ -523,8 +528,6 @@ def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: - if new_dtype == "int64[pyarrow]": - pytest.skip("Does not make a copy") assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block @@ -533,6 +536,11 @@ def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) + # mutating parent also doesn't update result + df2 = df.astype(new_dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(new_dtype)) + @pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) def test_astype_different_target_dtype(using_copy_on_write, dtype): @@ -542,14 +550,16 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype): df_orig = df.copy() df2 = df.astype(dtype) - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 0] = 5 tm.assert_frame_equal(df, df_orig) + # mutating parent also doesn't update result + df2 = df.astype(dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(dtype)) + @pytest.mark.parametrize( "dtype, new_dtype", [("object", "string"), ("string", "object")] From e1dc971997b767e5846dd76e1f7a3532b57f6342 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 30 Jan 2023 22:57:25 +0100 Subject: [PATCH 12/23] Fix decimal case --- pandas/core/dtypes/astype.py | 4 ++++ pandas/tests/arrays/test_array.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index ebb0c4f189b7a..0bac9b463939b 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -271,6 +271,10 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: if is_string_dtype(dtype) and is_string_dtype(new_dtype): return True + elif is_object_dtype(dtype) and new_dtype.kind == "O": + # When the underlying array has dtype object, we don't have to make a copy + return True + elif is_string_dtype(dtype) or is_string_dtype(new_dtype): return False diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 2288ac408d99e..2d25f5a650343 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -421,3 +421,14 @@ def test_array_to_numpy_na(): result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) + + +def test_array_copy_on_write(using_copy_on_write): + df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object") + df2 = df.astype(DecimalDtype()) + df.iloc[0, 0] = 0 + if using_copy_on_write: + expected = pd.DataFrame( + {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() + ) + tm.assert_equal(df2.values, expected.values) From 273b5aa5243cac54f4f29ffcbc68987634be4a46 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 21:32:12 +0100 Subject: [PATCH 13/23] Merge CoW ref tracking --- doc/source/development/copy_on_write.rst | 32 +++ doc/source/development/index.rst | 1 + pandas/_libs/internals.pyi | 14 +- pandas/_libs/internals.pyx | 113 +++++++-- pandas/compat/pickle_compat.py | 2 +- pandas/core/generic.py | 6 +- pandas/core/internals/blocks.py | 33 ++- pandas/core/internals/concat.py | 18 +- pandas/core/internals/managers.py | 223 ++++-------------- pandas/core/internals/ops.py | 2 +- pandas/core/reshape/concat.py | 16 +- pandas/tests/copy_view/test_constructors.py | 4 +- .../copy_view/test_core_functionalities.py | 88 +++++++ pandas/tests/copy_view/test_internals.py | 34 +-- pandas/tests/copy_view/test_methods.py | 6 +- 15 files changed, 320 insertions(+), 272 deletions(-) create mode 100644 doc/source/development/copy_on_write.rst create mode 100644 pandas/tests/copy_view/test_core_functionalities.py diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst new file mode 100644 index 0000000000000..9d39c1a7ed42b --- /dev/null +++ b/doc/source/development/copy_on_write.rst @@ -0,0 +1,32 @@ +.. _copy_on_write: + +{{ header }} + +************* +Copy on write +************* + +Copy on Write is a mechanism to simplify the indexing API and improve +performance through avoiding copies if possible. +CoW means that any DataFrame or Series derived from another in any way always +behaves as a copy. + +Reference tracking +------------------ + +To be able to determine, if we have to make a copy when writing into a DataFrame, +we have to be aware, if the values are shared with another DataFrame. pandas +keeps track of all ``Blocks`` that share values with another block internally to +be able to tell when a copy needs to be triggered. The reference tracking +mechanism is implemented on the Block level. + +We use a custom reference tracker object, ``BlockValuesRefs``, that keeps +track of every block, whose values share memory with each other. The reference +is held through a weak-reference. Every two blocks that share some memory should +point to the same ``BlockValuesRefs`` object. If one block goes out of +scope, the reference to this block dies. As a consequence, the reference tracker +object always knows how many blocks are alive and share memory. + +We can ask the reference tracking object if there is another block alive that shares +data with us before writing into the values. We can trigger a copy before +writing if there is in fact another block alive. diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index c741441cf67a1..69f04494a271c 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -18,6 +18,7 @@ Development contributing_codebase maintaining internals + copy_on_write debugging_extensions extending developer diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 79bdbea71e4d8..5dfcc3726c84f 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -4,6 +4,7 @@ from typing import ( final, overload, ) +import weakref import numpy as np @@ -59,8 +60,13 @@ class SharedBlock: _mgr_locs: BlockPlacement ndim: int values: ArrayLike + refs: BlockValuesRefs def __init__( - self, values: ArrayLike, placement: BlockPlacement, ndim: int + self, + values: ArrayLike, + placement: BlockPlacement, + ndim: int, + refs: BlockValuesRefs | None = ..., ) -> None: ... class NumpyBlock(SharedBlock): @@ -87,3 +93,9 @@ class BlockManager: ) -> None: ... def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... def _rebuild_blknos_and_blklocs(self) -> None: ... + +class BlockValuesRefs: + referenced_blocks: list[weakref.ref] + def __init__(self, blk: SharedBlock) -> None: ... + def add_reference(self, blk: SharedBlock) -> None: ... + def has_reference(self) -> bool: ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 3333ac1115177..c08e21d02aa6d 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -580,9 +580,16 @@ cdef class SharedBlock: """ cdef: public BlockPlacement _mgr_locs + public BlockValuesRefs refs readonly int ndim - def __cinit__(self, values, placement: BlockPlacement, ndim: int): + def __cinit__( + self, + values, + placement: BlockPlacement, + ndim: int, + refs: BlockValuesRefs | None = None, + ): """ Parameters ---------- @@ -591,9 +598,22 @@ cdef class SharedBlock: placement : BlockPlacement ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + refs: BlockValuesRefs, optional + Ref tracking object or None if block does not have any refs. """ self._mgr_locs = placement self.ndim = ndim + if refs is None: + # if no refs are passed, that means we are creating a Block from + # new values that it uniquely owns -> start a new BlockValuesRefs + # object that only references this block + self.refs = BlockValuesRefs(self) + else: + # if refs are passed, this is the BlockValuesRefs object that is shared + # with the parent blocks which share the values, and a reference to this + # new block is added + refs.add_reference(self) + self.refs = refs cpdef __reduce__(self): args = (self.values, self.mgr_locs.indexer, self.ndim) @@ -619,9 +639,15 @@ cdef class NumpyBlock(SharedBlock): cdef: public ndarray values - def __cinit__(self, ndarray values, BlockPlacement placement, int ndim): + def __cinit__( + self, + ndarray values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values cpdef NumpyBlock getitem_block_index(self, slice slicer): @@ -631,7 +657,7 @@ cdef class NumpyBlock(SharedBlock): Assumes self.ndim == 2 """ new_values = self.values[..., slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) cdef class NDArrayBackedBlock(SharedBlock): @@ -641,9 +667,15 @@ cdef class NDArrayBackedBlock(SharedBlock): cdef public: NDArrayBacked values - def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim): + def __cinit__( + self, + NDArrayBacked values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer): @@ -653,16 +685,22 @@ cdef class NDArrayBackedBlock(SharedBlock): Assumes self.ndim == 2 """ new_values = self.values[..., slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) cdef class Block(SharedBlock): cdef: public object values - def __cinit__(self, object values, BlockPlacement placement, int ndim): + def __cinit__( + self, + object values, + BlockPlacement placement, + int ndim, + refs: BlockValuesRefs | None = None, + ): # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement and ndim + # set placement, ndim and refs self.values = values @@ -673,15 +711,11 @@ cdef class BlockManager: public list axes public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs - public list refs - public object parent def __cinit__( self, blocks=None, axes=None, - refs=None, - parent=None, verify_integrity=True, ): # None as defaults for unpickling GH#42345 @@ -695,8 +729,6 @@ cdef class BlockManager: self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable - self.refs = refs - self.parent = parent # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False @@ -805,16 +837,12 @@ cdef class BlockManager: ndarray blknos, blklocs nbs = [] - nrefs = [] for blk in self.blocks: nb = blk.getitem_block_index(slobj) nbs.append(nb) - nrefs.append(weakref.ref(blk)) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - mgr = type(self)( - tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False - ) + mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs @@ -827,7 +855,7 @@ cdef class BlockManager: def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: - new_blocks, new_refs = self._slice_take_blocks_ax0(slobj) + new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: return self._get_index_slice(slobj) else: @@ -836,6 +864,43 @@ cdef class BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)( - tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False - ) + return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) + + +cdef class BlockValuesRefs: + """Tracks all references to a given array. + + Keeps track of all blocks (through weak references) that reference the same + data. + """ + cdef: + public list referenced_blocks + + def __cinit__(self, blk: SharedBlock) -> None: + self.referenced_blocks = [weakref.ref(blk)] + + def add_reference(self, blk: SharedBlock) -> None: + """Adds a new reference to our reference collection. + + Parameters + ---------- + blk: SharedBlock + The block that the new references should point to. + """ + self.referenced_blocks.append(weakref.ref(blk)) + + def has_reference(self) -> bool: + """Checks if block has foreign references. + + A reference is only relevant if it is still alive. The reference to + ourselves does not count. + + Returns + ------- + bool + """ + self.referenced_blocks = [ + obj for obj in self.referenced_blocks if obj() is not None + ] + # Checking for more references than block pointing to itself + return len(self.referenced_blocks) > 1 diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 2fbd3a6cdb046..bfee616d52aac 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -168,7 +168,7 @@ def load_newobj(self) -> None: arr = np.array([], dtype="m8[ns]") obj = cls.__new__(cls, arr, arr.dtype) elif cls is BlockManager and not args: - obj = cls.__new__(cls, (), [], None, False) + obj = cls.__new__(cls, (), [], False) else: obj = cls.__new__(cls, *args) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index caecfac9d25fe..72f9a2f10015d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -768,8 +768,10 @@ def swapaxes( ) assert isinstance(new_mgr, BlockManager) assert isinstance(self._mgr, BlockManager) - new_mgr.parent = self._mgr - new_mgr.refs = [weakref.ref(self._mgr.blocks[0])] + new_mgr.blocks[0].refs = self._mgr.blocks[0].refs + new_mgr.blocks[0].refs.add_reference( + new_mgr.blocks[0] # type: ignore[arg-type] + ) return self._constructor(new_mgr).__finalize__(self, method="swapaxes") elif (copy or copy is None) and self._mgr.is_single_block: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1455b3ce86a0d..41b8e63cc7a64 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -23,7 +23,10 @@ lib, writers, ) -from pandas._libs.internals import BlockPlacement +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) from pandas._libs.missing import NA from pandas._libs.tslibs import IncompatibleFrequency from pandas._typing import ( @@ -150,6 +153,7 @@ class Block(PandasObject): values: np.ndarray | ExtensionArray ndim: int + refs: BlockValuesRefs __init__: Callable __slots__ = () @@ -272,7 +276,8 @@ def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)(new_values, new_mgr_locs, self.ndim) + refs = self.refs if isinstance(slicer, slice) else None + return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final def getitem_block_columns( @@ -288,7 +293,7 @@ def getitem_block_columns( if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs) @final def _can_hold_element(self, element: Any) -> bool: @@ -516,9 +521,13 @@ def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: def copy(self, deep: bool = True) -> Block: """copy constructor""" values = self.values + refs: BlockValuesRefs | None if deep: values = values.copy() - return type(self)(values, placement=self._mgr_locs, ndim=self.ndim) + refs = None + else: + refs = self.refs + return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs) # --------------------------------------------------------------------- # Replace @@ -1351,6 +1360,10 @@ def delete(self, loc) -> list[Block]: new_blocks: list[Block] = [] previous_loc = -1 + # TODO(CoW): This is tricky, if parent block goes out of scope + # all split blocks are referencing each other even though they + # don't share data + refs = self.refs if self.refs.has_reference() else None for idx in loc: if idx == previous_loc + 1: @@ -1361,7 +1374,9 @@ def delete(self, loc) -> list[Block]: # argument type "Tuple[slice, slice]" values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa locs = mgr_locs_arr[previous_loc + 1 : idx] - nb = type(self)(values, placement=BlockPlacement(locs), ndim=self.ndim) + nb = type(self)( + values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs + ) new_blocks.append(nb) previous_loc = idx @@ -1818,7 +1833,7 @@ def getitem_block_index(self, slicer: slice) -> ExtensionBlock: # GH#42787 in principle this is equivalent to values[..., slicer], but we don't # require subclasses of ExtensionArray to support that form (for now). new_values = self.values[slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) def diff(self, n: int, axis: AxisInt = 1) -> list[Block]: # only reached with ndim == 2 and axis == 1 @@ -2151,7 +2166,9 @@ def new_block_2d(values: ArrayLike, placement: BlockPlacement): return klass(values, ndim=2, placement=placement) -def new_block(values, placement, *, ndim: int) -> Block: +def new_block( + values, placement, *, ndim: int, refs: BlockValuesRefs | None = None +) -> Block: # caller is responsible for ensuring values is NOT a PandasArray if not isinstance(placement, BlockPlacement): @@ -2162,7 +2179,7 @@ def new_block(values, placement, *, ndim: int) -> Block: klass = get_block_type(values.dtype) values = maybe_coerce_values(values) - return klass(values, ndim=ndim, placement=placement) + return klass(values, ndim=ndim, placement=placement, refs=refs) def check_ndim(values, placement: BlockPlacement, ndim: int) -> None: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d46b51a2ee954..bedd4d92a1ea3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -7,7 +7,6 @@ Sequence, cast, ) -import weakref import numpy as np @@ -62,10 +61,7 @@ ensure_block_shape, new_block_2d, ) -from pandas.core.internals.managers import ( - BlockManager, - using_copy_on_write, -) +from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: from pandas import Index @@ -271,8 +267,6 @@ def _concat_managers_axis0( offset = 0 blocks = [] - refs: list[weakref.ref | None] = [] - parents: list = [] for i, mgr in enumerate(mgrs): # If we already reindexed, then we definitely don't need another copy made_copy = had_reindexers[i] @@ -289,17 +283,9 @@ def _concat_managers_axis0( nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) - if not made_copy and not copy and using_copy_on_write(): - refs.extend([weakref.ref(blk) for blk in mgr.blocks]) - parents.append(mgr) - elif using_copy_on_write(): - refs.extend([None] * len(mgr.blocks)) - offset += len(mgr.items) - result_parents = parents if parents else None - result_ref = refs if refs else None - result = BlockManager(tuple(blocks), axes, parent=result_parents, refs=result_ref) + result = BlockManager(tuple(blocks), axes) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dd1394ede32be..d3e406c25e455 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -11,7 +11,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -146,8 +145,6 @@ class BaseBlockManager(DataManager): _blklocs: npt.NDArray[np.intp] blocks: tuple[Block, ...] axes: list[Index] - refs: list[weakref.ref | None] | None - parent: object @property def ndim(self) -> int: @@ -156,17 +153,11 @@ def ndim(self) -> int: _known_consolidated: bool _is_consolidated: bool - def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None: + def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: raise NotImplementedError @classmethod - def from_blocks( - cls: type_t[T], - blocks: list[Block], - axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, - ) -> T: + def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T: raise NotImplementedError @property @@ -254,19 +245,7 @@ def _has_no_reference_block(self, blkno: int) -> bool: (whether it references another array or is itself being referenced) Returns True if the block has no references. """ - # TODO(CoW) include `or self.refs[blkno]() is None` ? - return ( - self.refs is None or self.refs[blkno] is None - ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 - - def _clear_reference_block(self, blkno: int) -> None: - """ - Clear any reference for column `i`. - """ - if self.refs is not None: - self.refs[blkno] = None - if com.all_none(*self.refs): - self.parent = None + return not self.blocks[blkno].refs.has_reference() def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) @@ -598,23 +577,17 @@ def _combine( # TODO(CoW) we could optimize here if we know that the passed blocks # are fully "owned" (eg created from an operation, not coming from # an existing manager) - new_refs: list[weakref.ref | None] | None = None if copy else [] for b in blocks: nb = b.copy(deep=copy) nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) - if not copy: - # None has no attribute "append" - new_refs.append(weakref.ref(b)) # type: ignore[union-attr] axes = list(self.axes) if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks( - new_blocks, axes, new_refs, parent=None if copy else self - ) + return type(self).from_blocks(new_blocks, axes) @property def nblocks(self) -> int: @@ -654,17 +627,7 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) - new_refs: list[weakref.ref | None] | None - if deep: - new_refs = None - parent = None - else: - new_refs = [weakref.ref(blk) for blk in self.blocks] - parent = self - res.axes = new_axes - res.refs = new_refs - res.parent = parent if self.ndim > 1: # Avoid needing to re-compute these @@ -688,7 +651,7 @@ def consolidate(self: T) -> T: if self.is_consolidated(): return self - bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False) + bm = type(self)(self.blocks, self.axes, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -750,13 +713,12 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks, new_refs = self._slice_take_blocks_ax0( + new_blocks = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice, use_na_proxy=use_na_proxy, ) - parent = None if com.all_none(*new_refs) else self else: new_blocks = [ blk.take_nd( @@ -768,13 +730,11 @@ def reindex_indexer( ) for blk in self.blocks ] - new_refs = None - parent = None new_axes = list(self.axes) new_axes[axis] = new_axis - new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent) + new_mgr = type(self).from_blocks(new_blocks, new_axes) if axis == 1: # We can avoid the need to rebuild these new_mgr._blknos = self.blknos.copy() @@ -788,7 +748,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, - ) -> tuple[list[Block], list[weakref.ref | None]]: + ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -821,11 +781,9 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [], [] + return [] bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [ - weakref.ref(blk) - ] + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -839,9 +797,7 @@ def _slice_take_blocks_ax0( ) for i, ml in enumerate(slobj) ] - # We have - # all(np.shares_memory(nb.values, blk.values) for nb in blocks) - return blocks, [weakref.ref(blk)] * len(blocks) + return blocks else: bp = BlockPlacement(slice(0, sllen)) return [ @@ -851,7 +807,7 @@ def _slice_take_blocks_ax0( new_mgr_locs=bp, fill_value=fill_value, ) - ], [None] + ] if sl_type == "slice": blknos = self.blknos[slobj] @@ -867,7 +823,6 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] - refs: list[weakref.ref | None] = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: @@ -880,7 +835,6 @@ def _slice_take_blocks_ax0( use_na_proxy=use_na_proxy, ) ) - refs.append(None) else: blk = self.blocks[blkno] @@ -894,7 +848,6 @@ def _slice_take_blocks_ax0( newblk = blk.copy(deep=False) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) blocks.append(newblk) - refs.append(weakref.ref(blk)) else: # GH#32779 to avoid the performance penalty of copying, @@ -907,7 +860,6 @@ def _slice_take_blocks_ax0( if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) - refs.append(weakref.ref(blk)) elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -917,13 +869,11 @@ def _slice_take_blocks_ax0( nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) - refs.append(weakref.ref(blk)) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) blocks.append(nb) - refs.append(None) - return blocks, refs + return blocks def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -1008,8 +958,6 @@ def __init__( self, blocks: Sequence[Block], axes: Sequence[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, verify_integrity: bool = True, ) -> None: @@ -1041,28 +989,13 @@ def _verify_integrity(self) -> None: f"block items\n# manager items: {len(self.items)}, # " f"tot_items: {tot_items}" ) - if self.refs is not None: - if len(self.refs) != len(self.blocks): - raise AssertionError( - "Number of passed refs must equal the number of blocks: " - f"{len(self.refs)} refs vs {len(self.blocks)} blocks." - "\nIf you see this error, please report a bug at " - "https://github.com/pandas-dev/pandas/issues" - ) @classmethod - def from_blocks( - cls, - blocks: list[Block], - axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, - ) -> BlockManager: + def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ - parent = parent if using_copy_on_write() else None - return cls(blocks, axes, refs, parent, verify_integrity=False) + return cls(blocks, axes, verify_integrity=False) # ---------------------------------------------------------------- # Indexing @@ -1081,10 +1014,14 @@ def fast_xs(self, loc: int) -> SingleBlockManager: """ if len(self.blocks) == 1: result = self.blocks[0].iget((slice(None), loc)) - block = new_block(result, placement=slice(0, len(result)), ndim=1) # in the case of a single block, the new block is a view - ref = weakref.ref(self.blocks[0]) - return SingleBlockManager(block, self.axes[0], [ref], parent=self) + block = new_block( + result, + placement=slice(0, len(result)), + ndim=1, + refs=self.blocks[0].refs, + ) + return SingleBlockManager(block, self.axes[0]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -1127,10 +1064,10 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) - nb = type(block)(values, placement=bp, ndim=1) - ref = weakref.ref(block) if track_ref else None - parent = self if track_ref else None - return SingleBlockManager(nb, self.axes[1], [ref], parent) + nb = type(block)( + values, placement=bp, ndim=1, refs=block.refs if track_ref else None + ) + return SingleBlockManager(nb, self.axes[1]) def iget_values(self, i: int) -> ArrayLike: """ @@ -1262,7 +1199,7 @@ def value_getitem(placement): self._iset_split_block(blkno_l, blk_locs) if len(removed_blknos): - # Remove blocks & update blknos and refs accordingly + # Remove blocks & update blknos accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1273,18 +1210,14 @@ def value_getitem(placement): self.blocks = tuple( blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) ) - if self.refs is not None: - self.refs = [ - ref - for i, ref in enumerate(self.refs) - if i not in set(removed_blknos) - ] if unfit_val_locs: unfit_idxr = np.concatenate(unfit_mgr_locs) unfit_count = len(unfit_idxr) new_blocks: list[Block] = [] + # TODO(CoW) is this always correct to assume that the new_blocks + # are not referencing anything else? if value_is_extension_type: # This code (ab-)uses the fact that EA blocks contain only # one item. @@ -1315,10 +1248,6 @@ def value_getitem(placement): self._blklocs[unfit_idxr] = np.arange(unfit_count) self.blocks += tuple(new_blocks) - # TODO(CoW) is this always correct to assume that the new_blocks - # are not referencing anything else? - if self.refs is not None: - self.refs = list(self.refs) + [None] * len(new_blocks) # Newly created block's dtype may already be present. self._known_consolidated = False @@ -1355,13 +1284,6 @@ def _iset_split_block( first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) - if self.refs is not None: - self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) - - if value is not None: - # Only clear if we set new values - self._clear_reference_block(blkno_l) - nr_blocks = len(self.blocks) blocks_tup = ( self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup @@ -1391,7 +1313,6 @@ def _iset_single( if using_copy_on_write() and not self._has_no_reference_block(blkno): # perform Copy-on-Write and clear the reference copy = True - self._clear_reference_block(blkno) iloc = self.blklocs[loc] blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return @@ -1400,7 +1321,6 @@ def _iset_single( old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks - self._clear_reference_block(blkno) return def column_setitem( @@ -1418,7 +1338,6 @@ def column_setitem( blocks = list(self.blocks) blocks[blkno] = blocks[blkno].copy() self.blocks = tuple(blocks) - self._clear_reference_block(blkno) # this manager is only created temporarily to mutate the values in place # so don't track references, otherwise the `setitem` would perform CoW again @@ -1452,6 +1371,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: value = ensure_block_shape(value, ndim=self.ndim) bp = BlockPlacement(slice(loc, loc + 1)) + # TODO(CoW) do we always "own" the passed `value`? block = new_block_2d(values=value, placement=bp) if not len(self.blocks): @@ -1464,9 +1384,6 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: self.axes[0] = new_axis self.blocks += (block,) - # TODO(CoW) do we always "own" the passed `value`? - if self.refs is not None: - self.refs += [None] self._known_consolidated = False @@ -1520,12 +1437,10 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - # TODO this might not be needed (can a delete ever be done in chained manner?) - parent = None if com.all_none(*new_refs) else self - return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False) + return type(self)(tuple(nbs), axes, verify_integrity=False) # ---------------------------------------------------------------- # Block-wise Operation @@ -1872,10 +1787,7 @@ def _consolidate_inplace(self) -> None: # the DataFrame's _item_cache. The exception is for newly-created # BlockManager objects not yet attached to a DataFrame. if not self.is_consolidated(): - if self.refs is None: - self.blocks = _consolidate(self.blocks) - else: - self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs) + self.blocks = _consolidate(self.blocks) self._is_consolidated = True self._known_consolidated = True self._rebuild_blknos_and_blklocs() @@ -1897,8 +1809,6 @@ def __init__( self, block: Block, axis: Index, - refs: list[weakref.ref | None] | None = None, - parent: object = None, verify_integrity: bool = False, ) -> None: # Assertions disabled for performance @@ -1907,25 +1817,19 @@ def __init__( self.axes = [axis] self.blocks = (block,) - self.refs = refs - self.parent = parent if using_copy_on_write() else None @classmethod def from_blocks( cls, blocks: list[Block], axes: list[Index], - refs: list[weakref.ref | None] | None = None, - parent: object = None, ) -> SingleBlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ assert len(blocks) == 1 assert len(axes) == 1 - if refs is not None: - assert len(refs) == 1 - return cls(blocks[0], axes[0], refs, parent, verify_integrity=False) + return cls(blocks[0], axes[0], verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: @@ -1942,13 +1846,9 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: blk = self.blocks[0] arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) - new_blk = type(blk)(arr, placement=bp, ndim=2) + new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) axes = [columns, self.axes[0]] - refs: list[weakref.ref | None] = [weakref.ref(blk)] - parent = self if using_copy_on_write() else None - return BlockManager( - [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False - ) + return BlockManager([new_blk], axes=axes, verify_integrity=False) def _has_no_reference(self, i: int = 0) -> bool: """ @@ -1956,9 +1856,7 @@ def _has_no_reference(self, i: int = 0) -> bool: (whether it references another array or is itself being referenced) Returns True if the column has no references. """ - return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount( - self.blocks[0] - ) == 0 + return not self.blocks[0].refs.has_reference() def __getstate__(self): block_values = [b.values for b in self.blocks] @@ -2026,19 +1924,18 @@ def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager: and com.is_bool_indexer(indexer) and indexer.all() ): - return type(self)(blk, self.index, [weakref.ref(blk)], parent=self) + return type(self)(blk.copy(deep=False), self.index) array = blk._slice(indexer) if array.ndim > 1: # This will be caught by Series._get_values raise ValueError("dimension-expanding indexing not allowed") bp = BlockPlacement(slice(0, len(array))) - block = type(blk)(array, placement=bp, ndim=1) + # TODO(CoW) in theory only need to track reference if new_array is a view + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) new_idx = self.index[indexer] - # TODO(CoW) in theory only need to track reference if new_array is a view - ref = weakref.ref(blk) - return type(self)(block, new_idx, [ref], parent=self) + return type(self)(block, new_idx) def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: # Assertion disabled for performance @@ -2049,11 +1946,11 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) bp = BlockPlacement(slice(0, len(array))) - block = type(blk)(array, placement=bp, ndim=1) - new_index = self.index._getitem_slice(slobj) # TODO this method is only used in groupby SeriesSplitter at the moment, - # so passing refs / parent is not yet covered by the tests - return type(self)(block, new_index, [weakref.ref(blk)], parent=self) + # so passing refs is not yet covered by the tests + block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + new_index = self.index._getitem_slice(slobj) + return type(self)(block, new_index) @property def index(self) -> Index: @@ -2099,8 +1996,6 @@ def setitem_inplace(self, indexer, value) -> None: """ if using_copy_on_write() and not self._has_no_reference(0): self.blocks = (self._block.copy(),) - self.refs = None - self.parent = None self._cache.clear() super().setitem_inplace(indexer, value) @@ -2115,9 +2010,6 @@ def idelete(self, indexer) -> SingleBlockManager: self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) self._cache.clear() - # clear reference since delete always results in a new array - self.refs = None - self.parent = None return self def fast_xs(self, loc): @@ -2337,31 +2229,6 @@ def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: return tuple(new_blocks) -def _consolidate_with_refs( - blocks: tuple[Block, ...], refs -) -> tuple[tuple[Block, ...], list[weakref.ref | None]]: - """ - Merge blocks having same dtype, exclude non-consolidating blocks, handling - refs - """ - gkey = lambda x: x[0]._consolidate_key - grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey) - - new_blocks: list[Block] = [] - new_refs: list[weakref.ref | None] = [] - for (_can_consolidate, dtype), group_blocks_refs in grouper: - group_blocks, group_refs = list(zip(*list(group_blocks_refs))) - merged_blocks, consolidated = _merge_blocks( - list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate - ) - new_blocks = extend_blocks(merged_blocks, new_blocks) - if consolidated: - new_refs.extend([None]) - else: - new_refs.extend(group_refs) - return tuple(new_blocks), new_refs - - def _merge_blocks( blocks: list[Block], dtype: DtypeObj, can_consolidate: bool ) -> tuple[list[Block], bool]: diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 477dc98aa2b2b..24fc51a96d9df 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -36,7 +36,7 @@ def _iter_block_pairs( left_ea = blk_vals.ndim == 1 - rblks, _ = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) + rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) # Assertions are disabled for performance, but should hold: # if left_ea: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index f8220649bf890..43e0b77a90a85 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -14,7 +14,6 @@ cast, overload, ) -import weakref import numpy as np @@ -551,8 +550,9 @@ def __init__( obj = sample._constructor(obj, columns=[name], copy=False) if using_copy_on_write(): # TODO(CoW): Remove when ref tracking in constructors works - obj._mgr.parent = original_obj # type: ignore[union-attr] - obj._mgr.refs = [weakref.ref(original_obj._mgr.blocks[0])] # type: ignore[union-attr] # noqa: E501 + for i, block in enumerate(original_obj._mgr.blocks): # type: ignore[union-attr] # noqa + obj._mgr.blocks[i].refs = block.refs # type: ignore[union-attr] # noqa + obj._mgr.blocks[i].refs.add_reference(obj._mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa obj.columns = [new_name] @@ -612,13 +612,9 @@ def get_result(self): typ=get_option("mode.data_manager"), ) if using_copy_on_write() and not self.copy: - parents = [obj._mgr for obj in self.objs] - mgr.parent = parents # type: ignore[union-attr] - refs = [ - weakref.ref(obj._mgr.blocks[0]) # type: ignore[union-attr] - for obj in self.objs - ] - mgr.refs = refs # type: ignore[union-attr] + for i, obj in enumerate(self.objs): + mgr.blocks[i].refs = obj._mgr.blocks[0].refs # type: ignore[union-attr] # noqa + mgr.blocks[i].refs.add_reference(mgr.blocks[i]) # type: ignore[arg-type, union-attr] # noqa df = cons(mgr, copy=False) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index f8a57b67f29fb..1b709a8aa8076 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -20,7 +20,7 @@ def test_series_from_series(dtype, using_copy_on_write): assert np.shares_memory(ser.values, result.values) if using_copy_on_write: - assert result._mgr.refs is not None + assert result._mgr.blocks[0].refs.has_reference() if using_copy_on_write: # mutating new series copy doesn't mutate original @@ -74,4 +74,4 @@ def test_series_from_series_with_reindex(using_copy_on_write): result = Series(ser, index=[0, 1, 2, 3]) assert not np.shares_memory(ser.values, result.values) if using_copy_on_write: - assert result._mgr.refs is None or result._mgr.refs[0] is None + assert not result._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py new file mode 100644 index 0000000000000..204e26b35d680 --- /dev/null +++ b/pandas/tests/copy_view/test_core_functionalities.py @@ -0,0 +1,88 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_assigning_to_same_variable_removes_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df = df.reset_index() + if using_copy_on_write: + assert df._mgr._has_no_reference(1) + arr = get_array(df, "a") + df.iloc[0, 1] = 100 # Write into a + + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_dont_track_unnecessary_references(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + + df["b"] = 100 + arr = get_array(df, "a") + # We split the block in setitem, if we are not careful the new blocks will + # reference each other triggering a copy + df.iloc[0, 0] = 100 + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_setitem_with_view_copies(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + expected = df.copy() + + df["b"] = 100 + arr = get_array(df, "a") + df.iloc[0, 0] = 100 # Check that we correctly track reference + if using_copy_on_write: + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(view, expected) + + +def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) + view = df[:] + + df["b"] = 100 + arr = get_array(df, "a") + view = None # noqa + df.iloc[0, 0] = 100 + if using_copy_on_write: + # Setitem split the block. Since the old block shared data with view + # all the new blocks are referencing view and each other. When view + # goes out of scope, they don't share data with any other block, + # so we should not trigger a copy + mark = pytest.mark.xfail( + reason="blk.delete does not track references correctly" + ) + request.node.add_marker(mark) + assert np.shares_memory(arr, get_array(df, "a")) + + +def test_out_of_scope(using_copy_on_write): + def func(): + df = DataFrame({"a": [1, 2], "b": 1.5, "c": 1}) + # create some subset + result = df[["a", "b"]] + return result + + result = func() + if using_copy_on_write: + assert not result._mgr.blocks[0].refs.has_reference() + assert not result._mgr.blocks[1].refs.has_reference() + + +def test_delete(using_copy_on_write): + df = DataFrame(np.random.randn(4, 3), columns=["a", "b", "c"]) + del df["b"] + if using_copy_on_write: + # TODO: This should not have references, delete makes a shallow copy + # but keeps the blocks alive + assert df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + + df = df[["a"]] + if using_copy_on_write: + assert not df._mgr.blocks[0].refs.has_reference() diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 506ae7d3465c5..67022e533dbc4 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -20,52 +20,32 @@ def test_consolidate(using_copy_on_write): subset = df[:] # each block of subset references a block of df - assert subset._mgr.refs is not None and all( - ref is not None for ref in subset._mgr.refs - ) + assert all(blk.refs.has_reference() for blk in subset._mgr.blocks) # consolidate the two int64 blocks subset._consolidate_inplace() # the float64 block still references the parent one because it still a view - assert subset._mgr.refs[0] is not None + assert subset._mgr.blocks[0].refs.has_reference() # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) # but avoids caching df["b"] assert np.shares_memory(get_array(df, "b"), get_array(subset, "b")) # the new consolidated int64 block does not reference another - assert subset._mgr.refs[1] is None + assert not subset._mgr.blocks[1].refs.has_reference() # the parent dataframe now also only is linked for the float column - assert df._mgr._has_no_reference(0) - assert not df._mgr._has_no_reference(1) - assert df._mgr._has_no_reference(2) + assert not df._mgr.blocks[0].refs.has_reference() + assert df._mgr.blocks[1].refs.has_reference() + assert not df._mgr.blocks[2].refs.has_reference() # and modifying subset still doesn't modify parent if using_copy_on_write: subset.iloc[0, 1] = 0.0 - assert df._mgr._has_no_reference(1) + assert not df._mgr.blocks[1].refs.has_reference() assert df.loc[0, "b"] == 0.1 -@td.skip_array_manager_invalid_test -def test_clear_parent(using_copy_on_write): - # ensure to clear parent reference if we are no longer viewing data from parent - if not using_copy_on_write: - pytest.skip("test only relevant when using copy-on-write") - - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - subset = df[:] - assert subset._mgr.parent is not None - - # replacing existing columns loses the references to the parent df - subset["a"] = 0 - assert subset._mgr.parent is not None - # when losing the last reference, also the parent should be reset - subset["b"] = 0 - assert subset._mgr.parent is None - - @pytest.mark.single_cpu @td.skip_array_manager_invalid_test def test_switch_options(): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 833e4feadf27e..59645c99d9929 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -24,7 +24,8 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is None + assert not df_copy._mgr.blocks[0].refs.has_reference() + assert not df_copy._mgr.blocks[1].refs.has_reference() # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 @@ -38,7 +39,8 @@ def test_copy_shallow(using_copy_on_write): # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is not None + assert df_copy._mgr.blocks[0].refs.has_reference() + assert df_copy._mgr.blocks[1].refs.has_reference() if using_copy_on_write: # mutating shallow copy doesn't mutate original From 31557be2c799993092e884cf0ad40fdc34783f97 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 7 Feb 2023 21:47:28 +0100 Subject: [PATCH 14/23] Refactor for new cow mechanism and move test --- pandas/core/dtypes/astype.py | 20 ++++++------- pandas/core/internals/blocks.py | 28 +++++++++++-------- pandas/core/internals/managers.py | 20 +++++-------- pandas/tests/arrays/test_array.py | 11 -------- pandas/tests/copy_view/test_methods.py | 6 ++-- .../tests/extension/decimal/test_decimal.py | 11 ++++++++ 6 files changed, 47 insertions(+), 49 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 0bac9b463939b..22163fad63339 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -268,24 +268,22 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: # Only equal numpy dtypes avoid a copy return False - if is_string_dtype(dtype) and is_string_dtype(new_dtype): + elif is_string_dtype(dtype) and is_string_dtype(new_dtype): return True elif is_object_dtype(dtype) and new_dtype.kind == "O": # When the underlying array has dtype object, we don't have to make a copy return True - elif is_string_dtype(dtype) or is_string_dtype(new_dtype): - return False - elif dtype.kind in "mM" and new_dtype.kind in "mM": return True - elif getattr(dtype, "numpy_dtype", dtype) == getattr( - new_dtype, "numpy_dtype", new_dtype - ): - # If underlying numpy dtype is the same, no copy is made, e.g. - # int64 -> Int64 or int64[pyarrow] - return True + numpy_dtype = getattr(dtype, "numpy_dtype", None) + new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None) + + if numpy_dtype is not None and new_numpy_dtype is not None: + # if both have NumPy dtype then they are only views if they are equal + return numpy_dtype == new_numpy_dtype - return False + # Assume this is a view since we don't know for sure if a copy was made + return True diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 41b8e63cc7a64..273e2211f1325 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,12 +11,9 @@ cast, final, ) -import weakref import numpy as np -from pandas._config import using_copy_on_write - from pandas._libs import ( Timestamp, internals as libinternals, @@ -162,7 +159,6 @@ class Block(PandasObject): is_extension = False _can_consolidate = True _validate_ndim = True - _ref = None @final @cache_readonly @@ -214,7 +210,9 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None: self._mgr_locs = new_mgr_locs @final - def make_block(self, values, placement=None) -> Block: + def make_block( + self, values, placement=None, refs: BlockValuesRefs | None = None + ) -> Block: """ Create a new block, with type inference propagate any values that are not specified @@ -226,7 +224,7 @@ def make_block(self, values, placement=None) -> Block: # TODO: perf by not going through new_block # We assume maybe_coerce_values has already been called - return new_block(values, placement=placement, ndim=self.ndim) + return new_block(values, placement=placement, ndim=self.ndim, refs=refs) @final def make_block_same_class( @@ -476,7 +474,11 @@ def dtype(self) -> DtypeObj: @final def astype( - self, dtype: DtypeObj, copy: bool = False, errors: IgnoreRaise = "raise" + self, + dtype: DtypeObj, + copy: bool = False, + errors: IgnoreRaise = "raise", + using_cow: bool = False, ) -> Block: """ Coerce to the new dtype. @@ -489,6 +491,8 @@ def astype( errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object + using_cow: bool, default False + Signaling if copy on write copy logic is used. Returns ------- @@ -499,16 +503,18 @@ def astype( new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) new_values = maybe_coerce_values(new_values) - newb = self.make_block(new_values) + + refs = None + if using_cow and astype_is_view(values.dtype, new_values.dtype): + refs = self.refs + + newb = self.make_block(new_values, refs=refs) if newb.shape != self.shape: raise TypeError( f"cannot set astype for copy = [{copy}] for dtype " f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) - if using_copy_on_write(): - if astype_is_view(values.dtype, new_values.dtype): - newb._ref = weakref.ref(self) return newb @final diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d3e406c25e455..4993d50603012 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -422,25 +422,19 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: ) def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T: - return self._apply_and_handle_refs( - "astype", copy=copy, dtype=dtype, errors=errors - ) - - def _apply_and_handle_refs(self, func: str, copy: bool | None, **kwargs): if copy is None: if using_copy_on_write(): copy = False else: copy = True - result = self.apply(func, copy=copy, **kwargs) - - if using_copy_on_write() and not copy: - refs = [blk._ref for blk in result.blocks] - if any(ref is not None for ref in refs): - result.refs = refs - result.parent = self - return result + return self.apply( + "astype", + dtype=dtype, + copy=copy, + errors=errors, + using_cow=using_copy_on_write(), + ) def convert(self: T, copy: bool) -> T: return self.apply( diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ff3a345f92c80..59e5c6fa2dda3 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -429,14 +429,3 @@ def test_array_to_numpy_na(): result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) - - -def test_array_copy_on_write(using_copy_on_write): - df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object") - df2 = df.astype(DecimalDtype()) - df.iloc[0, 0] = 0 - if using_copy_on_write: - expected = pd.DataFrame( - {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() - ) - tm.assert_equal(df2.values, expected.values) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 59645c99d9929..af86c81eb0de7 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under6p0 +from pandas.compat import pa_version_under7p0 from pandas import ( DataFrame, @@ -556,7 +556,7 @@ def test_astype_single_dtype(using_copy_on_write): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): - if new_dtype == "int64[pyarrow]" and pa_version_under6p0: + if new_dtype == "int64[pyarrow]" and pa_version_under7p0: pytest.skip("pyarrow not installed") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) df_orig = df.copy() @@ -581,7 +581,7 @@ def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): @pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) def test_astype_different_target_dtype(using_copy_on_write, dtype): - if dtype == "int32[pyarrow]" and pa_version_under6p0: + if dtype == "int32[pyarrow]" and pa_version_under7p0: pytest.skip("pyarrow not installed") df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index cdacbf9c5bb5a..90f89d71b15a9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -483,3 +483,14 @@ def test_to_numpy_keyword(): result = pd.Series(a).to_numpy(decimals=2) tm.assert_numpy_array_equal(result, expected) + + +def test_array_copy_on_write(using_copy_on_write): + df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object") + df2 = df.astype(DecimalDtype()) + df.iloc[0, 0] = 0 + if using_copy_on_write: + expected = pd.DataFrame( + {"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype() + ) + tm.assert_equal(df2.values, expected.values) From 031aaa51a3b8163b0ced8fdd95df224eac7afa1b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 10:17:52 +0100 Subject: [PATCH 15/23] Remove merge problem --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0ccfea2c93752..6608720e3b6bc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -271,7 +271,6 @@ def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block: new_values = self._slice(slicer) refs = self.refs if isinstance(slicer, slice) else None - refs = self.refs if isinstance(slicer, slice) else None return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs) @final From 174e2d3bd34ac6b368bdfd04c736f891c668307f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 10:19:30 +0100 Subject: [PATCH 16/23] Add comment --- pandas/core/dtypes/astype.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 22163fad63339..64016e47acc0a 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -269,6 +269,7 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: return False elif is_string_dtype(dtype) and is_string_dtype(new_dtype): + # Potentially! a copy when converting from object to string return True elif is_object_dtype(dtype) and new_dtype.kind == "O": From 876d293360ab34a8c8e79fdfcc60e21ce2a71fc5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 8 Feb 2023 15:34:35 +0100 Subject: [PATCH 17/23] Update pandas/core/dtypes/astype.py Co-authored-by: Joris Van den Bossche --- pandas/core/dtypes/astype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 64016e47acc0a..331f539b2fc5a 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -259,7 +259,7 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: Returns ------- - True if new data is a view, False otherwise + True if new data is a view or not guaranteed to be a copy, False otherwise """ if dtype == new_dtype: return True From ff37e58e55e416b403fc2a2cf1681dff9b0a04ce Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 15:42:19 +0100 Subject: [PATCH 18/23] Check numpy non numpy conversion better --- pandas/core/dtypes/astype.py | 6 ++ pandas/tests/copy_view/test_astype.py | 144 +++++++++++++++++++++++++ pandas/tests/copy_view/test_methods.py | 134 ----------------------- 3 files changed, 150 insertions(+), 134 deletions(-) create mode 100644 pandas/tests/copy_view/test_astype.py diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 331f539b2fc5a..32b28192c3ba2 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -282,6 +282,12 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: numpy_dtype = getattr(dtype, "numpy_dtype", None) new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None) + if numpy_dtype is None and isinstance(dtype, np.dtype): + numpy_dtype = dtype + + if new_numpy_dtype is None and isinstance(new_dtype, np.dtype): + numpy_dtype = new_dtype + if numpy_dtype is not None and new_numpy_dtype is not None: # if both have NumPy dtype then they are only views if they are equal return numpy_dtype == new_numpy_dtype diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py new file mode 100644 index 0000000000000..dbf9f71ea95b0 --- /dev/null +++ b/pandas/tests/copy_view/test_astype.py @@ -0,0 +1,144 @@ +import numpy as np +import pytest + +from pandas.compat import pa_version_under7p0 + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + + +def test_astype_single_dtype(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) + df_orig = df.copy() + df2 = df.astype("float64") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype("float64") + df.iloc[0, 2] = 5.5 + tm.assert_frame_equal(df2, df_orig.astype("float64")) + + +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) +def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): + if new_dtype == "int64[pyarrow]" and pa_version_under7p0: + pytest.skip("pyarrow not installed") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype(new_dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(new_dtype)) + + +@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) +def test_astype_different_target_dtype(using_copy_on_write, dtype): + if dtype == "int32[pyarrow]" and pa_version_under7p0: + pytest.skip("pyarrow not installed") + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.astype(dtype) + + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert df2._mgr._has_no_reference(0) + + df2.iloc[0, 0] = 5 + tm.assert_frame_equal(df, df_orig) + + # mutating parent also doesn't update result + df2 = df.astype(dtype) + df.iloc[0, 0] = 100 + tm.assert_frame_equal(df2, df_orig.astype(dtype)) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = "x" + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype, new_dtype", [("object", "string"), ("string", "object")] +) +def test_astype_string_and_object_update_original( + using_copy_on_write, dtype, new_dtype +): + df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) + df2 = df.astype(new_dtype) + df_orig = df2.copy() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df.iloc[0, 0] = "x" + tm.assert_frame_equal(df2, df_orig) + + +def test_astype_dict_dtypes(using_copy_on_write): + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} + ) + df_orig = df.copy() + df2 = df.astype({"a": "float64", "c": "float64"}) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + + df2.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index af86c81eb0de7..b814b9089aabd 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 - from pandas import ( DataFrame, Index, @@ -529,138 +527,6 @@ def test_to_frame(using_copy_on_write): tm.assert_frame_equal(df, expected) -def test_astype_single_dtype(using_copy_on_write): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) - df_orig = df.copy() - df2 = df.astype("float64") - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 2] = 5.5 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - tm.assert_frame_equal(df, df_orig) - - # mutating parent also doesn't update result - df2 = df.astype("float64") - df.iloc[0, 2] = 5.5 - tm.assert_frame_equal(df2, df_orig.astype("float64")) - - -@pytest.mark.parametrize("dtype", ["int64", "Int64"]) -@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) -def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): - if new_dtype == "int64[pyarrow]" and pa_version_under7p0: - pytest.skip("pyarrow not installed") - df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) - df_orig = df.copy() - df2 = df.astype(new_dtype) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 0] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - tm.assert_frame_equal(df, df_orig) - - # mutating parent also doesn't update result - df2 = df.astype(new_dtype) - df.iloc[0, 0] = 100 - tm.assert_frame_equal(df2, df_orig.astype(new_dtype)) - - -@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) -def test_astype_different_target_dtype(using_copy_on_write, dtype): - if dtype == "int32[pyarrow]" and pa_version_under7p0: - pytest.skip("pyarrow not installed") - df = DataFrame({"a": [1, 2, 3]}) - df_orig = df.copy() - df2 = df.astype(dtype) - - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - df2.iloc[0, 0] = 5 - tm.assert_frame_equal(df, df_orig) - - # mutating parent also doesn't update result - df2 = df.astype(dtype) - df.iloc[0, 0] = 100 - tm.assert_frame_equal(df2, df_orig.astype(dtype)) - - -@pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] -) -def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype): - df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) - df_orig = df.copy() - df2 = df.astype(new_dtype) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - df2.iloc[0, 0] = "x" - tm.assert_frame_equal(df, df_orig) - - -@pytest.mark.parametrize( - "dtype, new_dtype", [("object", "string"), ("string", "object")] -) -def test_astype_string_and_object_update_original( - using_copy_on_write, dtype, new_dtype -): - df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype) - df2 = df.astype(new_dtype) - df_orig = df2.copy() - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - df.iloc[0, 0] = "x" - tm.assert_frame_equal(df2, df_orig) - - -def test_astype_dict_dtypes(using_copy_on_write): - df = DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} - ) - df_orig = df.copy() - df2 = df.astype({"a": "float64", "c": "float64"}) - - if using_copy_on_write: - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 2] = 5.5 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) - - df2.iloc[0, 1] = 10 - if using_copy_on_write: - assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - tm.assert_frame_equal(df, df_orig) - - @pytest.mark.parametrize("ax", ["index", "columns"]) def test_swapaxes_noop(using_copy_on_write, ax): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) From 5492c10cb51be5b31ba02e770b5d17cac281faa4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 15:53:51 +0100 Subject: [PATCH 19/23] Add explicit timezone tests --- pandas/core/dtypes/astype.py | 4 +++- pandas/tests/copy_view/test_astype.py | 32 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 32b28192c3ba2..8fcd39a8252e8 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -277,7 +277,9 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: return True elif dtype.kind in "mM" and new_dtype.kind in "mM": - return True + # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no + # attribute "unit" + return dtype.unit == new_dtype.unit # type: ignore[union-attr] numpy_dtype = getattr(dtype, "numpy_dtype", None) new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index dbf9f71ea95b0..a01ef727c2809 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -6,6 +6,7 @@ from pandas import ( DataFrame, Series, + date_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -142,3 +143,34 @@ def test_astype_dict_dtypes(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) tm.assert_frame_equal(df, df_orig) + + +def test_astype_different_datetime_resos(using_copy_on_write): + df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")}) + result = df.astype("datetime64[ms]") + + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) + if using_copy_on_write: + assert result._mgr._has_no_reference(0) + + +def test_astype_different_timezones(using_copy_on_write): + df = DataFrame( + {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} + ) + result = df.astype("datetime64[ns, Europe/Berlin]") + if using_copy_on_write: + assert not result._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8) + + +def test_astype_different_timezones_different_reso(using_copy_on_write): + df = DataFrame( + {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")} + ) + result = df.astype("datetime64[ms, Europe/Berlin]") + if using_copy_on_write: + assert result._mgr._has_no_reference(0) + assert not np.shares_memory( + get_array(df, "a").asi8, get_array(result, "a").asi8 + ) From 620181c670bc542955e72d4a3cd2755d678e7cf6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 15:56:25 +0100 Subject: [PATCH 20/23] Adjust comment --- pandas/core/dtypes/astype.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index 8fcd39a8252e8..dff31fdbb3c03 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -269,7 +269,7 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: return False elif is_string_dtype(dtype) and is_string_dtype(new_dtype): - # Potentially! a copy when converting from object to string + # Potentially! a view when converting from object to string return True elif is_object_dtype(dtype) and new_dtype.kind == "O": @@ -291,7 +291,10 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: numpy_dtype = new_dtype if numpy_dtype is not None and new_numpy_dtype is not None: - # if both have NumPy dtype then they are only views if they are equal + # if both have NumPy dtype or one of them is a numpy dtype + # they are only a view when the numpy dtypes are equal, e.g. + # int64 -> Int64 or int64[pyarrow] + # int64 -> Int32 copies return numpy_dtype == new_numpy_dtype # Assume this is a view since we don't know for sure if a copy was made From 95a8296a2fd69a34862aca3608473fac30bd9733 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 16:04:54 +0100 Subject: [PATCH 21/23] Add to whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 70c60401f29fb..36f1a77badc95 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -224,6 +224,7 @@ Copy-on-Write improvements - :meth:`DataFrame.truncate` - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize` - :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects` + - :meth:`DataFrame.astype` / :meth:`Series.astype` - :func:`concat` These methods return views when Copy-on-Write is enabled, which provides a significant From 7640c36276d90305d64c658f53e2600a9172fe50 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 17:44:17 +0100 Subject: [PATCH 22/23] Fix tests --- pandas/core/dtypes/astype.py | 6 +++--- pandas/tests/copy_view/test_astype.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py index dff31fdbb3c03..4780ec176f6d0 100644 --- a/pandas/core/dtypes/astype.py +++ b/pandas/core/dtypes/astype.py @@ -277,9 +277,9 @@ def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool: return True elif dtype.kind in "mM" and new_dtype.kind in "mM": - # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has no - # attribute "unit" - return dtype.unit == new_dtype.unit # type: ignore[union-attr] + dtype = getattr(dtype, "numpy_dtype", dtype) + new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype) + return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None) numpy_dtype = getattr(dtype, "numpy_dtype", None) new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index a01ef727c2809..d12d6650e7434 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -6,6 +6,7 @@ from pandas import ( DataFrame, Series, + Timestamp, date_range, ) import pandas._testing as tm @@ -174,3 +175,20 @@ def test_astype_different_timezones_different_reso(using_copy_on_write): assert not np.shares_memory( get_array(df, "a").asi8, get_array(result, "a").asi8 ) + + +@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed") +def test_astype_arrow_timestamp(using_copy_on_write): + df = DataFrame( + { + "a": [ + Timestamp("2020-01-01 01:01:01.000001"), + Timestamp("2020-01-01 01:01:01.000001"), + ] + }, + dtype="M8[ns]", + ) + result = df.astype("timestamp[ns][pyarrow]") + if using_copy_on_write: + assert not result._mgr._has_no_reference(0) + assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data) From 469383d9c509be12efb7168829ea9fa5a7f774de Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Feb 2023 23:07:59 +0100 Subject: [PATCH 23/23] Fix array manager --- pandas/tests/copy_view/test_astype.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index d12d6650e7434..a485275a28ac4 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -72,7 +72,8 @@ def test_astype_different_target_dtype(using_copy_on_write, dtype): df2 = df.astype(dtype) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert df2._mgr._has_no_reference(0) + if using_copy_on_write: + assert df2._mgr._has_no_reference(0) df2.iloc[0, 0] = 5 tm.assert_frame_equal(df, df_orig)