
ENH: Add lazy copy to astype #50802

Merged — 28 commits, Feb 9, 2023
Changes from 20 commits
42 changes: 42 additions & 0 deletions pandas/core/dtypes/astype.py
@@ -26,6 +26,7 @@
is_dtype_equal,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
is_timedelta64_dtype,
pandas_dtype,
)
@@ -246,3 +247,44 @@ def astype_array_safe(
raise

return new_values


def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
"""Checks if astype avoided copying the data.

Parameters
----------
dtype : Original dtype
new_dtype : target dtype

Returns
-------
True if new data is a view, False otherwise
"""
if dtype == new_dtype:
return True

elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
# Only equal numpy dtypes avoid a copy
return False

elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
# Potentially! a copy when converting from object to string
return True
Member:

I would think this is not always guaranteed to be a view? (but of course it is safer to return True than to incorrectly assume it is always a copy)

is_string_dtype also returns True for generic "object", and converting object to string is not necessarily no-copy.

Member:

Using this branch, I see the following wrong behaviour in case you have mixed objects:

In [1]: s = pd.Series(['a', 'b', 1])

In [2]: s2 = s.astype("string")

In [3]: s.values
Out[3]: array(['a', 'b', 1], dtype=object)

In [4]: pd.options.mode.copy_on_write = True

In [5]: s = pd.Series(['a', 'b', 1])

In [6]: s2 = s.astype("string")

In [7]: s.values
Out[7]: array(['a', 'b', '1'], dtype=object)

Because no copy is taken, ensure_string_array actually mutates the original values in place.

For the case of "object -> some extension dtype" casting, we should probably always do copy=True, because I don't think we can rely on _from_sequence to do the correct thing (although for this specific case I also don't think that StringArray._from_sequence(arr, copy=False) should mutate arr in place; I would expect it to only skip the copy when it isn't needed, i.e. when it can simply view the original data and no modification needs to be done).

Member Author (@phofl, Jan 30, 2023):

I opened #51073 to address the inplace modification.

You are correct, it is not guaranteed to be a no-copy op, but it could be, and I couldn't figure out a more precise check. We can still optimise in a follow-up to make this stricter; for now I was aiming to cover as many cases as possible without making it overly complex.

Member:

> We can still optimise in a follow-up to make this stricter; for now I was aiming to cover as many cases as possible without making it overly complex.

Can you add a comment explaining the True, and noting something along those lines?


elif is_object_dtype(dtype) and new_dtype.kind == "O":
# When the underlying array has dtype object, we don't have to make a copy
return True
Comment on lines +275 to +277
Member:

Our own extension types fall in this category? (e.g. casting an object column of Period objects to period dtype, although this specific example is actually never a view)

Rather for a follow-up, would it be worth having some more custom logic here for our own EAs? Both IntervalDtype and PeriodDtype have a kind of "O", but I think neither of them can ever cast from object dtype without making a copy.

Member Author:

Yep, I'd prefer to do this as a follow-up with specific tests.


elif dtype.kind in "mM" and new_dtype.kind in "mM":
return True
Member:

Also to note for a follow-up: I assume that now that multiple resolutions are supported for datetime64, we should maybe check those, since with different units this is never a view?

Member:

Actually, I am wondering: what is the purpose of this check? Because if you cast datetime64[ns]->datetime64[ns], that's already covered by the equal dtype case. Mixing datetime and timedelta is something we disallow explicitly (numpy allows it). So this is for DatetimeTZDtype?

Member Author:

Yep, tz-aware stuff here. Removing this check causes test failures.

Member Author:

Added a bunch of additional tests and added a unit check here.
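A hypothetical sketch of such a unit check (the code actually added in the later commits is not shown in this revision of the diff, and the helper name here is illustrative only): a datetime/timedelta cast can only be a view when the resolutions agree, so for example datetime64[ns] -> datetime64[ms] always copies.

```python
import numpy as np

def datetimelike_units_match(dtype, new_dtype) -> bool:
    # Illustrative helper, not the PR's code: compare the resolution ("unit")
    # of two datetime-like dtypes. DatetimeTZDtype exposes a ``unit`` attribute;
    # plain numpy datetime64/timedelta64 dtypes go through np.datetime_data,
    # which returns e.g. ("ns", 1).
    unit = getattr(dtype, "unit", None) or np.datetime_data(dtype)[0]
    new_unit = getattr(new_dtype, "unit", None) or np.datetime_data(new_dtype)[0]
    return unit == new_unit

# datetimelike_units_match(np.dtype("M8[ns]"), np.dtype("M8[ms]")) -> False
# datetimelike_units_match(np.dtype("M8[ns]"), np.dtype("m8[ns]")) -> True (unit only; kind is checked separately above)
```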


numpy_dtype = getattr(dtype, "numpy_dtype", None)
new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)

if numpy_dtype is not None and new_numpy_dtype is not None:
# if both have NumPy dtype then they are only views if they are equal
Member:

Can you add back the part of the comment you had before, noting that this is, for example, for nullable dtypes?

Member:

Also, this will currently only work for e.g. Int64 -> Int32 (both nullable), and not catch the mixed numpy/nullable case (e.g. int64 -> Int64 (view), or int64 -> Int32 (copy)).

Member Author:

Yeah, I missed this case. Will adjust accordingly, since this should work out of the box imo.

return numpy_dtype == new_numpy_dtype

# Assume this is a view since we don't know for sure if a copy was made
return True
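As a rough illustration of the decision table above, here is how the helper classifies a few common dtype pairs at this point in the PR (a sketch only; the import path is the internal module touched by this diff, and the behaviour for some of these pairs was tightened in later commits):

```python
import numpy as np
import pandas as pd
from pandas.core.dtypes.astype import astype_is_view

# equal dtypes: trivially a view
assert astype_is_view(np.dtype("int64"), np.dtype("int64"))
# two different numpy dtypes always require a copy
assert not astype_is_view(np.dtype("int64"), np.dtype("float64"))
# object -> string is (optimistically) treated as a potential view here,
# which is the case the reviewer flags above
assert astype_is_view(np.dtype("object"), pd.StringDtype())
# datetime-like to datetime-like (e.g. tz-naive -> tz-aware) counts as a view
assert astype_is_view(np.dtype("datetime64[ns]"), pd.DatetimeTZDtype(tz="UTC"))
# nullable dtypes are compared through their numpy_dtype
assert not astype_is_view(pd.Int64Dtype(), pd.Int32Dtype())
```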
6 changes: 3 additions & 3 deletions pandas/core/generic.py
@@ -6120,7 +6120,7 @@ def dtypes(self):
return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)

def astype(
self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
) -> NDFrameT:
"""
Cast a pandas object to a specified dtype ``dtype``.
@@ -6257,7 +6257,7 @@ def astype(
for i, (col_name, col) in enumerate(self.items()):
cdt = dtype_ser.iat[i]
if isna(cdt):
res_col = col.copy() if copy else col
res_col = col.copy(deep=copy)
else:
try:
res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
@@ -6284,7 +6284,7 @@

# GH 33113: handle empty frame or series
if not results:
return self.copy()
return self.copy(deep=None)

# GH 19920: retain column metadata after concat
result = concat(results, axis=1, copy=False)
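For context on the copy=None default introduced above, here is a small sketch of the user-facing behaviour it enables (assuming a pandas build that includes this PR, with Copy-on-Write enabled):

```python
import numpy as np
import pandas as pd

pd.options.mode.copy_on_write = True

ser = pd.Series([1, 2, 3], dtype="int64")
res = ser.astype("int64")            # no-op cast: data is not copied up front

# the result initially shares memory with the original ...
assert np.shares_memory(ser.values, res.values)

# ... and the copy only happens once one of the two objects is modified
res.iloc[0] = 10
assert ser.iloc[0] == 1              # the parent is unchanged
```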
5 changes: 4 additions & 1 deletion pandas/core/internals/array_manager.py
@@ -366,7 +366,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
copy = True

return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool) -> T:
26 changes: 21 additions & 5 deletions pandas/core/internals/blocks.py
@@ -41,7 +41,10 @@
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import astype_array_safe
from pandas.core.dtypes.astype import (
astype_array_safe,
astype_is_view,
)
from pandas.core.dtypes.cast import (
LossySetitemError,
can_hold_element,
@@ -207,7 +210,9 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
self._mgr_locs = new_mgr_locs

@final
def make_block(self, values, placement=None) -> Block:
def make_block(
self, values, placement=None, refs: BlockValuesRefs | None = None
) -> Block:
"""
Create a new block, with type inference propagate any values that are
not specified
@@ -219,7 +224,7 @@ def make_block(self, values, placement=None) -> Block:

# TODO: perf by not going through new_block
# We assume maybe_coerce_values has already been called
return new_block(values, placement=placement, ndim=self.ndim)
return new_block(values, placement=placement, ndim=self.ndim, refs=refs)

@final
def make_block_same_class(
@@ -465,7 +470,11 @@ def dtype(self) -> DtypeObj:

@final
def astype(
self, dtype: DtypeObj, copy: bool = False, errors: IgnoreRaise = "raise"
self,
dtype: DtypeObj,
copy: bool = False,
errors: IgnoreRaise = "raise",
using_cow: bool = False,
) -> Block:
"""
Coerce to the new dtype.
@@ -478,6 +487,8 @@ def astype(
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
using_cow: bool, default False
Signaling if copy on write copy logic is used.

Returns
-------
@@ -488,7 +499,12 @@
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)

new_values = maybe_coerce_values(new_values)
newb = self.make_block(new_values)

refs = None
if using_cow and astype_is_view(values.dtype, new_values.dtype):
refs = self.refs

newb = self.make_block(new_values, refs=refs)
if newb.shape != self.shape:
raise TypeError(
f"cannot set astype for copy = [{copy}] for dtype "
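To illustrate what propagating ``refs`` buys at the block level (again a sketch, assuming Copy-on-Write is enabled on a build with this PR; ``_has_no_reference`` is the internal check also mentioned in the review below):

```python
import numpy as np
import pandas as pd

pd.options.mode.copy_on_write = True
df = pd.DataFrame({"a": [1, 2, 3]})

# view-like cast: the new block keeps a reference to the parent's values,
# so a later in-place modification triggers the deferred copy
df2 = df.astype("int64")
assert np.shares_memory(df["a"].values, df2["a"].values)

# real cast: the data had to be copied, so no reference should be tracked
# (otherwise we would pay for an unnecessary defensive copy later)
df3 = df.astype("float64")
assert not np.shares_memory(df["a"].values, df3["a"].values)
assert df3._mgr._has_no_reference(0)
```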
16 changes: 14 additions & 2 deletions pandas/core/internals/managers.py
@@ -421,8 +421,20 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

return self.apply(
"astype",
dtype=dtype,
copy=copy,
errors=errors,
using_cow=using_copy_on_write(),
)

def convert(self: T, copy: bool) -> T:
return self.apply(
8 changes: 5 additions & 3 deletions pandas/tests/copy_view/test_constructors.py
@@ -1,18 +1,20 @@
import numpy as np
import pytest

from pandas import Series

# -----------------------------------------------------------------------------
# Copy/view behaviour for Series / DataFrame constructors


def test_series_from_series(using_copy_on_write):
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype, using_copy_on_write):
# Case: constructing a Series from another Series object follows CoW rules:
# a new object is returned and thus mutations are not propagated
ser = Series([1, 2, 3], name="name")

# default is copy=False -> new Series is a shallow copy / view of original
result = Series(ser)
result = Series(ser, dtype=dtype)

# the shallow copy still shares memory
assert np.shares_memory(ser.values, result.values)
@@ -34,7 +36,7 @@ def test_series_from_series(using_copy_on_write):
assert np.shares_memory(ser.values, result.values)

# the same when modifying the parent
result = Series(ser)
result = Series(ser, dtype=dtype)

if using_copy_on_write:
# mutating original doesn't mutate new series
134 changes: 134 additions & 0 deletions pandas/tests/copy_view/test_methods.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0

from pandas import (
DataFrame,
Index,
@@ -527,6 +529,138 @@ def test_to_frame(using_copy_on_write):
tm.assert_frame_equal(df, expected)


def test_astype_single_dtype(using_copy_on_write):
Member:

Shall we move the astype tests to a dedicated file instead of in the middle of the other methods? My hunch is that we might need to add some more astype tests (if we specialize more for our own dtypes), and test_methods.py is already getting long.

Member Author:

Yeah sounds good

df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
df_orig = df.copy()
df2 = df.astype("float64")

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
tm.assert_frame_equal(df, df_orig)
Member:

Suggested change
tm.assert_frame_equal(df, df_orig)
tm.assert_frame_equal(df, df_orig)
# mutating parent also doesn't update result
df2 = df.astype("float64")
df.iloc[0, 2] = 5.5
tm.assert_frame_equal(df2, df_orig.astype("float64"))

We don't test this consistently for all methods here, but astype seems a sufficiently complicated case (not just based on a copy(deep=False) under the hood) that it's probably good to be complete.

(same for the ones below)

Member Author:

Yep makes sense


# mutating parent also doesn't update result
df2 = df.astype("float64")
df.iloc[0, 2] = 5.5
tm.assert_frame_equal(df2, df_orig.astype("float64"))


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
if new_dtype == "int64[pyarrow]" and pa_version_under7p0:
pytest.skip("pyarrow not installed")
df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)

# mutating parent also doesn't update result
df2 = df.astype(new_dtype)
df.iloc[0, 0] = 100
tm.assert_frame_equal(df2, df_orig.astype(new_dtype))


@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
if dtype == "int32[pyarrow]" and pa_version_under7p0:
pytest.skip("pyarrow not installed")
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()
df2 = df.astype(dtype)

assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
Member:

Suggested change
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert df2._mgr._has_no_reference(0)

Maybe also explicitly check df2._mgr._has_no_reference(0)? The shares_memory and iloc setitem tests don't ensure that we didn't incorrectly trigger CoW unnecessarily.

Member Author:

Yes, this is a very good check; it would have made sure that I did not miss the numpy dtype thing below.


df2.iloc[0, 0] = 5
tm.assert_frame_equal(df, df_orig)

# mutating parent also doesn't update result
df2 = df.astype(dtype)
df.iloc[0, 0] = 100
tm.assert_frame_equal(df2, df_orig.astype(dtype))


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df2.iloc[0, 0] = "x"
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object_update_original(
using_copy_on_write, dtype, new_dtype
):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df2 = df.astype(new_dtype)
df_orig = df2.copy()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df.iloc[0, 0] = "x"
tm.assert_frame_equal(df2, df_orig)


def test_astype_dict_dtypes(using_copy_on_write):
df = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
)
df_orig = df.copy()
df2 = df.astype({"a": "float64", "c": "float64"})

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

df2.iloc[0, 1] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize("ax", ["index", "columns"])
def test_swapaxes_noop(using_copy_on_write, ax):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
11 changes: 11 additions & 0 deletions pandas/tests/extension/decimal/test_decimal.py
@@ -483,3 +483,14 @@ def test_to_numpy_keyword():

result = pd.Series(a).to_numpy(decimals=2)
tm.assert_numpy_array_equal(result, expected)


def test_array_copy_on_write(using_copy_on_write):
df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object")
df2 = df.astype(DecimalDtype())
df.iloc[0, 0] = 0
if using_copy_on_write:
expected = pd.DataFrame(
{"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype()
)
tm.assert_equal(df2.values, expected.values)
7 changes: 5 additions & 2 deletions pandas/tests/series/test_constructors.py
@@ -873,13 +873,16 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp
with pytest.raises(IntCastingNaNError, match=msg):
Series(np.array(vals), dtype=any_int_numpy_dtype)

def test_constructor_dtype_no_cast(self):
def test_constructor_dtype_no_cast(self, using_copy_on_write):
# see gh-1572
s = Series([1, 2, 3])
s2 = Series(s, dtype=np.int64)

s2[1] = 5
assert s[1] == 5
if using_copy_on_write:
assert s[1] == 2
else:
assert s[1] == 5
Member:

Hmm, so this wasn't yet covered when updating the Series(series) constructor (#49524). Could you add an explicit test for copy/view behaviour with a case like the above to copy_view/test_constructors.py?

Member:

Just parametrizing the first test there should be sufficient to cover this case:

diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
index c04c733e5e..7793c6cad5 100644
--- a/pandas/tests/copy_view/test_constructors.py
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -1,3 +1,5 @@
+import pytest
+
 import numpy as np
 
 from pandas import Series
@@ -6,13 +8,14 @@ from pandas import Series
 # Copy/view behaviour for Series / DataFrame constructors
 
 
-def test_series_from_series(using_copy_on_write):
+@pytest.mark.parametrize("dtype", [None, "int64"])
+def test_series_from_series(dtype, using_copy_on_write):
     # Case: constructing a Series from another Series object follows CoW rules:
     # a new object is returned and thus mutations are not propagated
     ser = Series([1, 2, 3], name="name")
 
     # default is copy=False -> new Series is a shallow copy / view of original
-    result = Series(ser)
+    result = Series(ser, dtype=dtype)
 
     # the shallow copy still shares memory
     assert np.shares_memory(ser.values, result.values)
@@ -34,7 +37,7 @@ def test_series_from_series(using_copy_on_write):
         assert np.shares_memory(ser.values, result.values)
 
     # the same when modifying the parent
-    result = Series(ser)
+    result = Series(ser, dtype=dtype)
 
     if using_copy_on_write:
         # mutating original doesn't mutate new series

We should still add a test that ensures that if the cast requires a copy, we do not track references (to avoid an unnecessary copy later on), but that can be done later.

Member Author:

The copy case is more for your PR, I guess? Adjusted the test accordingly.


def test_constructor_datelike_coercion(self):
