ENH: Add lazy copy to astype #50802

Merged · 28 commits · Feb 9, 2023

1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -224,6 +224,7 @@ Copy-on-Write improvements
- :meth:`DataFrame.truncate`
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
- :meth:`DataFrame.astype` / :meth:`Series.astype`
- :func:`concat`

These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution.
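As an illustration of the new behaviour for astype (a minimal sketch based on the tests added in this PR; the exact session output is an assumption):

In [1]: import numpy as np; import pandas as pd

In [2]: pd.options.mode.copy_on_write = True

In [3]: s = pd.Series([1.5, 2.5, 3.5])

In [4]: s2 = s.astype("float64")   # same dtype: the data is not copied eagerly

In [5]: np.shares_memory(s.values, s2.values)
Out[5]: True

In [6]: s2.iloc[0] = 9.9           # the first write triggers the deferred copy

In [7]: np.shares_memory(s.values, s2.values)
Out[7]: False
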
53 changes: 53 additions & 0 deletions pandas/core/dtypes/astype.py
@@ -26,6 +26,7 @@
is_dtype_equal,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
is_timedelta64_dtype,
pandas_dtype,
)
@@ -246,3 +247,55 @@ def astype_array_safe(
raise

return new_values


def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
"""Checks if astype avoided copying the data.

Parameters
----------
dtype : Original dtype
new_dtype : Target dtype

Returns
-------
True if new data is a view or not guaranteed to be a copy, False otherwise
"""
if dtype == new_dtype:
return True

elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
# Only equal numpy dtypes avoid a copy
return False

elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
# Potentially! a view when converting from object to string
return True

Member:

I would think this is not always guaranteed to be a view? (but of course safer to return True than incorrectly assume it is always a copy)

is_string_dtype also returns True for generic "object", and converting object to string is not necessarily no-copy.

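For context on why this branch is conservative, is_string_dtype also matches plain object dtype (a minimal check, assuming pandas 2.0 behaviour):

In [1]: import numpy as np
In [2]: from pandas.api.types import is_string_dtype
In [3]: is_string_dtype(np.dtype(object))
Out[3]: True
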
Member:

Using this branch, I see the following wrong behaviour in case you have mixed objects:

In [1]: s = pd.Series(['a', 'b', 1])

In [2]: s2 = s.astype("string")

In [3]: s.values
Out[3]: array(['a', 'b', 1], dtype=object)

In [4]: pd.options.mode.copy_on_write = True

In [5]: s = pd.Series(['a', 'b', 1])

In [6]: s2 = s.astype("string")

In [7]: s.values
Out[7]: array(['a', 'b', '1'], dtype=object)

Because no copy is taken, ensure_string_array actually mutates the original values in place.

For the case of "object -> some extension dtype" casting, we should probably always do copy=True, because I don't think we can rely on _from_sequence to do the correct thing (although for this specific case I also don't think that StringArray._from_sequence(arr, copy=False) should mutate arr in place. I would expect to only not make a copy if it's not needed, i.e. if it can just view the original data and no modification needs to be done)

Member Author (@phofl, Jan 30, 2023):

I opened #51073 to address the inplace modification.

You are correct, it is not guaranteed to be a no-copy op, but it could be, and I couldn't figure out a more precise check. We can still optimise in a follow-up to get this stricter; for now I was aiming to get as many cases as possible without making it overly complex.

Member:

> We can still optimise in a follow-up to get this stricter; for now I was aiming to get as many cases as possible without making it overly complex.

Can you add a comment explaining the True and noting something like that?


elif is_object_dtype(dtype) and new_dtype.kind == "O":
# When the underlying array has dtype object, we don't have to make a copy
return True
Comment on lines +275 to +277
Member:

Our own extension types fall in this category? (e.g. casting an object column of Period objects to period dtype, although this specific example is actually never a view)

Rather for a follow-up, but would it be worth having some more custom logic here for our own EAs? Both IntervalDtype and PeriodDtype have kind "O", but I think neither of them can ever cast from object dtype without making a copy.

Member Author:

Yep, would prefer to do this as a follow-up with specific tests.


elif dtype.kind in "mM" and new_dtype.kind in "mM":
dtype = getattr(dtype, "numpy_dtype", dtype)

Member:

@phofl I finally got around to seeing this. This numpy_dtype getattr check looks like it is for arrow/masked arrays. Can we avoid this pattern? It makes it difficult to tell where we are special-casing.

Member Author:

Would you prefer special casing for isinstance(dtype, ExtensionDtype) explicitly?

Member:

I'd prefer for these to be as explicit as possible. I suspect this is intended for something more specific than just ExtensionDtype.

Member Author:

Not really; basically all extension dtypes that have a numpy_dtype.

Member:

Are you sure? The logic here looks like it could be broken pretty easily by a 3rd-party EA that happened to have a dtype.numpy_dtype.

Member Author:

They could, yes, but right now this is implemented defensively to track more references than necessary if we are not able to determine with certainty that a copy happened.

new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)

numpy_dtype = getattr(dtype, "numpy_dtype", None)
new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)

if numpy_dtype is None and isinstance(dtype, np.dtype):
numpy_dtype = dtype

if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
new_numpy_dtype = new_dtype

if numpy_dtype is not None and new_numpy_dtype is not None:
# if both have NumPy dtype or one of them is a numpy dtype
# they are only a view when the numpy dtypes are equal, e.g.
# int64 -> Int64 or int64[pyarrow]
# int64 -> Int32 copies
return numpy_dtype == new_numpy_dtype

# Assume this is a view since we don't know for sure if a copy was made
return True
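
A rough illustration of what the helper reports for a few dtype pairs (a sketch; the expected results are inferred from the comments in the function and the tests below, and the import path is the module added in this diff):

In [1]: import numpy as np; import pandas as pd
In [2]: from pandas.core.dtypes.astype import astype_is_view

In [3]: astype_is_view(np.dtype("int64"), np.dtype("int64"))    # equal dtypes -> view
Out[3]: True

In [4]: astype_is_view(np.dtype("int64"), np.dtype("int32"))    # two differing numpy dtypes -> copy
Out[4]: False

In [5]: astype_is_view(np.dtype("int64"), pd.Int64Dtype())      # same underlying numpy dtype -> view possible
Out[5]: True

In [6]: astype_is_view(np.dtype("int64"), pd.Int32Dtype())      # different underlying numpy dtype -> copy
Out[6]: False
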
6 changes: 3 additions & 3 deletions pandas/core/generic.py
@@ -6120,7 +6120,7 @@ def dtypes(self):
return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)

def astype(
self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
) -> NDFrameT:
"""
Cast a pandas object to a specified dtype ``dtype``.
@@ -6257,7 +6257,7 @@ def astype(
for i, (col_name, col) in enumerate(self.items()):
cdt = dtype_ser.iat[i]
if isna(cdt):
res_col = col.copy() if copy else col
res_col = col.copy(deep=copy)
else:
try:
res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
@@ -6284,7 +6284,7 @@

# GH 33113: handle empty frame or series
if not results:
return self.copy()
return self.copy(deep=None)

# GH 19920: retain column metadata after concat
result = concat(results, axis=1, copy=False)
5 changes: 4 additions & 1 deletion pandas/core/internals/array_manager.py
@@ -366,7 +366,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
copy = True

return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool | None) -> T:
20 changes: 17 additions & 3 deletions pandas/core/internals/blocks.py
@@ -41,7 +41,10 @@
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import astype_array_safe
from pandas.core.dtypes.astype import (
astype_array_safe,
astype_is_view,
)
from pandas.core.dtypes.cast import (
LossySetitemError,
can_hold_element,
@@ -470,7 +473,11 @@ def dtype(self) -> DtypeObj:

@final
def astype(
self, dtype: DtypeObj, copy: bool = False, errors: IgnoreRaise = "raise"
self,
dtype: DtypeObj,
copy: bool = False,
errors: IgnoreRaise = "raise",
using_cow: bool = False,
) -> Block:
"""
Coerce to the new dtype.
@@ -483,6 +490,8 @@ def astype(
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
using_cow : bool, default False
Signaling whether copy-on-write logic is used.

Returns
-------
@@ -493,7 +502,12 @@
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)

new_values = maybe_coerce_values(new_values)
newb = self.make_block(new_values)

refs = None
if using_cow and astype_is_view(values.dtype, new_values.dtype):
refs = self.refs

newb = self.make_block(new_values, refs=refs)
if newb.shape != self.shape:
raise TypeError(
f"cannot set astype for copy = [{copy}] for dtype "
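When astype_is_view reports that the cast can be a view, the new block is created with refs=self.refs, so the result keeps a reference to the parent's values and a later write to either object triggers the deferred copy. A sketch of the visible effect (the output values are inferred from the reference-tracking tests added below, not guaranteed):

In [1]: import pandas as pd

In [2]: pd.options.mode.copy_on_write = True

In [3]: df = pd.DataFrame({"a": [1, 2, 3]})

In [4]: df.astype("Int64")._mgr._has_no_reference(0)   # view-like cast: result still references df's block
Out[4]: False

In [5]: df.astype("int32")._mgr._has_no_reference(0)   # real copy: no reference needs to be tracked
Out[5]: True
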
16 changes: 14 additions & 2 deletions pandas/core/internals/managers.py
@@ -421,8 +421,20 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

return self.apply(
"astype",
dtype=dtype,
copy=copy,
errors=errors,
using_cow=using_copy_on_write(),
)

def convert(self: T, copy: bool | None) -> T:
if copy is None:
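With the new default copy=None (see the generic.py change above), the manager resolves the copy per mode: an eager copy without Copy-on-Write, a lazy one with it. A sketch of the user-visible consequence when CoW stays disabled (the 2.0 default); the session output is an assumption:

In [1]: import numpy as np; import pandas as pd

In [2]: pd.options.mode.copy_on_write
Out[2]: False

In [3]: s = pd.Series([1, 2, 3])

In [4]: np.shares_memory(s.values, s.astype("int64").values)
Out[4]: False   # copy=None resolved to copy=True, so the data is still copied
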
195 changes: 195 additions & 0 deletions pandas/tests/copy_view/test_astype.py
@@ -0,0 +1,195 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under7p0

from pandas import (
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array


def test_astype_single_dtype(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
df_orig = df.copy()
df2 = df.astype("float64")

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
tm.assert_frame_equal(df, df_orig)

# mutating parent also doesn't update result
df2 = df.astype("float64")
df.iloc[0, 2] = 5.5
tm.assert_frame_equal(df2, df_orig.astype("float64"))


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
if new_dtype == "int64[pyarrow]" and pa_version_under7p0:
pytest.skip("pyarrow not installed")
df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
tm.assert_frame_equal(df, df_orig)

# mutating parent also doesn't update result
df2 = df.astype(new_dtype)
df.iloc[0, 0] = 100
tm.assert_frame_equal(df2, df_orig.astype(new_dtype))


@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
if dtype == "int32[pyarrow]" and pa_version_under7p0:
pytest.skip("pyarrow not installed")
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()
df2 = df.astype(dtype)

assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
if using_copy_on_write:
assert df2._mgr._has_no_reference(0)

df2.iloc[0, 0] = 5
tm.assert_frame_equal(df, df_orig)

# mutating parent also doesn't update result
df2 = df.astype(dtype)
df.iloc[0, 0] = 100
tm.assert_frame_equal(df2, df_orig.astype(dtype))


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df_orig = df.copy()
df2 = df.astype(new_dtype)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df2.iloc[0, 0] = "x"
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object_update_original(
using_copy_on_write, dtype, new_dtype
):
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
df2 = df.astype(new_dtype)
df_orig = df2.copy()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

df.iloc[0, 0] = "x"
tm.assert_frame_equal(df2, df_orig)


def test_astype_dict_dtypes(using_copy_on_write):
df = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
)
df_orig = df.copy()
df2 = df.astype({"a": "float64", "c": "float64"})

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

df2.iloc[0, 1] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


def test_astype_different_datetime_resos(using_copy_on_write):
df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
result = df.astype("datetime64[ms]")

assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
if using_copy_on_write:
assert result._mgr._has_no_reference(0)


def test_astype_different_timezones(using_copy_on_write):
df = DataFrame(
{"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")}
)
result = df.astype("datetime64[ns, Europe/Berlin]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)


def test_astype_different_timezones_different_reso(using_copy_on_write):
df = DataFrame(
{"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")}
)
result = df.astype("datetime64[ms, Europe/Berlin]")
if using_copy_on_write:
assert result._mgr._has_no_reference(0)
assert not np.shares_memory(
get_array(df, "a").asi8, get_array(result, "a").asi8
)


@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
def test_astype_arrow_timestamp(using_copy_on_write):
df = DataFrame(
{
"a": [
Timestamp("2020-01-01 01:01:01.000001"),
Timestamp("2020-01-01 01:01:01.000001"),
]
},
dtype="M8[ns]",
)
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)