Skip to content

Commit 13db83a

Browse files
ENH: Add lazy copy to astype (#50802)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent d0fbeb4 commit 13db83a

File tree

10 files changed

+308
-14
lines changed

10 files changed

+308
-14
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Copy-on-Write improvements
224224
- :meth:`DataFrame.truncate`
225225
- :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
226226
- :meth:`DataFrame.infer_objects` / :meth:`Series.infer_objects`
227+
- :meth:`DataFrame.astype` / :meth:`Series.astype`
227228
- :func:`concat`
228229

229230
These methods return views when Copy-on-Write is enabled, which provides a significant

pandas/core/dtypes/astype.py

+53
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
is_dtype_equal,
2727
is_integer_dtype,
2828
is_object_dtype,
29+
is_string_dtype,
2930
is_timedelta64_dtype,
3031
pandas_dtype,
3132
)
@@ -246,3 +247,55 @@ def astype_array_safe(
246247
raise
247248

248249
return new_values
250+
251+
252+
def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
    """Check whether an astype from ``dtype`` to ``new_dtype`` may return a view.

    The answer is deliberately conservative: whenever a copy cannot be
    guaranteed, the result is reported as a (potential) view.

    Parameters
    ----------
    dtype : DtypeObj
        Original dtype of the data.
    new_dtype : DtypeObj
        Target dtype of the cast.

    Returns
    -------
    bool
        True if the new data is a view or is not guaranteed to be a copy,
        False if the cast is known to have copied the data.
    """
    if dtype == new_dtype:
        return True

    elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
        # Only equal numpy dtypes avoid a copy
        return False

    elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
        # Potentially! a view when converting from object to string
        return True

    elif is_object_dtype(dtype) and new_dtype.kind == "O":
        # When the underlying array has dtype object, we don't have to make a copy
        return True

    elif dtype.kind in "mM" and new_dtype.kind in "mM":
        # datetime-like / timedelta-like: compare the resolution ("unit") only,
        # unwrapping dtypes that expose an underlying ``numpy_dtype`` first
        dtype = getattr(dtype, "numpy_dtype", dtype)
        new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
        return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)

    numpy_dtype = getattr(dtype, "numpy_dtype", None)
    new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)

    if numpy_dtype is None and isinstance(dtype, np.dtype):
        numpy_dtype = dtype

    if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
        # Fill in the *target* side here. Assigning to ``numpy_dtype`` instead
        # would clobber the source dtype and leave the target as None, so the
        # comparison below would be skipped.
        new_numpy_dtype = new_dtype

    if numpy_dtype is not None and new_numpy_dtype is not None:
        # if both have NumPy dtype or one of them is a numpy dtype
        # they are only a view when the numpy dtypes are equal, e.g.
        # int64 -> Int64 or int64[pyarrow]
        # int64 -> Int32 copies
        return numpy_dtype == new_numpy_dtype

    # Assume this is a view since we don't know for sure if a copy was made
    return True

pandas/core/generic.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -6120,7 +6120,7 @@ def dtypes(self):
61206120
return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
61216121

61226122
def astype(
6123-
self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
6123+
self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
61246124
) -> NDFrameT:
61256125
"""
61266126
Cast a pandas object to a specified dtype ``dtype``.
@@ -6257,7 +6257,7 @@ def astype(
62576257
for i, (col_name, col) in enumerate(self.items()):
62586258
cdt = dtype_ser.iat[i]
62596259
if isna(cdt):
6260-
res_col = col.copy() if copy else col
6260+
res_col = col.copy(deep=copy)
62616261
else:
62626262
try:
62636263
res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
@@ -6284,7 +6284,7 @@ def astype(
62846284

62856285
# GH 33113: handle empty frame or series
62866286
if not results:
6287-
return self.copy()
6287+
return self.copy(deep=None)
62886288

62896289
# GH 19920: retain column metadata after concat
62906290
result = concat(results, axis=1, copy=False)

pandas/core/internals/array_manager.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
366366
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
367367
)
368368

369-
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
369+
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
    """Cast via ``astype_array_safe`` applied through ``self.apply``.

    ``copy=None`` is treated as ``copy=True`` here.
    """
    do_copy = True if copy is None else copy
    return self.apply(astype_array_safe, dtype=dtype, copy=do_copy, errors=errors)
371374

372375
def convert(self: T, copy: bool | None) -> T:

pandas/core/internals/blocks.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,10 @@
4141
from pandas.util._decorators import cache_readonly
4242
from pandas.util._validators import validate_bool_kwarg
4343

44-
from pandas.core.dtypes.astype import astype_array_safe
44+
from pandas.core.dtypes.astype import (
45+
astype_array_safe,
46+
astype_is_view,
47+
)
4548
from pandas.core.dtypes.cast import (
4649
LossySetitemError,
4750
can_hold_element,
@@ -470,7 +473,11 @@ def dtype(self) -> DtypeObj:
470473

471474
@final
472475
def astype(
473-
self, dtype: DtypeObj, copy: bool = False, errors: IgnoreRaise = "raise"
476+
self,
477+
dtype: DtypeObj,
478+
copy: bool = False,
479+
errors: IgnoreRaise = "raise",
480+
using_cow: bool = False,
474481
) -> Block:
475482
"""
476483
Coerce to the new dtype.
@@ -483,6 +490,8 @@ def astype(
483490
errors : str, {'raise', 'ignore'}, default 'raise'
484491
- ``raise`` : allow exceptions to be raised
485492
- ``ignore`` : suppress exceptions. On error return original object
493+
using_cow: bool, default False
494+
Signaling if copy on write copy logic is used.
486495
487496
Returns
488497
-------
@@ -493,7 +502,12 @@ def astype(
493502
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
494503

495504
new_values = maybe_coerce_values(new_values)
496-
newb = self.make_block(new_values)
505+
506+
refs = None
507+
if using_cow and astype_is_view(values.dtype, new_values.dtype):
508+
refs = self.refs
509+
510+
newb = self.make_block(new_values, refs=refs)
497511
if newb.shape != self.shape:
498512
raise TypeError(
499513
f"cannot set astype for copy = [{copy}] for dtype "

pandas/core/internals/managers.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -421,8 +421,20 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
421421
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
422422
)
423423

424-
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
425-
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
424+
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
    """Dispatch ``astype`` through ``self.apply``.

    ``copy=None`` resolves to ``False`` when ``using_copy_on_write()`` is
    truthy and to ``True`` otherwise. The copy-on-write flag is also passed
    on as ``using_cow``.
    """
    cow_enabled = using_copy_on_write()
    if copy is None:
        copy = False if cow_enabled else True

    return self.apply(
        "astype",
        dtype=dtype,
        copy=copy,
        errors=errors,
        using_cow=cow_enabled,
    )
426438

427439
def convert(self: T, copy: bool | None) -> T:
428440
if copy is None:

pandas/tests/copy_view/test_astype.py

+195
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
import numpy as np
2+
import pytest
3+
4+
from pandas.compat import pa_version_under7p0
5+
6+
from pandas import (
7+
DataFrame,
8+
Series,
9+
Timestamp,
10+
date_range,
11+
)
12+
import pandas._testing as tm
13+
from pandas.tests.copy_view.util import get_array
14+
15+
16+
def test_astype_single_dtype(using_copy_on_write):
    """Cast a whole frame to float64 and check which columns share memory."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
    expected = frame.copy()
    casted = frame.astype("float64")

    # "c" is already float64: shared only under copy-on-write.
    shares_c = np.shares_memory(get_array(casted, "c"), get_array(frame, "c"))
    assert shares_c == bool(using_copy_on_write)
    # "a" changes dtype, so it is never shared.
    assert not np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))

    # mutating the result triggers a copy-on-write for that column/block
    casted.iloc[0, 2] = 5.5
    if using_copy_on_write:
        assert not np.shares_memory(get_array(casted, "c"), get_array(frame, "c"))
    tm.assert_frame_equal(frame, expected)

    # mutating the parent also doesn't update the result
    casted = frame.astype("float64")
    frame.iloc[0, 2] = 5.5
    tm.assert_frame_equal(casted, expected.astype("float64"))
38+
39+
40+
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype):
    """Casts between int64 flavours share memory only under copy-on-write."""
    wants_pyarrow = new_dtype == "int64[pyarrow]"
    if wants_pyarrow and pa_version_under7p0:
        pytest.skip("pyarrow not installed")

    frame = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    expected = frame.copy()
    casted = frame.astype(new_dtype)

    shared = np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # mutating the result triggers a copy-on-write for that column/block
    casted.iloc[0, 0] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))
    tm.assert_frame_equal(frame, expected)

    # mutating the parent also doesn't update the result
    casted = frame.astype(new_dtype)
    frame.iloc[0, 0] = 100
    tm.assert_frame_equal(casted, expected.astype(new_dtype))
64+
65+
66+
@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
def test_astype_different_target_dtype(using_copy_on_write, dtype):
    """Casting the default integer frame to these dtypes never shares memory."""
    wants_pyarrow = dtype == "int32[pyarrow]"
    if wants_pyarrow and pa_version_under7p0:
        pytest.skip("pyarrow not installed")

    frame = DataFrame({"a": [1, 2, 3]})
    expected = frame.copy()
    casted = frame.astype(dtype)

    assert not np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))
    if using_copy_on_write:
        # the result must not keep a reference to the parent's block
        assert casted._mgr._has_no_reference(0)

    casted.iloc[0, 0] = 5
    tm.assert_frame_equal(frame, expected)

    # mutating the parent also doesn't update the result
    casted = frame.astype(dtype)
    frame.iloc[0, 0] = 100
    tm.assert_frame_equal(casted, expected.astype(dtype))
85+
86+
87+
@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object(using_copy_on_write, dtype, new_dtype):
    """object <-> string casts share memory under CoW; writes don't leak back."""
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    expected = frame.copy()
    casted = frame.astype(new_dtype)

    shared = np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # writing to the result must leave the parent untouched
    casted.iloc[0, 0] = "x"
    tm.assert_frame_equal(frame, expected)
102+
103+
104+
@pytest.mark.parametrize(
    "dtype, new_dtype", [("object", "string"), ("string", "object")]
)
def test_astype_string_and_object_update_original(
    using_copy_on_write, dtype, new_dtype
):
    """Writing to the parent after an object <-> string cast leaves the result alone."""
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
    casted = frame.astype(new_dtype)
    expected = casted.copy()

    shared = np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)

    # writing to the parent must leave the result untouched
    frame.iloc[0, 0] = "x"
    tm.assert_frame_equal(casted, expected)
121+
122+
123+
def test_astype_dict_dtypes(using_copy_on_write):
    """Per-column astype via a dict: check sharing for each column."""
    frame = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
    )
    expected = frame.copy()
    casted = frame.astype({"a": "float64", "c": "float64"})

    # "c" (same dtype) and "b" (not in the dict) share memory only under CoW;
    # "a" changes dtype and is never shared.
    for col in ("c", "b"):
        shared = np.shares_memory(get_array(casted, col), get_array(frame, col))
        assert shared == bool(using_copy_on_write)
    assert not np.shares_memory(get_array(casted, "a"), get_array(frame, "a"))

    # mutating the result triggers a copy-on-write for that column/block
    casted.iloc[0, 2] = 5.5
    if using_copy_on_write:
        assert not np.shares_memory(get_array(casted, "c"), get_array(frame, "c"))

    casted.iloc[0, 1] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(casted, "b"), get_array(frame, "b"))
    tm.assert_frame_equal(frame, expected)
148+
149+
150+
def test_astype_different_datetime_resos(using_copy_on_write):
    """Casting the date_range column to datetime64[ms] must not share memory."""
    frame = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
    casted = frame.astype("datetime64[ms]")

    parent_values = get_array(frame, "a")
    casted_values = get_array(casted, "a")
    assert not np.shares_memory(parent_values, casted_values)
    if using_copy_on_write:
        assert casted._mgr._has_no_reference(0)
157+
158+
159+
def test_astype_different_timezones(using_copy_on_write):
    """Changing only the timezone keeps a reference and shares the data under CoW."""
    stamps = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    frame = DataFrame({"a": stamps})
    casted = frame.astype("datetime64[ns, Europe/Berlin]")

    if not using_copy_on_write:
        # nothing is asserted without copy-on-write
        return

    assert not casted._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(frame, "a").asi8, get_array(casted, "a").asi8)
167+
168+
169+
def test_astype_different_timezones_different_reso(using_copy_on_write):
    """Changing timezone and resolution together yields an unshared result under CoW."""
    stamps = date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific")
    frame = DataFrame({"a": stamps})
    casted = frame.astype("datetime64[ms, Europe/Berlin]")

    if not using_copy_on_write:
        # nothing is asserted without copy-on-write
        return

    assert casted._mgr._has_no_reference(0)
    parent_i8 = get_array(frame, "a").asi8
    casted_i8 = get_array(casted, "a").asi8
    assert not np.shares_memory(parent_i8, casted_i8)
179+
180+
181+
@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
def test_astype_arrow_timestamp(using_copy_on_write):
    """M8[ns] -> timestamp[ns][pyarrow] keeps a reference and shares data under CoW."""
    stamp = Timestamp("2020-01-01 01:01:01.000001")
    frame = DataFrame({"a": [stamp, Timestamp("2020-01-01 01:01:01.000001")]}, dtype="M8[ns]")
    casted = frame.astype("timestamp[ns][pyarrow]")

    if not using_copy_on_write:
        # nothing is asserted without copy-on-write
        return

    assert not casted._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(frame, "a").asi8, get_array(casted, "a")._data)

0 commit comments

Comments
 (0)