Skip to content

Handle CoW in BlockManager.apply #50948

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6142,7 +6142,7 @@ def dtypes(self):
return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)

def astype(
self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise"
self: NDFrameT, dtype, copy: bool_t = None, errors: IgnoreRaise = "raise"
) -> NDFrameT:
"""
Cast a pandas object to a specified dtype ``dtype``.
Expand Down Expand Up @@ -6260,6 +6260,11 @@ def astype(
2 2020-01-03
dtype: datetime64[ns]
"""
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True
if is_dict_like(dtype):
if self.ndim == 1: # i.e. Series
if len(dtype) > 1 or self.name not in dtype:
Expand Down Expand Up @@ -6459,7 +6464,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
return self.copy(deep=True)

@final
def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT:
def infer_objects(self: NDFrameT, copy: bool_t = None) -> NDFrameT:
"""
Attempt to infer better dtypes for object columns.

Expand Down
58 changes: 46 additions & 12 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
Timestamp,
internals as libinternals,
Expand Down Expand Up @@ -414,7 +416,7 @@ def coerce_to_target_dtype(self, other) -> Block:
"""
new_dtype = find_result_type(self.values, other)

return self.astype(new_dtype, copy=False)
return self.astype(new_dtype, copy=False)[0]

@final
def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]:
Expand Down Expand Up @@ -449,13 +451,20 @@ def convert(
self,
*,
copy: bool = True,
return_is_view=False,
) -> list[Block]:
"""
attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
return [self.copy()] if copy else [self]
if using_copy_on_write():
result = self.copy(deep=copy)
is_view = True
else:
result = self.copy() if copy else self
is_view = False
return [(result, is_view)] if return_is_view else [result]

# ---------------------------------------------------------------------
# Array-Like Methods
Expand Down Expand Up @@ -496,7 +505,22 @@ def astype(
f"({self.dtype.name} [{self.shape}]) to different shape "
f"({newb.dtype.name} [{newb.shape}])"
)
return newb

is_view = False
if using_copy_on_write():
if not copy:
if (
isinstance(values.dtype, np.dtype)
and isinstance(new_values.dtype, np.dtype)
and values is not new_values
):
# We certainly made a copy
pass
else:
# We maybe didn't make a copy
is_view = True

return newb, is_view

@final
def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block:
Expand Down Expand Up @@ -571,7 +595,7 @@ def replace(

elif self.ndim == 1 or self.shape[0] == 1:
if value is None or value is NA:
blk = self.astype(np.dtype(object))
blk = self.astype(np.dtype(object))[0]
else:
blk = self.coerce_to_target_dtype(value)
return blk.replace(
Expand Down Expand Up @@ -753,7 +777,7 @@ def _replace_coerce(
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
nb = self.astype(np.dtype(object), copy=False)[0]
if nb is self and not inplace:
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
Expand Down Expand Up @@ -1995,6 +2019,7 @@ def convert(
self,
*,
copy: bool = True,
return_is_view=False,
) -> list[Block]:
"""
attempt to cast any object types to better types return a copy of
Expand All @@ -2003,7 +2028,10 @@ def convert(
if self.dtype != _dtype_obj:
# GH#50067 this should be impossible in ObjectBlock, but until
# that is fixed, we short-circuit here.
return [self]
if using_copy_on_write():
result = self.copy(deep=False)
return [(result, True)] if return_is_view else [result]
return [(self, False)] if return_is_view else [self]

values = self.values
if values.ndim == 2:
Expand All @@ -2018,10 +2046,16 @@ def convert(
convert_period=True,
convert_interval=True,
)
is_view = False
if copy and res_values is values:
res_values = values.copy()
elif res_values is values and using_copy_on_write():
is_view = True
res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]
if return_is_view:
return [(self.make_block(res_values), is_view)]
else:
return [self.make_block(res_values)]


# -----------------------------------------------------------------
Expand Down Expand Up @@ -2185,16 +2219,16 @@ def extract_pandas_array(
# -----------------------------------------------------------------


def extend_blocks(result, blocks=None) -> list[Block]:
def extend_blocks(result, blocks=None, i=0) -> list[Block]:
"""return a new extended blocks, given the result"""
if blocks is None:
blocks = []
if isinstance(result, list):
for r in result:
if isinstance(r, list):
blocks.extend(r)
else:
blocks.append(r)
extend_blocks(r, blocks)
elif isinstance(result, tuple):
assert isinstance(result[0], Block), type(result[0])
blocks.append((result[0], result[1], i))
else:
assert isinstance(result, Block), type(result)
blocks.append(result)
Expand Down
40 changes: 36 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ def apply(
self: T,
f,
align_keys: list[str] | None = None,
check_cow: bool = False,
**kwargs,
) -> T:
"""
Expand All @@ -327,7 +328,7 @@ def apply(

aligned_args = {k: kwargs[k] for k in align_keys}

for b in self.blocks:
for i, b in enumerate(self.blocks):

if aligned_args:

Expand All @@ -347,9 +348,24 @@ def apply(
applied = b.apply(f, **kwargs)
else:
applied = getattr(b, f)(**kwargs)
result_blocks = extend_blocks(applied, result_blocks)
result_blocks = extend_blocks(applied, result_blocks, i)

refs = None
parent = None
if check_cow:
result_blocks, result_is_view, result_index = list(zip(*result_blocks))
if using_copy_on_write():
refs = []
for b, is_view, i in zip(result_blocks, result_is_view, result_index):
if is_view:
refs.append(weakref.ref(self.blocks[i]))
else:
refs.append(None)

out = type(self).from_blocks(result_blocks, self.axes)
if com.any_not_none(refs):
parent = self

out = type(self).from_blocks(result_blocks, self.axes, refs, parent)
return out

def where(self: T, other, cond, align: bool) -> T:
Expand Down Expand Up @@ -436,12 +452,28 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

return self.apply(
"astype", dtype=dtype, copy=copy, errors=errors, check_cow=True
)

def convert(self: T, copy: bool) -> T:
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

return self.apply(
"convert",
copy=copy,
check_cow=True,
return_is_view=True,
)

def replace(self: T, to_replace, value, inplace: bool) -> T:
Expand Down
93 changes: 93 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,3 +1137,96 @@ def test_isetitem(using_copy_on_write):
assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
else:
assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))


def test_astype_single_dtype(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
df_orig = df.copy()
df2 = df.astype("float64")

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
tm.assert_frame_equal(df, df_orig)


def test_astype_dict_dtypes(using_copy_on_write):
df = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
)
df_orig = df.copy()
df2 = df.astype({"a": "float64", "c": "float64"})

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column/block
df2.iloc[0, 2] = 5.5
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))

df2.iloc[0, 1] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


def test_infer_objects(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


def test_infer_objects2(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "x", "c": np.array([1, 2], dtype=object)})
df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)

# mutate parent
df2 = df.infer_objects()
df.iloc[0, 0] = 0
assert df2.iloc[0, 0] == 1