From 705b17c29b2a8218ffca6e50fe5c99e5729f8e0d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Jan 2023 23:37:55 +0100 Subject: [PATCH 1/2] Handle CoW in BlockManager.apply --- pandas/core/generic.py | 7 +++- pandas/core/internals/blocks.py | 28 +++++++++++++--- pandas/core/internals/managers.py | 28 ++++++++++++++-- pandas/tests/copy_view/test_methods.py | 46 ++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 998c57b66509d..3897e1b2f8a21 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6142,7 +6142,7 @@ def dtypes(self): return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) def astype( - self: NDFrameT, dtype, copy: bool_t = True, errors: IgnoreRaise = "raise" + self: NDFrameT, dtype, copy: bool_t = None, errors: IgnoreRaise = "raise" ) -> NDFrameT: """ Cast a pandas object to a specified dtype ``dtype``. @@ -6260,6 +6260,11 @@ def astype( 2 2020-01-03 dtype: datetime64[ns] """ + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 00ab9d02cee00..776f381b1cfa2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import ( Timestamp, internals as libinternals, @@ -414,7 +416,7 @@ def coerce_to_target_dtype(self, other) -> Block: """ new_dtype = find_result_type(self.values, other) - return self.astype(new_dtype, copy=False) + return self.astype(new_dtype, copy=False)[0] @final def _maybe_downcast(self, blocks: list[Block], downcast=None) -> list[Block]: @@ -496,7 +498,22 @@ def astype( f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) - return newb + + is_view = False + if using_copy_on_write(): + if not copy: + if ( + isinstance(values.dtype, np.dtype) + and isinstance(new_values.dtype, np.dtype) + and values is not new_values + ): + # We certainly made a copy + pass + else: + # We maybe didn't make a copy + is_view = True + + return newb, is_view @final def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: @@ -571,7 +588,7 @@ def replace( elif self.ndim == 1 or self.shape[0] == 1: if value is None or value is NA: - blk = self.astype(np.dtype(object)) + blk = self.astype(np.dtype(object))[0] else: blk = self.coerce_to_target_dtype(value) return blk.replace( @@ -753,7 +770,7 @@ def _replace_coerce( if value is None: # gh-45601, gh-45836, gh-46634 if mask.any(): - nb = self.astype(np.dtype(object), copy=False) + nb = self.astype(np.dtype(object), copy=False)[0] if nb is self and not inplace: nb = nb.copy() putmask_inplace(nb.values, mask, value) @@ -2195,6 +2212,9 @@ def extend_blocks(result, blocks=None) -> list[Block]: blocks.extend(r) else: blocks.append(r) + elif isinstance(result, tuple): + assert isinstance(result[0], Block), type(result[0]) + blocks.append(result) else: assert isinstance(result, Block), type(result) blocks.append(result) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1a0aba0778da5..14ef21a922cfc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -302,6 +302,7 @@ def apply( self: T, f, align_keys: list[str] | None = None, + check_cow: bool = False, **kwargs, ) -> T: """ @@ -349,7 +350,22 @@ def apply( applied = getattr(b, f)(**kwargs) result_blocks = extend_blocks(applied, result_blocks) - out = type(self).from_blocks(result_blocks, self.axes) + refs = None + parent = None + if check_cow: + result_blocks, result_is_view = list(zip(*result_blocks)) + if using_copy_on_write(): + refs = [] + for b, is_view in zip(result_blocks, result_is_view): + if is_view: + refs.append(weakref.ref(b)) + else: + refs.append(None) + + if com.any_not_none(refs): + parent = self + + out = type(self).from_blocks(result_blocks, self.axes, refs, parent) return out def where(self: T, other, cond, align: bool) -> T: @@ -436,7 +452,15 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: ) def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: - return self.apply("astype", dtype=dtype, copy=copy, errors=errors) + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + + return self.apply( + "astype", dtype=dtype, copy=copy, errors=errors, check_cow=True + ) def convert(self: T, copy: bool) -> T: return self.apply( diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 0cec5522e39cd..3b40e69374a89 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1137,3 +1137,49 @@ def test_isetitem(using_copy_on_write): assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) else: assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + + +def test_astype_single_dtype(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5}) + df_orig = df.copy() + df2 = df.astype("float64") + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_astype_dict_dtypes(using_copy_on_write): + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} + ) + df_orig = df.copy() + df2 = df.astype({"a": "float64", "c": "float64"}) + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 2] = 5.5 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + + df2.iloc[0, 1] = 10 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) From aec2d1727e9268056b17542bca133b5abbb943da Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Jan 2023 00:36:07 +0100 Subject: [PATCH 2/2] fix and adapt for infer_objects --- pandas/core/generic.py | 2 +- pandas/core/internals/blocks.py | 32 +++++++++++++----- pandas/core/internals/managers.py | 18 +++++++--- pandas/tests/copy_view/test_methods.py | 47 ++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3897e1b2f8a21..15213b7e82516 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6464,7 +6464,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT: return self.copy(deep=True) @final - def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT: + def infer_objects(self: NDFrameT, copy: bool_t = None) -> NDFrameT: """ Attempt to infer better dtypes for object columns. diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 776f381b1cfa2..c52d380495bea 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -451,13 +451,20 @@ def convert( self, *, copy: bool = True, + return_is_view=False, ) -> list[Block]: """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! """ - return [self.copy()] if copy else [self] + if using_copy_on_write(): + result = self.copy(deep=copy) + is_view = True + else: + result = self.copy() if copy else self + is_view = False + return [(result, is_view)] if return_is_view else [result] # --------------------------------------------------------------------- # Array-Like Methods @@ -2012,6 +2019,7 @@ def convert( self, *, copy: bool = True, + return_is_view=False, ) -> list[Block]: """ attempt to cast any object types to better types return a copy of @@ -2020,7 +2028,10 @@ def convert( if self.dtype != _dtype_obj: # GH#50067 this should be impossible in ObjectBlock, but until # that is fixed, we short-circuit here. - return [self] + if using_copy_on_write(): + result = self.copy(deep=False) + return [(result, True)] if return_is_view else [result] + return [(self, False)] if return_is_view else [self] values = self.values if values.ndim == 2: @@ -2035,10 +2046,16 @@ def convert( convert_period=True, convert_interval=True, ) + is_view = False if copy and res_values is values: res_values = values.copy() + elif res_values is values and using_copy_on_write(): + is_view = True res_values = ensure_block_shape(res_values, self.ndim) - return [self.make_block(res_values)] + if return_is_view: + return [(self.make_block(res_values), is_view)] + else: + return [self.make_block(res_values)] # ----------------------------------------------------------------- @@ -2202,19 +2219,16 @@ def extract_pandas_array( # ----------------------------------------------------------------- -def extend_blocks(result, blocks=None) -> list[Block]: +def extend_blocks(result, blocks=None, i=0) -> list[Block]: """return a new extended blocks, given the result""" if blocks is None: blocks = [] if isinstance(result, list): for r in result: - if isinstance(r, list): - blocks.extend(r) - else: - blocks.append(r) + extend_blocks(r, blocks) elif isinstance(result, tuple): assert isinstance(result[0], Block), type(result[0]) - blocks.append(result) + blocks.append((result[0], result[1], i)) else: assert isinstance(result, Block), type(result) blocks.append(result) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 14ef21a922cfc..d2defd2524535 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -328,7 +328,7 @@ def apply( aligned_args = {k: kwargs[k] for k in align_keys} - for b in self.blocks: + for i, b in enumerate(self.blocks): if aligned_args: @@ -348,17 +348,17 @@ def apply( applied = b.apply(f, **kwargs) else: applied = getattr(b, f)(**kwargs) - result_blocks = extend_blocks(applied, result_blocks) + result_blocks = extend_blocks(applied, result_blocks, i) refs = None parent = None if check_cow: - result_blocks, result_is_view = list(zip(*result_blocks)) + result_blocks, result_is_view, result_index = list(zip(*result_blocks)) if using_copy_on_write(): refs = [] - for b, is_view in zip(result_blocks, result_is_view): + for b, is_view, i in zip(result_blocks, result_is_view, result_index): if is_view: - refs.append(weakref.ref(b)) + refs.append(weakref.ref(self.blocks[i])) else: refs.append(None) @@ -463,9 +463,17 @@ def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T: ) def convert(self: T, copy: bool) -> T: + if copy is None: + if using_copy_on_write(): + copy = False + else: + copy = True + return self.apply( "convert", copy=copy, + check_cow=True, + return_is_view=True, ) def replace(self: T, to_replace, value, inplace: bool) -> T: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 3b40e69374a89..782d6e991fe1e 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1183,3 +1183,50 @@ def test_astype_dict_dtypes(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) tm.assert_frame_equal(df, df_orig) + + +def test_infer_objects(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) + df_orig = df.copy() + df2 = df.infer_objects() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 0] = 0 + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + +def test_infer_objects2(using_copy_on_write): + df = DataFrame({"a": [1, 2], "b": "x", "c": np.array([1, 2], dtype=object)}) + df_orig = df.copy() + df2 = df.infer_objects() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + df2.iloc[0, 0] = 0 + df2.iloc[0, 1] = "d" + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + tm.assert_frame_equal(df, df_orig) + + # mutate parent + df2 = df.infer_objects() + df.iloc[0, 0] = 0 + assert df2.iloc[0, 0] == 1