From 7c68f0d262213762ea346a929f68ec2433cc6530 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 4 Feb 2024 23:22:23 +0000 Subject: [PATCH 1/3] CoW: Enforce some deprecations on the block level --- pandas/core/frame.py | 1 - pandas/core/generic.py | 19 ++- pandas/core/internals/blocks.py | 202 ++++++++++++------------------ pandas/core/internals/managers.py | 50 +------- pandas/core/series.py | 2 +- 5 files changed, 94 insertions(+), 180 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index afa680d064c4a..614e8ff0232f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10744,7 +10744,6 @@ def _series_round(ser: Series, decimals: int) -> Series: # type "Union[int, integer[Any]]"; expected "int" new_mgr = self._mgr.round( decimals=decimals, # type: ignore[arg-type] - using_cow=using_copy_on_write(), ) return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7bb07694c34a5..61fb757fafec4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6360,9 +6360,6 @@ def astype( 2 2020-01-03 dtype: datetime64[ns] """ - if copy and using_copy_on_write(): - copy = False - if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: @@ -6371,7 +6368,7 @@ def astype( "the key in Series dtype mappings." ) new_type = dtype[self.name] - return self.astype(new_type, copy, errors) + return self.astype(new_type, errors=errors) # GH#44417 cast to Series so we can use .iat below, which will be # robust in case we @@ -6393,10 +6390,10 @@ def astype( for i, (col_name, col) in enumerate(self.items()): cdt = dtype_ser.iat[i] if isna(cdt): - res_col = col.copy(deep=copy) + res_col = col.copy(deep=False) else: try: - res_col = col.astype(dtype=cdt, copy=copy, errors=errors) + res_col = col.astype(dtype=cdt, errors=errors) except ValueError as ex: ex.args = ( f"{ex}: Error while type casting for column '{col_name}'", @@ -6410,22 +6407,20 @@ def astype( if isinstance(dtype, ExtensionDtype) and all( arr.dtype == dtype for arr in self._mgr.arrays ): - return self.copy(deep=copy) + return self.copy(deep=False) # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: self.items handles duplicate column names - results = [ - ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items() - ] + results = [ser.astype(dtype, errors=errors) for _, ser in self.items()] else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) + new_data = self._mgr.astype(dtype=dtype, errors=errors) res = self._constructor_from_mgr(new_data, axes=new_data.axes) return res.__finalize__(self, method="astype") # GH 33113: handle empty frame or series if not results: - return self.copy(deep=None) + return self.copy(deep=False) # GH 19920: retain column metadata after concat result = concat(results, axis=1, copy=False) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index bb65e7a4d0838..02296643acc3e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -527,14 +527,13 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." 
) - return self.astype(new_dtype, copy=False) + return self.astype(new_dtype) @final def _maybe_downcast( self, blocks: list[Block], downcast, - using_cow: bool, caller: str, ) -> list[Block]: if downcast is False: @@ -551,7 +550,7 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [blk.convert(using_cow=True, copy=False) for blk in blocks] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -576,7 +575,7 @@ def _maybe_downcast( elif caller == "where" and get_option("future.no_silent_downcasting") is True: return blocks else: - nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + nbs = extend_blocks([b._downcast_2d(downcast, True) for b in blocks]) # When _maybe_downcast is called with caller="where", it is either # a) with downcast=False, which is a no-op (the desired future behavior) @@ -667,8 +666,6 @@ def convert( def convert_dtypes( self, - copy: bool, - using_cow: bool, infer_objects: bool = True, convert_string: bool = True, convert_integer: bool = True, @@ -677,14 +674,14 @@ def convert_dtypes( dtype_backend: DtypeBackend = "numpy_nullable", ) -> list[Block]: if infer_objects and self.is_object: - blks = self.convert(copy=False, using_cow=using_cow) + blks = self.convert(copy=False) else: blks = [self] if not any( [convert_floating, convert_integer, convert_boolean, convert_string] ): - return [b.copy(deep=copy) for b in blks] + return [b.copy(deep=False) for b in blks] rbs = [] for blk in blks: @@ -704,11 +701,11 @@ def convert_dtypes( ] if all(dtype == self.dtype for dtype in dtypes): # Avoid block splitting if no dtype changes - rbs.append(blk.copy(deep=copy)) + rbs.append(blk.copy(deep=False)) continue for dtype, b in zip(dtypes, sub_blks): - rbs.append(b.astype(dtype=dtype, copy=copy, squeeze=b.ndim != 1)) + rbs.append(b.astype(dtype=dtype, squeeze=b.ndim != 1)) return rbs # --------------------------------------------------------------------- @@ -723,9 +720,7 @@ def dtype(self) -> DtypeObj: def astype( self, dtype: DtypeObj, - copy: bool = False, errors: IgnoreRaise = "raise", - using_cow: bool = False, squeeze: bool = False, ) -> Block: """ @@ -734,13 +729,9 @@ def astype( Parameters ---------- dtype : np.dtype or ExtensionDtype - copy : bool, default False - copy if indicated errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object - using_cow: bool, default False - Signaling if copy on write copy logic is used. 
squeeze : bool, default False squeeze values to ndim=1 if only one column is given @@ -754,18 +745,18 @@ def astype( raise ValueError("Can not squeeze with more than one column.") values = values[0, :] # type: ignore[call-overload] - new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) + new_values = astype_array_safe(values, dtype, errors=errors) new_values = maybe_coerce_values(new_values) refs = None - if (using_cow or not copy) and astype_is_view(values.dtype, new_values.dtype): + if astype_is_view(values.dtype, new_values.dtype): refs = self.refs newb = self.make_block(new_values, refs=refs) if newb.shape != self.shape: raise TypeError( - f"cannot set astype for copy = [{copy}] for dtype " + f"cannot set astype for dtype " f"({self.dtype.name} [{self.shape}]) to different shape " f"({newb.dtype.name} [{newb.shape}])" ) @@ -801,8 +792,16 @@ def copy(self, deep: bool = True) -> Self: # --------------------------------------------------------------------- # Copy-on-Write Helpers + def _maybe_copy(self, inplace: bool) -> Self: + if inplace: + deep = self.refs.has_reference() + return self.copy(deep=deep) + return self.copy() + @final - def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: + def _maybe_copy_cow_check( + self, using_cow: bool = True, inplace: bool = True + ) -> Self: if using_cow and inplace: deep = self.refs.has_reference() blk = self.copy(deep=deep) @@ -811,7 +810,18 @@ def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: return blk @final - def _get_refs_and_copy(self, using_cow: bool, inplace: bool): + def _get_refs_and_copy(self, inplace: bool): + refs = None + copy = not inplace + if inplace: + if self.refs.has_reference(): + copy = True + else: + refs = self.refs + return copy, refs + + @final + def _get_refs_and_copy_cow_check(self, using_cow: bool, inplace: bool): refs = None copy = not inplace if inplace: @@ -847,7 +857,7 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy(using_cow, inplace) + blk = self._maybe_copy_cow_check(using_cow, inplace) values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] @@ -875,7 +885,7 @@ def replace( elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? 
- blk = self._maybe_copy(using_cow, inplace) + blk = self._maybe_copy_cow_check(using_cow, inplace) putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): @@ -968,7 +978,7 @@ def _replace_regex( rx = re.compile(to_replace) - block = self._maybe_copy(using_cow, inplace) + block = self._maybe_copy_cow_check(using_cow, inplace) replace_regex(block.values, rx, value, mask) @@ -1005,7 +1015,7 @@ def replace_list( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy(using_cow, inplace) + blk = self._maybe_copy_cow_check(using_cow, inplace) values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] @@ -1164,7 +1174,7 @@ def _replace_coerce( # gh-45601, gh-45836, gh-46634 if mask.any(): has_ref = self.refs.has_reference() - nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) + nb = self.astype(np.dtype(object)) if (nb is self or using_cow) and not inplace: nb = nb.copy() elif inplace and has_ref and nb.refs.has_reference() and using_cow: @@ -1325,7 +1335,7 @@ def _unstack( # --------------------------------------------------------------------- - def setitem(self, indexer, value, using_cow: bool = False) -> Block: + def setitem(self, indexer, value) -> Block: """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1335,8 +1345,6 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: The subset of self.values to set value : object The value being set - using_cow: bool, default False - Signaling if CoW is used. Returns ------- @@ -1375,7 +1383,7 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: # test_iloc_setitem_custom_object casted = setitem_datetimelike_compat(values, len(vi), casted) - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = cast(np.ndarray, self.values.T) if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 @@ -1383,7 +1391,7 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: values[indexer] = casted return self - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask(self, mask, new) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1394,7 +1402,6 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: ---------- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray new : a ndarray/object - using_cow: bool, default False Returns ------- @@ -1412,14 +1419,12 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: new = extract_array(new, extract_numpy=True) if noop: - if using_cow: - return [self.copy(deep=False)] - return [self] + return [self.copy(deep=False)] try: casted = np_can_hold_element(values.dtype, new) - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = cast(np.ndarray, self.values) putmask_without_repeat(values.T, mask, casted) @@ -1435,7 +1440,7 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: ).putmask(mask, new) else: indexer = mask.nonzero()[0] - nb = self.setitem(indexer, new[indexer], using_cow=using_cow) + nb = self.setitem(indexer, new[indexer]) return [nb] else: @@ -1450,13 +1455,11 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: n = new[:, i : i + 1] submask = 
orig_mask[:, i : i + 1] - rbs = nb.putmask(submask, n, using_cow=using_cow) + rbs = nb.putmask(submask, n) res_blocks.extend(rbs) return res_blocks - def where( - self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False - ) -> list[Block]: + def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: """ evaluate the block; return result block(s) from the result @@ -1487,9 +1490,7 @@ def where( icond, noop = validate_putmask(values, ~cond) if noop: # GH-39595: Always return a copy; short-circuit up/downcasting - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] + return [self.copy(deep=False)] if other is lib.no_default: other = self.fill_value @@ -1508,10 +1509,8 @@ def where( # no need to split columns block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond, using_cow=using_cow) - return self._maybe_downcast( - blocks, downcast=_downcast, using_cow=using_cow, caller="where" - ) + blocks = block.where(orig_other, cond) + return self._maybe_downcast(blocks, downcast=_downcast, caller="where") else: # since _maybe_downcast would split blocks anyway, we @@ -1528,9 +1527,7 @@ def where( oth = other[:, i : i + 1] submask = cond[:, i : i + 1] - rbs = nb.where( - oth, submask, _downcast=_downcast, using_cow=using_cow - ) + rbs = nb.where(oth, submask, _downcast=_downcast) res_blocks.extend(rbs) return res_blocks @@ -1579,7 +1576,6 @@ def fillna( limit: int | None = None, inplace: bool = False, downcast=None, - using_cow: bool = False, ) -> list[Block]: """ fillna on the block with the value. If we fail, then convert to @@ -1598,24 +1594,18 @@ def fillna( if noop: # we can't process the value, but nothing to do if inplace: - if using_cow: - return [self.copy(deep=False)] - # Arbitrarily imposing the convention that we ignore downcast - # on no-op when inplace=True - return [self] + return [self.copy(deep=False)] else: # GH#45423 consistent downcasting on no-ops. - nb = self.copy(deep=not using_cow) - nbs = nb._maybe_downcast( - [nb], downcast=downcast, using_cow=using_cow, caller="fillna" - ) + nb = self.copy(deep=False) + nbs = nb._maybe_downcast([nb], downcast=downcast, caller="fillna") return nbs if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: - nbs = self.putmask(mask.T, value, using_cow=using_cow) + nbs = self.putmask(mask.T, value) else: # without _downcast, we would break # test_fillna_dtype_conversion_equiv_replace @@ -1626,9 +1616,7 @@ def fillna( # different behavior in _maybe_downcast. return extend_blocks( [ - blk._maybe_downcast( - [blk], downcast=downcast, using_cow=using_cow, caller="fillna" - ) + blk._maybe_downcast([blk], downcast=downcast, caller="fillna") for blk in nbs ] ) @@ -1642,15 +1630,12 @@ def pad_or_backfill( limit: int | None = None, limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, - using_cow: bool = False, ) -> list[Block]: if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) # Dispatch to the NumpyExtensionArray method. 
# We know self.array_values is a NumpyExtensionArray bc EABlock overrides @@ -1669,7 +1654,7 @@ def pad_or_backfill( data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") + return nb._maybe_downcast([nb], downcast, caller="fillna") @final def interpolate( @@ -1682,7 +1667,6 @@ def interpolate( limit_direction: Literal["forward", "backward", "both"] = "forward", limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, - using_cow: bool = False, **kwargs, ) -> list[Block]: inplace = validate_bool_kwarg(inplace, "inplace") @@ -1693,20 +1677,16 @@ def interpolate( if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] # TODO(3.0): this case will not be reachable once GH#53638 is enforced if self.dtype == _dtype_obj: # only deal with floats # bc we already checked that can_hold_na, we don't have int dtype here # test_interp_basic checks that we make a copy here - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) # Dispatch to the EA method. new_values = self.array_values.interpolate( @@ -1722,7 +1702,7 @@ def interpolate( data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") + return nb._maybe_downcast([nb], downcast, caller="interpolate") @final def diff(self, n: int) -> list[Block]: @@ -1797,7 +1777,7 @@ def quantile( return new_block_2d(result, placement=self._mgr_locs) @final - def round(self, decimals: int, using_cow: bool = False) -> Self: + def round(self, decimals: int) -> Self: """ Rounds the values. If the block is not of an integer or float dtype, nothing happens. @@ -1809,26 +1789,19 @@ def round(self, decimals: int, using_cow: bool = False) -> Self: decimals: int, Number of decimal places to round to. 
Caller is responsible for validating this - using_cow: bool, - Whether Copy on Write is enabled right now """ if not self.is_numeric or self.is_bool: - return self.copy(deep=not using_cow) - refs = None + return self.copy(deep=False) # TODO: round only defined on BaseMaskedArray # Series also does this, so would need to fix both places # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]" # has no attribute "round" values = self.values.round(decimals) # type: ignore[union-attr] + + refs = None if values is self.values: - if not using_cow: - # Normally would need to do this before, but - # numpy only returns same array when round operation - # is no-op - # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 - values = values.copy() - else: - refs = self.refs + refs = self.refs + return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- @@ -1923,7 +1896,7 @@ def shift(self, periods: int, fill_value: Any = None) -> list[Block]: return [self.make_block_same_class(new_values)] @final - def setitem(self, indexer, value, using_cow: bool = False): + def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1936,8 +1909,6 @@ def setitem(self, indexer, value, using_cow: bool = False): The subset of self.values to set value : object The value being set - using_cow: bool, default False - Signaling if CoW is used. Returns ------- @@ -1980,9 +1951,7 @@ def setitem(self, indexer, value, using_cow: bool = False): return self @final - def where( - self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False - ) -> list[Block]: + def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: # _downcast private bc we only specify it when calling from fillna arr = self.values.T @@ -2000,9 +1969,7 @@ def where( if noop: # GH#44181, GH#45135 # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] + return [self.copy(deep=False)] try: res_values = arr._where(cond, other).T @@ -2011,19 +1978,15 @@ def where( if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) - return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow, caller="where" - ) + nbs = blk.where(orig_other, orig_cond) + return self._maybe_downcast(nbs, downcast=_downcast, caller="where") elif isinstance(self, NDArrayBackedExtensionBlock): # NB: not (yet) the same as # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_other) - nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) - return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow, caller="where" - ) + nbs = blk.where(orig_other, orig_cond) + return self._maybe_downcast(nbs, downcast=_downcast, caller="where") else: raise @@ -2041,7 +2004,7 @@ def where( n = orig_other[:, i : i + 1] submask = orig_cond[:, i : i + 1] - rbs = nb.where(n, submask, using_cow=using_cow) + rbs = nb.where(n, submask) res_blocks.extend(rbs) return res_blocks @@ -2049,7 +2012,7 @@ def where( return [nb] @final - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask(self, mask, new) -> list[Block]: """ See 
Block.putmask.__doc__ """ @@ -2063,11 +2026,9 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: mask = self._maybe_squeeze_arg(mask) if not mask.any(): - if using_cow: - return [self.copy(deep=False)] - return [self] + return [self.copy(deep=False)] - self = self._maybe_copy(using_cow, inplace=True) + self = self._maybe_copy(inplace=True) values = self.values if values.ndim == 2: values = values.T @@ -2149,7 +2110,6 @@ def pad_or_backfill( limit: int | None = None, limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, - using_cow: bool = False, ) -> list[Block]: values = self.values @@ -2191,7 +2151,6 @@ def fillna( limit: int | None = None, inplace: bool = False, downcast=None, - using_cow: bool = False, ) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Block.fillna handles coercion (test_fillna_interval) @@ -2200,13 +2159,12 @@ def fillna( limit=limit, inplace=inplace, downcast=downcast, - using_cow=using_cow, ) - if using_cow and self._can_hold_na and not self.values._hasna: + if self._can_hold_na and not self.values._hasna: refs = self.refs new_values = self.values else: - copy, refs = self._get_refs_and_copy(using_cow, inplace) + copy, refs = self._get_refs_and_copy(inplace) try: new_values = self.values.fillna( @@ -2230,7 +2188,7 @@ def fillna( ) nb = self.make_block_same_class(new_values, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna") + return nb._maybe_downcast([nb], downcast, caller="fillna") @cache_readonly def shape(self) -> Shape: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5a8a14168d504..cda5575a2b04e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -470,7 +470,6 @@ def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self: limit=limit, inplace=inplace, downcast=downcast, - using_cow=using_copy_on_write(), ) @final @@ -486,7 +485,6 @@ def where(self, other, cond, align: bool) -> Self: align_keys=align_keys, other=other, cond=cond, - using_cow=using_copy_on_write(), ) @final @@ -502,16 +500,11 @@ def putmask(self, mask, new, align: bool = True) -> Self: align_keys=align_keys, mask=mask, new=new, - using_cow=using_copy_on_write(), ) @final - def round(self, decimals: int, using_cow: bool = False) -> Self: - return self.apply( - "round", - decimals=decimals, - using_cow=using_cow, - ) + def round(self, decimals: int) -> Self: + return self.apply("round", decimals=decimals) @final def replace(self, to_replace, value, inplace: bool) -> Self: @@ -558,20 +551,10 @@ def replace_list( return bm def interpolate(self, inplace: bool, **kwargs) -> Self: - return self.apply( - "interpolate", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - ) + return self.apply("interpolate", inplace=inplace, **kwargs) def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: - return self.apply( - "pad_or_backfill", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - ) + return self.apply("pad_or_backfill", inplace=inplace, **kwargs) def shift(self, periods: int, fill_value) -> Self: if fill_value is lib.no_default: @@ -622,21 +605,7 @@ def diff(self, n: int) -> Self: return self.apply("diff", n=n) def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: - if copy is None: - if using_copy_on_write(): - copy = False - else: - copy = True - elif using_copy_on_write(): - copy = False - - return self.apply( - "astype", - 
dtype=dtype, - copy=copy, - errors=errors, - using_cow=using_copy_on_write(), - ) + return self.apply("astype", dtype=dtype, errors=errors) def convert(self, copy: bool | None) -> Self: if copy is None: @@ -650,14 +619,7 @@ def convert(self, copy: bool | None) -> Self: return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) def convert_dtypes(self, **kwargs): - if using_copy_on_write(): - copy = False - else: - copy = True - - return self.apply( - "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs - ) + return self.apply("convert_dtypes", **kwargs) def get_values_for_csv( self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None diff --git a/pandas/core/series.py b/pandas/core/series.py index d3c199286931f..78a3bdd2281ce 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2671,7 +2671,7 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: dtype: float64 """ nv.validate_round(args, kwargs) - new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + new_mgr = self._mgr.round(decimals=decimals) return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) From 5b9021d8fee9ce583fd778897ded797d9a5724ad Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 5 Feb 2024 19:39:15 +0000 Subject: [PATCH 2/3] CoW: Finish deprecation enforcal on block level --- pandas/core/generic.py | 2 +- pandas/core/internals/blocks.py | 118 ++++++----------------- pandas/core/internals/managers.py | 50 +++------- pandas/tests/internals/test_internals.py | 6 +- 4 files changed, 46 insertions(+), 130 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 61fb757fafec4..32ad8fad06119 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6643,7 +6643,7 @@ def infer_objects(self, copy: bool_t | None = None) -> Self: A int64 dtype: object """ - new_mgr = self._mgr.convert(copy=copy) + new_mgr = self._mgr.convert() res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) return res.__finalize__(self, method="infer_objects") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 02296643acc3e..5948b2caf221e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -549,9 +549,7 @@ def _maybe_downcast( if caller == "fillna" and get_option("future.no_silent_downcasting"): return blocks - nbs = extend_blocks( - [blk.convert(using_cow=True, copy=False) for blk in blocks] - ) + nbs = extend_blocks([blk.convert() for blk in blocks]) if caller == "fillna": if len(nbs) != len(blocks) or not all( x.dtype == y.dtype for x, y in zip(nbs, blocks) @@ -575,7 +573,7 @@ def _maybe_downcast( elif caller == "where" and get_option("future.no_silent_downcasting") is True: return blocks else: - nbs = extend_blocks([b._downcast_2d(downcast, True) for b in blocks]) + nbs = extend_blocks([b._downcast_2d(downcast) for b in blocks]) # When _maybe_downcast is called with caller="where", it is either # a) with downcast=False, which is a no-op (the desired future behavior) @@ -606,7 +604,7 @@ def _maybe_downcast( @final @maybe_split - def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: + def _downcast_2d(self, dtype) -> list[Block]: """ downcast specialized to 2D case post-validation. 
@@ -618,30 +616,19 @@ def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: return [self.make_block(new_values, refs=refs)] @final - def convert( - self, - *, - copy: bool = True, - using_cow: bool = False, - ) -> list[Block]: + def convert(self) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy of the block (if copy = True). """ if not self.is_object: - if not copy and using_cow: - return [self.copy(deep=False)] - return [self.copy()] if copy else [self] + return [self.copy(deep=False)] if self.ndim != 1 and self.shape[0] != 1: - blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow - ) + blocks = self.split_and_operate(Block.convert) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op - if using_cow: - return [self.copy(deep=False)] - return [self.copy()] if copy else [self] + return [self.copy(deep=False)] return blocks values = self.values @@ -655,9 +642,7 @@ def convert( convert_non_numeric=True, ) refs = None - if copy and res_values is values: - res_values = values.copy() - elif res_values is values: + if res_values is values: refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) @@ -674,7 +659,7 @@ def convert_dtypes( dtype_backend: DtypeBackend = "numpy_nullable", ) -> list[Block]: if infer_objects and self.is_object: - blks = self.convert(copy=False) + blks = self.convert() else: blks = [self] @@ -798,17 +783,6 @@ def _maybe_copy(self, inplace: bool) -> Self: return self.copy(deep=deep) return self.copy() - @final - def _maybe_copy_cow_check( - self, using_cow: bool = True, inplace: bool = True - ) -> Self: - if using_cow and inplace: - deep = self.refs.has_reference() - blk = self.copy(deep=deep) - else: - blk = self if inplace else self.copy() - return blk - @final def _get_refs_and_copy(self, inplace: bool): refs = None @@ -820,17 +794,6 @@ def _get_refs_and_copy(self, inplace: bool): refs = self.refs return copy, refs - @final - def _get_refs_and_copy_cow_check(self, using_cow: bool, inplace: bool): - refs = None - copy = not inplace - if inplace: - if using_cow and self.refs.has_reference(): - copy = True - else: - refs = self.refs - return copy, refs - # --------------------------------------------------------------------- # Replace @@ -842,7 +805,6 @@ def replace( inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, - using_cow: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -857,7 +819,7 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy_cow_check(using_cow, inplace) + blk = self._maybe_copy(inplace) values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] @@ -867,25 +829,19 @@ def replace( # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - if using_cow: - return [self.copy(deep=False)] - else: - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] if mask is None: mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. 
- if using_cow: - return [self.copy(deep=False)] - else: - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? - blk = self._maybe_copy_cow_check(using_cow, inplace) + blk = self._maybe_copy(inplace) putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): @@ -894,7 +850,7 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert() if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -935,7 +891,6 @@ def replace( value=value, inplace=True, mask=mask[i : i + 1], - using_cow=using_cow, ) ) return blocks @@ -947,7 +902,6 @@ def _replace_regex( value, inplace: bool = False, mask=None, - using_cow: bool = False, ) -> list[Block]: """ Replace elements by the given value. @@ -962,8 +916,6 @@ def _replace_regex( Perform inplace modification. mask : array-like of bool, optional True indicate corresponding element is ignored. - using_cow: bool, default False - Specifying if copy on write is enabled. Returns ------- @@ -972,17 +924,15 @@ def _replace_regex( if not self._can_hold_element(to_replace): # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock - if using_cow: - return [self.copy(deep=False)] - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] rx = re.compile(to_replace) - block = self._maybe_copy_cow_check(using_cow, inplace) + block = self._maybe_copy(inplace) replace_regex(block.values, rx, value, mask) - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert() opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1005,7 +955,6 @@ def replace_list( dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, - using_cow: bool = False, ) -> list[Block]: """ See BlockManager.replace_list docstring. 
@@ -1015,7 +964,7 @@ def replace_list( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy_cow_check(using_cow, inplace) + blk = self._maybe_copy(inplace) values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] @@ -1025,10 +974,7 @@ def replace_list( (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) ] if not len(pairs): - if using_cow: - return [self.copy(deep=False)] - # shortcut, nothing to replace - return [self] if inplace else [self.copy()] + return [self.copy(deep=False)] src_len = len(pairs) - 1 @@ -1055,12 +1001,9 @@ def replace_list( if inplace: masks = list(masks) - if using_cow: - # Don't set up refs here, otherwise we will think that we have - # references when we check again later - rb = [self] - else: - rb = [self if inplace else self.copy()] + # Don't set up refs here, otherwise we will think that we have + # references when we check again later + rb = [self] opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): @@ -1087,10 +1030,9 @@ def replace_list( mask=m, inplace=inplace, regex=regex, - using_cow=using_cow, ) - if using_cow and i != src_len: + if i != src_len: # This is ugly, but we have to get rid of intermediate refs # that did not go out of scope yet, otherwise we will trigger # many unnecessary copies @@ -1109,9 +1051,7 @@ def replace_list( # GH#44498 avoid unwanted cast-back nbs = [] for res_blk in result: - converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow - ) + converted = res_blk.convert() if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( # GH#54710 @@ -1139,7 +1079,6 @@ def _replace_coerce( mask: npt.NDArray[np.bool_], inplace: bool = True, regex: bool = False, - using_cow: bool = False, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1175,22 +1114,19 @@ def _replace_coerce( if mask.any(): has_ref = self.refs.has_reference() nb = self.astype(np.dtype(object)) - if (nb is self or using_cow) and not inplace: + if not inplace: nb = nb.copy() - elif inplace and has_ref and nb.refs.has_reference() and using_cow: + elif inplace and has_ref and nb.refs.has_reference(): # no copy in astype and we had refs before nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] - if using_cow: - return [self] - return [self] if inplace else [self.copy()] + return [self] return self.replace( to_replace=to_replace, value=value, inplace=inplace, mask=mask, - using_cow=using_cow, ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cda5575a2b04e..824ba54a5c151 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -517,16 +517,11 @@ def replace(self, to_replace, value, inplace: bool) -> Self: to_replace=to_replace, value=value, inplace=inplace, - using_cow=using_copy_on_write(), ) @final def replace_regex(self, **kwargs) -> Self: - return self.apply( - "_replace_regex", - **kwargs, - using_cow=using_copy_on_write(), - ) + return self.apply("_replace_regex", **kwargs) @final def replace_list( @@ -545,7 +540,6 @@ def replace_list( dest_list=dest_list, inplace=inplace, regex=regex, - using_cow=using_copy_on_write(), ) bm._consolidate_inplace() return bm @@ -571,7 +565,7 @@ def setitem(self, indexer, value) -> Self: if isinstance(indexer, 
np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if using_copy_on_write() and not self._has_no_reference(0): + if not self._has_no_reference(0): # this method is only called if there is a single block -> hardcoded 0 # Split blocks to only copy the columns we want to modify if self.ndim == 2 and isinstance(indexer, tuple): @@ -607,16 +601,8 @@ def diff(self, n: int) -> Self: def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self: return self.apply("astype", dtype=dtype, errors=errors) - def convert(self, copy: bool | None) -> Self: - if copy is None: - if using_copy_on_write(): - copy = False - else: - copy = True - elif using_copy_on_write(): - copy = False - - return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) + def convert(self) -> Self: + return self.apply("convert") def convert_dtypes(self, **kwargs): return self.apply("convert_dtypes", **kwargs) @@ -750,10 +736,7 @@ def copy_func(ax): new_axes = [copy_func(ax) for ax in self.axes] else: - if using_copy_on_write(): - new_axes = [ax.view() for ax in self.axes] - else: - new_axes = list(self.axes) + new_axes = [ax.view() for ax in self.axes] res = self.apply("copy", deep=deep) res.axes = new_axes @@ -1006,7 +989,7 @@ def _slice_take_blocks_ax0( # A non-consolidatable block, it's easy, because there's # only one item and each mgr loc is a copy of that single # item. - deep = not (only_slice or using_copy_on_write()) + deep = False for mgr_loc in mgr_locs: newblk = blk.copy(deep=deep) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) @@ -1017,8 +1000,7 @@ def _slice_take_blocks_ax0( # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice or using_copy_on_write(): - taker = lib.maybe_indices_to_slice(taker, max_len) + taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) @@ -1335,7 +1317,7 @@ def value_getitem(placement): blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write - if using_copy_on_write() and not self._has_no_reference_block(blkno_l): + if not self._has_no_reference_block(blkno_l): self._iset_split_block( blkno_l, blk_locs, value_getitem(val_locs), refs=refs ) @@ -1477,7 +1459,7 @@ def _iset_single( if inplace and blk.should_store(value): copy = False - if using_copy_on_write() and not self._has_no_reference_block(blkno): + if not self._has_no_reference_block(blkno): # perform Copy-on-Write and clear the reference copy = True iloc = self.blklocs[loc] @@ -1499,7 +1481,7 @@ def column_setitem( This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - if using_copy_on_write() and not self._has_no_reference(loc): + if not self._has_no_reference(loc): blkno = self.blknos[loc] # Split blocks to only copy the column we want to modify blk_loc = self.blklocs[loc] @@ -1863,7 +1845,7 @@ def as_array( else: arr = np.array(blk.values, dtype=dtype, copy=copy) - if using_copy_on_write() and not copy: + if not copy: arr = arr.view() arr.flags.writeable = False else: @@ -2140,7 +2122,7 @@ def _blklocs(self) -> None: # type: ignore[override] def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self: # similar to get_slice, but not restricted to slice indexer blk = self._block - if 
using_copy_on_write() and len(indexer) > 0 and indexer.all(): + if len(indexer) > 0 and indexer.all(): return type(self)(blk.copy(deep=False), self.index) array = blk.values[indexer] @@ -2214,11 +2196,9 @@ def setitem_inplace(self, indexer, value) -> None: in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - using_cow = using_copy_on_write() - if using_cow and not self._has_no_reference(0): - if using_cow: - self.blocks = (self._block.copy(),) - self._cache.clear() + if not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self._cache.clear() arr = self.array diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 96a0ccc33808a..89abd428f0ed7 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -612,7 +612,7 @@ def _compare(old_mgr, new_mgr): # noops mgr = create_mgr("f: i8; g: f8") - new_mgr = mgr.convert(copy=True) + new_mgr = mgr.convert() _compare(mgr, new_mgr) # convert @@ -620,7 +620,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert(copy=True) + new_mgr = mgr.convert() dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype @@ -634,7 +634,7 @@ def _compare(old_mgr, new_mgr): mgr.iset(0, np.array(["1"] * N, dtype=np.object_)) mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) - new_mgr = mgr.convert(copy=True) + new_mgr = mgr.convert() assert new_mgr.iget(0).dtype == dtype assert new_mgr.iget(1).dtype == dtype assert new_mgr.iget(2).dtype == dtype From 2fe93a2dbdfe54ee20dd04befcbd6d108bcfc49b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 9 Feb 2024 21:53:32 +0100 Subject: [PATCH 3/3] Fixup --- pandas/core/internals/blocks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 50821349fe6fa..ef5c936d35ae9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -819,7 +819,7 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy_cow_check(inplace) + blk = self._maybe_copy(inplace) values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] @@ -841,7 +841,7 @@ def replace( elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? - blk = self._maybe_copy_cow_check(inplace) + blk = self._maybe_copy(inplace) putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): @@ -928,7 +928,7 @@ def _replace_regex( rx = re.compile(to_replace) - block = self._maybe_copy_cow_check(inplace) + block = self._maybe_copy(inplace) replace_regex(block.values, rx, value, mask) @@ -964,7 +964,7 @@ def replace_list( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy_cow_check(inplace) + blk = self._maybe_copy(inplace) values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk]
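
A quick sketch of the behaviour this series enforces, as seen from the user API rather than the block internals. It assumes copy-on-write is enabled (pandas 2.x needs the option set explicitly; once these deprecations are enforced it is the only mode), and the frame contents are made up purely for illustration:

    import numpy as np
    import pandas as pd

    # Opt in on pandas 2.x; redundant once CoW is the only mode.
    pd.set_option("mode.copy_on_write", True)

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

    # With the copy/using_cow keywords gone from the block methods, astype
    # behaves lazily: a cast that is a view (float64 -> float64 here) keeps
    # a reference to the parent's buffers instead of copying up front.
    res = df.astype("float64")
    assert np.shares_memory(df["a"].to_numpy(), res["a"].to_numpy())

    # The copy only happens on the first write, so the parent stays intact.
    res.iloc[0, 0] = 99.0
    assert df.iloc[0, 0] == 1.0
    assert not np.shares_memory(df["a"].to_numpy(), res["a"].to_numpy())

The same lazy-copy pattern applies to round, convert/infer_objects, fillna and the replace family touched above: each block decides via refs.has_reference() whether a real copy is needed, instead of being told through copy/using_cow arguments.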