From 67ad7953e51d3af5a0b07f2c19773f322669cc91 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Jun 2023 20:29:46 -0700 Subject: [PATCH 1/3] REF: refactor out Block helpers for CoW --- pandas/core/internals/blocks.py | 125 ++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f1767befd18a..89e31f20debae 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -574,6 +574,65 @@ def copy(self, deep: bool = True) -> Self: # --------------------------------------------------------------------- # Replace + def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: + if using_cow and (self.refs.has_reference() or not inplace): + blk = self.copy() + elif using_cow: + blk = self.copy(deep=False) + else: + blk = self if inplace else self.copy() + return blk + + def _maybe_copy2(self, using_cow: bool, inplace: bool) -> Self: + if using_cow: + if inplace: + blk = self.copy(deep=self.refs.has_reference()) + else: + blk = self.copy() + else: + blk = self if inplace else self.copy() + return blk + + def _maybe_copy3(self, using_cow: bool, inplace: bool) -> Self: + if using_cow and inplace: + blk = self.copy(deep=self.refs.has_reference()) + else: + blk = self if inplace else self.copy() + return blk + + def _get_values_and_refs(self, using_cow, inplace): + if using_cow: + if inplace and not self.refs.has_reference(): + refs = self.refs + new_values = self.values + else: + refs = None + new_values = self.values.copy() + else: + refs = None + new_values = self.values if inplace else self.values.copy() + return new_values, refs + + def _get_refs_and_copy(self, using_cow: bool, inplace: bool): + refs = None + arr_inplace = inplace + if inplace: + if using_cow and self.refs.has_reference(): + arr_inplace = False + else: + refs = self.refs + return arr_inplace, refs + + def _get_refs_and_copy2(self, using_cow: bool, inplace: bool): + refs = None + arr_inplace = inplace + if using_cow: + if inplace and not self.refs.has_reference(): + refs = self.refs + else: + arr_inplace = False + return arr_inplace, refs + @final def replace( self, @@ -597,12 +656,7 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - if using_cow and (self.refs.has_reference() or not inplace): - blk = self.copy() - elif using_cow: - blk = self.copy(deep=False) - else: - blk = self if inplace else self.copy() + blk = self._maybe_copy(using_cow, inplace) values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] @@ -630,13 +684,7 @@ def replace( elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? - if using_cow: - if inplace: - blk = self.copy(deep=self.refs.has_reference()) - else: - blk = self.copy() - else: - blk = self if inplace else self.copy() + blk = self._maybe_copy2(using_cow, inplace) putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise @@ -712,16 +760,7 @@ def _replace_regex( rx = re.compile(to_replace) - if using_cow: - if inplace and not self.refs.has_reference(): - refs = self.refs - new_values = self.values - else: - refs = None - new_values = self.values.copy() - else: - refs = None - new_values = self.values if inplace else self.values.copy() + new_values, refs = self._get_values_and_refs(using_cow, inplace) replace_regex(new_values, rx, value, mask) @@ -745,10 +784,7 @@ def replace_list( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - if using_cow and inplace: - blk = self.copy(deep=self.refs.has_reference()) - else: - blk = self if inplace else self.copy() + blk = self._maybe_copy3(using_cow, inplace) values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] @@ -875,13 +911,18 @@ def _replace_coerce( if value is None: # gh-45601, gh-45836, gh-46634 if mask.any(): - has_ref = self.refs.has_reference() - nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) - if (nb is self or using_cow) and not inplace: - nb = nb.copy() - elif inplace and has_ref and nb.refs.has_reference(): - # no copy in astype and we had refs before - nb = nb.copy() + if self.dtype == _dtype_obj: + nb = self + if not inplace: + nb = nb.copy() + elif inplace and self.refs.has_reference(): + nb = nb.copy() + else: + nb = self.astype( + np.dtype(object), copy=False, using_cow=using_cow + ) + if using_cow and not inplace: + nb = nb.copy() # This looks like a double-copy putmask_inplace(nb.values, mask, value) return [nb] if using_cow: @@ -1429,13 +1470,7 @@ def interpolate( **kwargs, ) - refs = None - arr_inplace = inplace - if inplace: - if using_cow and self.refs.has_reference(): - arr_inplace = False - else: - refs = self.refs + arr_inplace, refs = self._get_refs_and_copy(using_cow, inplace) # Dispatch to the PandasArray method. # We know self.array_values is a PandasArray bc EABlock overrides @@ -2281,13 +2316,7 @@ def interpolate( # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs - refs = None - arr_inplace = inplace - if using_cow: - if inplace and not self.refs.has_reference(): - refs = self.refs - else: - arr_inplace = False + arr_inplace, refs = self._get_refs_and_copy2(using_cow, inplace) new_values = self.values.interpolate( method=method, From 0727e2f95fff34e9e11806d1ae458e3ee76452ca Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 26 Jun 2023 21:39:57 -0700 Subject: [PATCH 2/3] REF: de-duplicate helpers --- pandas/core/internals/blocks.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 89e31f20debae..262df8ee47594 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -575,27 +575,9 @@ def copy(self, deep: bool = True) -> Self: # Replace def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: - if using_cow and (self.refs.has_reference() or not inplace): - blk = self.copy() - elif using_cow: - blk = self.copy(deep=False) - else: - blk = self if inplace else self.copy() - return blk - - def _maybe_copy2(self, using_cow: bool, inplace: bool) -> Self: - if using_cow: - if inplace: - blk = self.copy(deep=self.refs.has_reference()) - else: - blk = self.copy() - else: - blk = self if inplace else self.copy() - return blk - - def _maybe_copy3(self, using_cow: bool, inplace: bool) -> Self: if using_cow and inplace: - blk = self.copy(deep=self.refs.has_reference()) + deep = self.refs.has_reference() + blk = self.copy(deep=deep) else: blk = self if inplace else self.copy() return blk @@ -684,7 +666,7 @@ def replace( elif self._can_hold_element(value): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? - blk = self._maybe_copy2(using_cow, inplace) + blk = self._maybe_copy(using_cow, inplace) putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise @@ -784,7 +766,7 @@ def replace_list( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self._maybe_copy3(using_cow, inplace) + blk = self._maybe_copy(using_cow, inplace) values = cast(Categorical, blk.values) values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] From 3fdc77403fa6da39036620c5e7c6a11d7191318b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Jun 2023 07:41:11 -0700 Subject: [PATCH 3/3] De-duplicate helper, revert _replace_coerce edits --- pandas/core/internals/blocks.py | 37 +++++++++++++-------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 262df8ee47594..cb4b4d44d0139 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -572,8 +572,9 @@ def copy(self, deep: bool = True) -> Self: return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs) # --------------------------------------------------------------------- - # Replace + # Copy-on-Write Helpers + @final def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: if using_cow and inplace: deep = self.refs.has_reference() @@ -582,6 +583,7 @@ def _maybe_copy(self, using_cow: bool, inplace: bool) -> Self: blk = self if inplace else self.copy() return blk + @final def _get_values_and_refs(self, using_cow, inplace): if using_cow: if inplace and not self.refs.has_reference(): @@ -595,6 +597,7 @@ def _get_values_and_refs(self, using_cow, inplace): new_values = self.values if inplace else self.values.copy() return new_values, refs + @final def _get_refs_and_copy(self, using_cow: bool, inplace: bool): refs = None arr_inplace = inplace @@ -605,15 +608,8 @@ def _get_refs_and_copy(self, using_cow: bool, inplace: bool): refs = self.refs return arr_inplace, refs - def _get_refs_and_copy2(self, using_cow: bool, inplace: bool): - refs = None - arr_inplace = inplace - if using_cow: - if inplace and not self.refs.has_reference(): - refs = self.refs - else: - arr_inplace = False - return arr_inplace, refs + # --------------------------------------------------------------------- + # Replace @final def replace( @@ -893,18 +889,13 @@ def _replace_coerce( if value is None: # gh-45601, gh-45836, gh-46634 if mask.any(): - if self.dtype == _dtype_obj: - nb = self - if not inplace: - nb = nb.copy() - elif inplace and self.refs.has_reference(): - nb = nb.copy() - else: - nb = self.astype( - np.dtype(object), copy=False, using_cow=using_cow - ) - if using_cow and not inplace: - nb = nb.copy() # This looks like a double-copy + has_ref = self.refs.has_reference() + nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow) + if (nb is self or using_cow) and not inplace: + nb = nb.copy() + elif inplace and has_ref and nb.refs.has_reference(): + # no copy in astype and we had refs before + nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] if using_cow: @@ -2298,7 +2289,7 @@ def interpolate( # "Literal['linear']") [comparison-overlap] if method == "linear": # type: ignore[comparison-overlap] # TODO: GH#50950 implement for arbitrary EAs - arr_inplace, refs = self._get_refs_and_copy2(using_cow, inplace) + arr_inplace, refs = self._get_refs_and_copy(using_cow, inplace) new_values = self.values.interpolate( method=method,