From 788f3fa968c6382262a9806153d83405dab8cc7c Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 7 Nov 2020 19:24:23 -0800 Subject: [PATCH 1/7] REF: implement replace_regex, remove unreachable branch --- pandas/core/array_algos/replace.py | 46 +++++++++++++++++++++++++++++- pandas/core/internals/blocks.py | 40 +++----------------------- 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 9eaa265adab2b..76d723beac7e6 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -3,7 +3,7 @@ """ import operator import re -from typing import Pattern, Union +from typing import Optional, Pattern, Union import numpy as np @@ -12,8 +12,10 @@ from pandas.core.dtypes.common import ( is_datetimelike_v_numeric, is_numeric_v_string_like, + is_re, is_scalar, ) +from pandas.core.dtypes.missing import isna def compare_or_regex_search( @@ -87,3 +89,45 @@ def _check_comparison_types( _check_comparison_types(result, a, b) return result + + +def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: Optional[np.ndarray]): + """ + Parameters + ---------- + values : ArrayLike + Object dtype. + rx : re.Pattern + value : Any + mask : np.ndarray[bool], optional + + Notes + ----- + Alters values in-place. + """ + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, str): + + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return value if rx.search(s) is not None else s + else: + return s + + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return rx.sub(value, s) + else: + return s + + f = np.vectorize(re_replacer, otypes=[values.dtype]) + + if mask is None: + values[:] = f(values) + else: + values[mask] = f(values[mask]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8e01aaa396265..581287439e98f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -60,7 +60,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos -from pandas.core.array_algos.replace import compare_or_regex_search +from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -2504,7 +2504,8 @@ def replace( ) -> List["Block"]: to_rep_is_list = is_list_like(to_replace) value_is_list = is_list_like(value) - both_lists = to_rep_is_list and value_is_list + # Note: we will never have both to_rep_is_list and value_is_list, + # as that case will go through _replace_list. either_list = to_rep_is_list or value_is_list result_blocks: List["Block"] = [] @@ -2514,14 +2515,6 @@ def replace( return self._replace_single(to_replace, value, inplace=inplace, regex=True) elif not (either_list or regex): return super().replace(to_replace, value, inplace=inplace, regex=regex) - elif both_lists: - for to_rep, v in zip(to_replace, value): - result_blocks = [] - for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, regex=regex) - result_blocks.extend(result) - blocks = result_blocks - return result_blocks elif to_rep_is_list and regex: for to_rep in to_replace: @@ -2588,32 +2581,7 @@ def _replace_single( return super().replace(to_replace, value, inplace=inplace, regex=regex) new_values = self.values if inplace else self.values.copy() - - # deal with replacing values with objects (strings) that match but - # whose replacement is not a string (numeric, nan, object) - if isna(value) or not isinstance(value, str): - - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return value if rx.search(s) is not None else s - else: - return s - - else: - # value is guaranteed to be a string here, s can be either a string - # or null if it's null it gets returned - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return rx.sub(value, s) - else: - return s - - f = np.vectorize(re_replacer, otypes=[self.dtype]) - - if mask is None: - new_values[:] = f(new_values) - else: - new_values[mask] = f(new_values[mask]) + replace_regex(new_values, rx, value, mask) # convert block = self.make_block(new_values) From 251669088bf4db6c582e1ef408f8104c972ae979 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 7 Nov 2020 20:10:08 -0800 Subject: [PATCH 2/7] CLN: remove unreachable branch from ObjectBlock.replace --- pandas/core/internals/blocks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 581287439e98f..788859c1f3e92 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2503,17 +2503,17 @@ def replace( regex: bool = False, ) -> List["Block"]: to_rep_is_list = is_list_like(to_replace) - value_is_list = is_list_like(value) - # Note: we will never have both to_rep_is_list and value_is_list, - # as that case will go through _replace_list. - either_list = to_rep_is_list or value_is_list + + # The checks done in NDFrame.replace before calling _mgr.replace + # ensure that we never get here with listlike value + assert not is_list_like(value) result_blocks: List["Block"] = [] blocks: List["Block"] = [self] - if not either_list and is_re(to_replace): + if not to_rep_is_list and is_re(to_replace): return self._replace_single(to_replace, value, inplace=inplace, regex=True) - elif not (either_list or regex): + elif not (to_rep_is_list or regex): return super().replace(to_replace, value, inplace=inplace, regex=regex) elif to_rep_is_list and regex: From 8d226e6c7de7e1a6b216e9e285682a92ff15592c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 10:11:11 -0800 Subject: [PATCH 3/7] REF: simplify NDFrame.replace, ObjectBlock.replace --- pandas/core/generic.py | 34 +++++++-------- pandas/core/internals/blocks.py | 41 ++++++++----------- .../tests/arrays/categorical/test_replace.py | 3 +- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bea650c1b50fd..02fa7308e7ee8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6744,25 +6744,25 @@ def replace( else: raise TypeError("value argument must be scalar, dict, or Series") - elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if is_list_like(value): - if len(to_replace) != len(value): - raise ValueError( - f"Replacement lists must match in length. " - f"Expecting {len(to_replace)} got {len(value)} " - ) - self._consolidate_inplace() - new_data = self._mgr.replace_list( - src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex, + elif is_list_like(to_replace): + if not is_list_like(value): + # e.g. to_replace = [NA, ''] and value is 0, + # so we replace NA with 0 and then replace '' with 0 + value = [value] * len(to_replace) + + # e.g. we have to_replace = [NA, ''] and value = [0, 'missing'] + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " ) + new_data = self._mgr.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) - else: # [NA, ''] -> 0 - new_data = self._mgr.replace( - to_replace=to_replace, value=value, inplace=inplace, regex=regex - ) elif to_replace is None: if not ( is_re_compilable(regex) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 788859c1f3e92..fd23b89365496 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2502,33 +2502,15 @@ def replace( inplace: bool = False, regex: bool = False, ) -> List["Block"]: - to_rep_is_list = is_list_like(to_replace) + # Note: the checks we do in NDFrame.replace ensure we never get + # here with listlike to_replace or value, as those cases + # go through _replace_list - # The checks done in NDFrame.replace before calling _mgr.replace - # ensure that we never get here with listlike value - assert not is_list_like(value) - - result_blocks: List["Block"] = [] - blocks: List["Block"] = [self] - - if not to_rep_is_list and is_re(to_replace): + if is_re(to_replace) or regex: return self._replace_single(to_replace, value, inplace=inplace, regex=True) - elif not (to_rep_is_list or regex): + else: return super().replace(to_replace, value, inplace=inplace, regex=regex) - elif to_rep_is_list and regex: - for to_rep in to_replace: - result_blocks = [] - for b in blocks: - result = b._replace_single( - to_rep, value, inplace=inplace, regex=regex - ) - result_blocks.extend(result) - blocks = result_blocks - return result_blocks - - return self._replace_single(to_replace, value, inplace=inplace, regex=regex) - def _replace_single( self, to_replace, @@ -2595,6 +2577,19 @@ def _replace_single( class CategoricalBlock(ExtensionBlock): __slots__ = () + def _replace_list( + self, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + if len(algos.unique(dest_list)) == 1: + # We got likely here by tiling value inside NDFrame.replace, + # so un-tile here + return self.replace(src_list, dest_list[0], inplace, regex) + return super()._replace_list(src_list, dest_list, inplace, regex) + def replace( self, to_replace, diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 5889195ad68db..007c4bdea17f8 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -21,6 +21,7 @@ ((1, 2, 4), 5, [5, 5, 3], False), ((5, 6), 2, [1, 2, 3], False), # many-to-many, handled outside of Categorical and results in separate dtype + # except for cases with only 1 unique entry in `value` ([1], [2], [2, 2, 3], True), ([1, 4], [5, 2], [5, 2, 3], True), # check_categorical sorts categories, which crashes on mixed dtypes @@ -30,7 +31,7 @@ ) def test_replace(to_replace, value, expected, flip_categories): # GH 31720 - stays_categorical = not isinstance(value, list) + stays_categorical = not isinstance(value, list) or len(pd.unique(value)) == 1 s = pd.Series([1, 2, 3], dtype="category") result = s.replace(to_replace, value) From 744c00ae1846bed483e34a4b91bb18242e6daf73 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 11:43:54 -0800 Subject: [PATCH 4/7] REF: replace_single->replace_regex, dont call super from it --- pandas/core/internals/blocks.py | 63 +++++++++++++++++---------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fd23b89365496..15174eac9acda 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -813,12 +813,11 @@ def replace( ) return blocks - def _replace_single( + def _replace_regex( self, to_replace, value, inplace: bool = False, - regex: bool = False, convert: bool = True, mask=None, ) -> List["Block"]: @@ -1598,14 +1597,16 @@ def _replace_coerce( self = self.coerce_to_target_dtype(value) return self.putmask(mask, value, inplace=inplace) else: - return self._replace_single( - to_replace, - value, - inplace=inplace, - regex=regex, - convert=False, - mask=mask, - ) + regex = _should_use_regex(regex, to_replace) + if regex: + return self._replace_regex( + to_replace, + value, + inplace=inplace, + convert=False, + mask=mask, + ) + return self.replace(to_replace, value, inplace=inplace, regex=False) return [self] @@ -2506,17 +2507,18 @@ def replace( # here with listlike to_replace or value, as those cases # go through _replace_list - if is_re(to_replace) or regex: - return self._replace_single(to_replace, value, inplace=inplace, regex=True) + regex = _should_use_regex(regex, to_replace) + + if regex: + return self._replace_regex(to_replace, value, inplace=inplace) else: - return super().replace(to_replace, value, inplace=inplace, regex=regex) + return super().replace(to_replace, value, inplace=inplace, regex=False) - def _replace_single( + def _replace_regex( self, to_replace, value, inplace: bool = False, - regex: bool = False, convert: bool = True, mask=None, ) -> List["Block"]: @@ -2544,23 +2546,10 @@ def _replace_single( """ inplace = validate_bool_kwarg(inplace, "inplace") - # to_replace is regex compilable - regex = regex and is_re_compilable(to_replace) + regex = _should_use_regex(True, to_replace) + assert regex - # try to get the pattern attribute (compiled re) or it's a string - if is_re(to_replace): - pattern = to_replace.pattern - else: - pattern = to_replace - - # if the pattern is not empty and to_replace is either a string or a - # regex - if regex and pattern: - rx = re.compile(to_replace) - else: - # if the thing to replace is not a string or compiled regex call - # the superclass method -> to_replace is some kind of object - return super().replace(to_replace, value, inplace=inplace, regex=regex) + rx = re.compile(to_replace) new_values = self.values if inplace else self.values.copy() replace_regex(new_values, rx, value, mask) @@ -2574,6 +2563,18 @@ def _replace_single( return nbs +def _should_use_regex(regex: bool, to_replace: Any) -> bool: + # to_replace is regex compilable + if is_re(to_replace): + regex = True + + regex = regex and is_re_compilable(to_replace) + + # Don't use regex if the pattern is empty. + regex = regex and re.compile(to_replace).pattern != "" + return regex + + class CategoricalBlock(ExtensionBlock): __slots__ = () From 4c7ff6fff27c62547b81a5fd5f04602cfec81f40 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 13:28:09 -0800 Subject: [PATCH 5/7] REF: remove overriden replace_regex --- pandas/core/internals/blocks.py | 87 ++++++++++++++------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 15174eac9acda..afc92306d0f7f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -821,8 +821,43 @@ def _replace_regex( convert: bool = True, mask=None, ) -> List["Block"]: - """ no-op on a non-ObjectBlock """ - return [self] if inplace else [self.copy()] + """ + Replace elements by the given value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + List[Block] + """ + if not self._can_hold_element(to_replace): + # i.e. only ObjectBlock, but could in principle include a + # String ExtensionBlock + return [self] if inplace else [self.copy()] + + rx = re.compile(to_replace) + + new_values = self.values if inplace else self.values.copy() + replace_regex(new_values, rx, value, mask) + + # convert + block = self.make_block(new_values) + if convert: + nbs = block.convert(numeric=False) + else: + nbs = [block] + return nbs def _replace_list( self, @@ -2514,54 +2549,6 @@ def replace( else: return super().replace(to_replace, value, inplace=inplace, regex=False) - def _replace_regex( - self, - to_replace, - value, - inplace: bool = False, - convert: bool = True, - mask=None, - ) -> List["Block"]: - """ - Replace elements by the given value. - - Parameters - ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - regex : bool, default False - If true, perform regular expression substitution. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. - - Returns - ------- - List[Block] - """ - inplace = validate_bool_kwarg(inplace, "inplace") - - regex = _should_use_regex(True, to_replace) - assert regex - - rx = re.compile(to_replace) - - new_values = self.values if inplace else self.values.copy() - replace_regex(new_values, rx, value, mask) - - # convert - block = self.make_block(new_values) - if convert: - nbs = block.convert(numeric=False) - else: - nbs = [block] - return nbs - def _should_use_regex(regex: bool, to_replace: Any) -> bool: # to_replace is regex compilable From 2886c7c8996bc3c1b71c02672982cbf190a9766b Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 19:37:00 -0800 Subject: [PATCH 6/7] prune superfluous comment --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index afc92306d0f7f..2f9a1cf2ebb72 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -851,7 +851,6 @@ def _replace_regex( new_values = self.values if inplace else self.values.copy() replace_regex(new_values, rx, value, mask) - # convert block = self.make_block(new_values) if convert: nbs = block.convert(numeric=False) From 488c465e7b1dbfcd9d7397ebbc2ab66e4ad30190 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 8 Nov 2020 19:42:37 -0800 Subject: [PATCH 7/7] docstring --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2f9a1cf2ebb72..ed77a210b6913 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2550,7 +2550,9 @@ def replace( def _should_use_regex(regex: bool, to_replace: Any) -> bool: - # to_replace is regex compilable + """ + Decide whether to treat `to_replace` as a regular expression. + """ if is_re(to_replace): regex = True