From ac29c50e68fbeb5a6a3a3a6667b088c256049fab Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Feb 2021 15:27:12 -0800 Subject: [PATCH 1/2] REF: move some replace code from blocks --- pandas/core/array_algos/replace.py | 19 ++++++++- pandas/core/internals/blocks.py | 62 +++++++++--------------------- 2 files changed, 36 insertions(+), 45 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 1cac825cc0898..08afad406ae9a 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -3,7 +3,7 @@ """ import operator import re -from typing import Optional, Pattern, Union +from typing import Any, Optional, Pattern, Union import numpy as np @@ -13,11 +13,26 @@ is_datetimelike_v_numeric, is_numeric_v_string_like, is_re, + is_re_compilable, is_scalar, ) from pandas.core.dtypes.missing import isna +def should_use_regex(regex: bool, to_replace: Any) -> bool: + """ + Decide whether to treat `to_replace` as a regular expression. + """ + if is_re(to_replace): + regex = True + + regex = regex and is_re_compilable(to_replace) + + # Don't use regex if the pattern is empty. + regex = regex and re.compile(to_replace).pattern != "" + return regex + + def compare_or_regex_search( a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike ) -> Union[ArrayLike, bool]: @@ -38,6 +53,8 @@ def compare_or_regex_search( ------- mask : array_like of bool """ + if isna(b): + return ~mask def _check_comparison_types( result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eb8bb0fe90e9a..4f4cf0c7c79c3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -17,7 +17,7 @@ ) from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, Dtype, DtypeObj, Scalar, Shape +from pandas._typing import ArrayLike, Dtype, DtypeObj, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -44,8 +44,6 @@ is_integer, is_list_like, is_object_dtype, - is_re, - is_re_compilable, is_sparse, pandas_dtype, ) @@ -59,7 +57,11 @@ putmask_smart, putmask_without_repeat, ) -from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex +from pandas.core.array_algos.replace import ( + compare_or_regex_search, + replace_regex, + should_use_regex, +) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -817,6 +819,12 @@ def _replace_list( """ See BlockManager._replace_list docstring. """ + # TODO: dont special-case Categorical + if self.is_categorical and len(algos.unique(dest_list)) == 1: + # We likely got here by tiling value inside NDFrame.replace, + # so un-tile here + return self.replace(src_list, dest_list[0], inplace, regex) + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -827,21 +835,14 @@ def _replace_list( src_len = len(pairs) - 1 - def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - return compare_or_regex_search(self.values, s, regex, mask) - if self.is_object: # Calculate the mask once, prior to the call of comp # in order to avoid repeating the same computations mask = ~isna(self.values) - masks = [comp(s[0], mask, regex) for s in pairs] + masks = [ + compare_or_regex_search(self.values, s[0], regex=regex, mask=mask) + for s in pairs + ] else: # GH#38086 faster if we know we dont need to check for regex masks = [missing.mask_missing(self.values, s[0]) for s in pairs] @@ -1462,7 +1463,7 @@ def _replace_coerce( putmask_inplace(nb.values, mask, value) return [nb] else: - regex = _should_use_regex(regex, to_replace) + regex = should_use_regex(regex, to_replace) if regex: return self._replace_regex( to_replace, @@ -2347,7 +2348,7 @@ def replace( # here with listlike to_replace or value, as those cases # go through _replace_list - regex = _should_use_regex(regex, to_replace) + regex = should_use_regex(regex, to_replace) if regex: return self._replace_regex(to_replace, value, inplace=inplace) @@ -2355,36 +2356,9 @@ def replace( return super().replace(to_replace, value, inplace=inplace, regex=False) -def _should_use_regex(regex: bool, to_replace: Any) -> bool: - """ - Decide whether to treat `to_replace` as a regular expression. - """ - if is_re(to_replace): - regex = True - - regex = regex and is_re_compilable(to_replace) - - # Don't use regex if the pattern is empty. - regex = regex and re.compile(to_replace).pattern != "" - return regex - - class CategoricalBlock(ExtensionBlock): __slots__ = () - def _replace_list( - self, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> List[Block]: - if len(algos.unique(dest_list)) == 1: - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace, regex) - return super()._replace_list(src_list, dest_list, inplace, regex) - def replace( self, to_replace, From 9beb94af72f933629e431b62039a8b0732c57f82 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Feb 2021 08:17:10 -0800 Subject: [PATCH 2/2] mypy fixup --- pandas/core/array_algos/replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 08afad406ae9a..d0565dfff0eb1 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -34,7 +34,7 @@ def should_use_regex(regex: bool, to_replace: Any) -> bool: def compare_or_regex_search( - a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike + a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: np.ndarray ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -47,7 +47,7 @@ def compare_or_regex_search( a : array_like b : scalar or regex pattern regex : bool - mask : array_like + mask : np.ndarray[bool] Returns -------