Skip to content

REF: move replace code out of Blocks #39559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
import operator
import re
from typing import Optional, Pattern, Union
from typing import Any, Optional, Pattern, Union

import numpy as np

Expand All @@ -13,13 +13,28 @@
is_datetimelike_v_numeric,
is_numeric_v_string_like,
is_re,
is_re_compilable,
is_scalar,
)
from pandas.core.dtypes.missing import isna


def should_use_regex(regex: bool, to_replace: Any) -> bool:
"""
Decide whether to treat `to_replace` as a regular expression.

Parameters
----------
regex : bool
    Regex flag supplied by the caller.
to_replace : Any
    Candidate pattern to inspect.

Returns
-------
bool
    Truthy only if regex handling was requested (or forced because
    ``is_re(to_replace)`` holds), ``is_re_compilable(to_replace)`` holds,
    and the compiled pattern is not the empty string.
"""
# A value accepted by `is_re` switches regex handling on, whatever flag
# the caller passed.
if is_re(to_replace):
regex = True

# Drop regex handling when `is_re_compilable` rejects the value; the
# `and` short-circuits, so the check only runs when regex is still truthy.
regex = regex and is_re_compilable(to_replace)

# Don't use regex if the pattern is empty.
regex = regex and re.compile(to_replace).pattern != ""
return regex


def compare_or_regex_search(
a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike
a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: np.ndarray
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values
Expand All @@ -32,12 +47,14 @@ def compare_or_regex_search(
a : array_like
b : scalar or regex pattern
regex : bool
mask : array_like
mask : np.ndarray[bool]

Returns
-------
mask : array_like of bool
"""
if isna(b):
return ~mask

def _check_comparison_types(
result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
Expand Down
62 changes: 18 additions & 44 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
)
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas._typing import ArrayLike, Dtype, DtypeObj, Scalar, Shape
from pandas._typing import ArrayLike, Dtype, DtypeObj, Shape
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand All @@ -44,8 +44,6 @@
is_integer,
is_list_like,
is_object_dtype,
is_re,
is_re_compilable,
is_sparse,
pandas_dtype,
)
Expand All @@ -59,7 +57,11 @@
putmask_smart,
putmask_without_repeat,
)
from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex
from pandas.core.array_algos.replace import (
compare_or_regex_search,
replace_regex,
should_use_regex,
)
from pandas.core.array_algos.transforms import shift
from pandas.core.arrays import (
Categorical,
Expand Down Expand Up @@ -817,6 +819,12 @@ def _replace_list(
"""
See BlockManager._replace_list docstring.
"""
# TODO: don't special-case Categorical
if self.is_categorical and len(algos.unique(dest_list)) == 1:
# We likely got here by tiling value inside NDFrame.replace,
# so un-tile here
return self.replace(src_list, dest_list[0], inplace, regex)

# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
Expand All @@ -827,21 +835,14 @@ def _replace_list(

src_len = len(pairs) - 1

def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray:
"""
Generate a bool array by performing an equality check, or by performing
element-wise regular expression matching
"""
if isna(s):
return ~mask

return compare_or_regex_search(self.values, s, regex, mask)

if self.is_object:
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(self.values)
masks = [comp(s[0], mask, regex) for s in pairs]
masks = [
compare_or_regex_search(self.values, s[0], regex=regex, mask=mask)
for s in pairs
]
else:
# GH#38086 faster if we know we don't need to check for regex
masks = [missing.mask_missing(self.values, s[0]) for s in pairs]
Expand Down Expand Up @@ -1464,7 +1465,7 @@ def _replace_coerce(
putmask_inplace(nb.values, mask, value)
return [nb]
else:
regex = _should_use_regex(regex, to_replace)
regex = should_use_regex(regex, to_replace)
if regex:
return self._replace_regex(
to_replace,
Expand Down Expand Up @@ -2353,44 +2354,17 @@ def replace(
# here with listlike to_replace or value, as those cases
# go through _replace_list

regex = _should_use_regex(regex, to_replace)
regex = should_use_regex(regex, to_replace)

if regex:
return self._replace_regex(to_replace, value, inplace=inplace)
else:
return super().replace(to_replace, value, inplace=inplace, regex=False)


def _should_use_regex(regex: bool, to_replace: Any) -> bool:
"""
Decide whether to treat `to_replace` as a regular expression.

Parameters
----------
regex : bool
    Regex flag supplied by the caller.
to_replace : Any
    Candidate pattern to inspect.

Returns
-------
bool
    Truthy only if regex handling was requested (or forced because
    ``is_re(to_replace)`` holds), ``is_re_compilable(to_replace)`` holds,
    and the compiled pattern is not the empty string.
"""
# A value accepted by `is_re` switches regex handling on, whatever flag
# the caller passed.
if is_re(to_replace):
regex = True

# Drop regex handling when `is_re_compilable` rejects the value; the
# `and` short-circuits, so the check only runs when regex is still truthy.
regex = regex and is_re_compilable(to_replace)

# Don't use regex if the pattern is empty.
regex = regex and re.compile(to_replace).pattern != ""
return regex


class CategoricalBlock(ExtensionBlock):
__slots__ = ()

def _replace_list(
self,
src_list: List[Any],
dest_list: List[Any],
inplace: bool = False,
regex: bool = False,
) -> List[Block]:
"""
Replace the values in `src_list` with the values in `dest_list`.

If `dest_list` holds exactly one unique value, the call is forwarded to
``self.replace`` with that single value; otherwise it is delegated to the
parent class implementation with the arguments unchanged.
"""
if len(algos.unique(dest_list)) == 1:
# We likely got here by tiling value inside NDFrame.replace,
# so un-tile here
return self.replace(src_list, dest_list[0], inplace, regex)
return super()._replace_list(src_list, dest_list, inplace, regex)

def replace(
self,
to_replace,
Expand Down