Skip to content

Place the calculation of mask prior to the calls of comp in replace_list to improve performance #35229

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 15, 2020
19 changes: 12 additions & 7 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,18 +596,22 @@ def replace_list(
# figure out our mask a priori to avoid repeated replacements
values = self.as_array()

def comp(s, regex=False):
def comp(s: Scalar, mask: np.ndarray, regex: bool = False):
"""
Generate a bool array by performing an equality check, or by performing
an element-wise regular expression match
"""
if isna(s):
return isna(values)
return ~mask

s = com.maybe_box_datetimelike(s)
return _compare_or_regex_search(values, s, regex)
return _compare_or_regex_search(values, s, regex, mask)

masks = [comp(s, regex) for s in src_list]
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)

masks = [comp(s, mask, regex) for s in src_list]

result_blocks = []
src_len = len(src_list) - 1
Expand Down Expand Up @@ -1895,7 +1899,7 @@ def _merge_blocks(


def _compare_or_regex_search(
a: ArrayLike, b: Scalar, regex: bool = False
a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None
) -> Union[ArrayLike, bool]:
"""
Compare two array_like inputs of the same shape or two scalar values
Expand All @@ -1908,6 +1912,7 @@ def _compare_or_regex_search(
a : array_like
b : scalar
regex : bool, default False
mask : array_like or None (default)

Returns
-------
Expand Down Expand Up @@ -1941,7 +1946,7 @@ def _check_comparison_types(
)

# GH#32621 use mask to avoid comparing to NAs
if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
mask = np.reshape(~(isna(a)), a.shape)
if isinstance(a, np.ndarray):
a = a[mask]
Expand All @@ -1953,7 +1958,7 @@ def _check_comparison_types(

result = op(a)

if isinstance(result, np.ndarray):
if isinstance(result, np.ndarray) and mask is not None:
# The shape of the mask can differ from that of the result
# since we may compare only a subset of a's or b's elements
tmp = np.zeros(mask.shape, dtype=np.bool_)
Expand Down