-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: df.replace with numeric values and str to_replace #36093
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8c8b36f
e44ff49
b3e3ad1
839b757
b683707
30be372
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
""" | ||
Methods used by Block.replace and related methods. | ||
""" | ||
import operator | ||
import re | ||
from typing import Optional, Pattern, Union | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import ArrayLike, Scalar | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_datetimelike_v_numeric, | ||
is_numeric_v_string_like, | ||
is_scalar, | ||
) | ||
from pandas.core.dtypes.missing import isna | ||
|
||
|
||
def compare_or_regex_search(
    a: ArrayLike,
    b: Union[Scalar, Pattern],
    regex: bool = False,
    mask: Optional[ArrayLike] = None,
) -> Union[ArrayLike, bool]:
    """
    Compare the elements of an array against a scalar or a regex pattern.

    Uses ``operator.eq`` when ``regex`` is False and an element-wise
    ``re.search`` when ``regex`` is True. Only the positions selected by
    ``mask`` are compared; every other position is reported as False.

    Parameters
    ----------
    a : array_like
        Values to search in.
    b : scalar or regex pattern
        Value (or pattern, when ``regex`` is True) to look for.
    regex : bool, default False
        Whether to treat ``b`` as a regular expression.
    mask : array_like or None (default)
        Boolean array marking the positions of ``a`` to compare. When
        omitted and ``a`` is an ndarray (and ``b`` is not), the non-NA
        positions of ``a`` are used.

    Returns
    -------
    mask : array_like of bool
        True where the element of ``a`` matches ``b``. A plain ``False`` is
        returned for a datetimelike-vs-numeric comparison when ``a`` is not
        an ndarray.

    Raises
    ------
    TypeError
        If ``a`` is an ndarray and the comparison collapses to a scalar,
        i.e. the two operands cannot be compared element-wise.
    """

    def _check_comparison_types(
        result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern]
    ):
        """
        Raises an error if the two arrays (a,b) cannot be compared.
        Otherwise, returns the comparison result as expected.
        """
        # A scalar result for an ndarray input means the element-wise
        # comparison was not actually performed.
        if is_scalar(result) and isinstance(a, np.ndarray):
            type_names = [f"ndarray(dtype={a.dtype})", type(b).__name__]

            raise TypeError(
                f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
            )

    if not regex:
        op = lambda x: operator.eq(x, b)
    else:
        # otypes is given explicitly so that a size-0 input (e.g. every
        # element masked out) yields an empty bool array instead of raising.
        op = np.vectorize(
            lambda x: bool(re.search(b, x))
            if isinstance(x, str) and isinstance(b, (str, Pattern))
            else False,
            otypes=[bool],
        )

    # GH#32621 use mask to avoid comparing to NAs
    if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
        mask = np.reshape(~(isna(a)), a.shape)
    if isinstance(a, np.ndarray) and mask is not None:
        # Guarding on ``mask is not None`` avoids ``a[None]``, which would
        # silently prepend a new axis instead of selecting elements.
        a = a[mask]

    if is_numeric_v_string_like(a, b):
        # GH#29553 avoid deprecation warnings from numpy
        # ``a`` may already be reduced to its masked subset, so the result
        # is sized from the mask to match the caller's original array.
        shape = mask.shape if mask is not None else a.shape
        return np.zeros(shape, dtype=bool)

    elif is_datetimelike_v_numeric(a, b):
        # GH#29553 avoid deprecation warnings from numpy
        _check_comparison_types(False, a, b)
        return False

    result = op(a)

    if isinstance(result, np.ndarray) and mask is not None:
        # The shape of the mask can differ to that of the result
        # since we may compare only a subset of a's or b's elements
        tmp = np.zeros(mask.shape, dtype=np.bool_)
        tmp[mask] = result
        result = tmp

    _check_comparison_types(result, a, b)
    return result
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ | |
from pandas._libs.internals import BlockPlacement | ||
from pandas._libs.tslibs import conversion | ||
from pandas._libs.tslibs.timezones import tz_compare | ||
from pandas._typing import ArrayLike | ||
from pandas._typing import ArrayLike, Scalar | ||
from pandas.util._validators import validate_bool_kwarg | ||
|
||
from pandas.core.dtypes.cast import ( | ||
|
@@ -59,6 +59,7 @@ | |
from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna | ||
|
||
import pandas.core.algorithms as algos | ||
from pandas.core.array_algos.replace import compare_or_regex_search | ||
from pandas.core.array_algos.transforms import shift | ||
from pandas.core.arrays import ( | ||
Categorical, | ||
|
@@ -792,7 +793,6 @@ def _replace_list( | |
self, | ||
src_list: List[Any], | ||
dest_list: List[Any], | ||
masks: List[np.ndarray], | ||
inplace: bool = False, | ||
regex: bool = False, | ||
) -> List["Block"]: | ||
|
@@ -801,11 +801,28 @@ def _replace_list( | |
""" | ||
src_len = len(src_list) - 1 | ||
|
||
def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: | ||
""" | ||
Generate a bool array by perform an equality check, or perform | ||
an element-wise regular expression matching | ||
""" | ||
if isna(s): | ||
return ~mask | ||
|
||
s = com.maybe_box_datetimelike(s) | ||
return compare_or_regex_search(self.values, s, regex, mask) | ||
|
||
# Calculate the mask once, prior to the call of comp | ||
# in order to avoid repeating the same computations | ||
mask = ~isna(self.values) | ||
|
||
masks = [comp(s, mask, regex) for s in src_list] | ||
|
||
rb = [self if inplace else self.copy()] | ||
for i, (src, dest) in enumerate(zip(src_list, dest_list)): | ||
new_rb: List["Block"] = [] | ||
for blk in rb: | ||
m = masks[i][blk.mgr_locs.indexer] | ||
m = masks[i] | ||
convert = i == src_len # only convert once at the end | ||
result = blk._replace_coerce( | ||
mask=m, | ||
|
@@ -2908,7 +2925,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: | |
""" | ||
if isinstance(mask, ExtensionArray): | ||
# We could have BooleanArray, Sparse[bool], ... | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we need to update this comment now, though - so is there no way to keep this in the same branch as the ExtensionArray check? It would be nice to stay as generic as possible. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll see if we can use to_numpy in the general case. |
||
mask = np.asarray(mask, dtype=np.bool_) | ||
# Except for BooleanArray, this is equivalent to just | ||
# np.asarray(mask, dtype=bool) | ||
mask = mask.to_numpy(dtype=bool, na_value=False) | ||
|
||
assert isinstance(mask, np.ndarray), type(mask) | ||
assert mask.dtype == bool, mask.dtype | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Related to this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, though there is also a mistake here: the second condition has been refactored to a few lines up, so this line should just be
elif is_datetimelike_v_numeric(a, b):
In master this is where we incorrectly raise instead of just treating string == numeric as not-equal.