-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Fix replacing in string
series with NA (pandas-dev#32621)
#32890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
47f6676
2b53200
7678495
719369d
e98c7c9
fb8d143
0405f6a
23da71c
ca81cb0
8b6d224
c32a2cc
0a76844
b62ad89
f5b994a
a73e2eb
949accc
e7b37a0
37da655
df5bc39
606aeb8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1941,6 +1941,24 @@ def _compare_or_regex_search(a, b, regex=False): | |
------- | ||
mask : array_like of bool | ||
""" | ||
|
||
def _check(result, a, b): | ||
if is_scalar(result) and ( | ||
isinstance(a, np.ndarray) or isinstance(b, np.ndarray) | ||
): | ||
type_names = [type(a).__name__, type(b).__name__] | ||
|
||
if is_a_array: | ||
type_names[0] = f"ndarray(dtype={a.dtype})" | ||
|
||
if is_b_array: | ||
type_names[1] = f"ndarray(dtype={b.dtype})" | ||
|
||
raise TypeError( | ||
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" | ||
) | ||
return result | ||
|
||
if not regex: | ||
op = lambda x: operator.eq(x, b) | ||
else: | ||
|
@@ -1951,25 +1969,29 @@ def _compare_or_regex_search(a, b, regex=False): | |
is_a_array = isinstance(a, np.ndarray) | ||
is_b_array = isinstance(b, np.ndarray) | ||
|
||
# GH#32621 use mask to avoid comparing to NAs | ||
if is_a_array and not is_b_array: | ||
mask = np.reshape(~(isna(a)), a.shape) | ||
elif is_b_array and not is_a_array: | ||
mask = np.reshape(~(isna(b)), b.shape) | ||
elif is_a_array and is_b_array: | ||
mask = ~(isna(a) | isna(b)) | ||
if is_a_array: | ||
a = a[mask] | ||
if is_b_array: | ||
b = b[mask] | ||
|
||
if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): | ||
# GH#29553 avoid deprecation warnings from numpy | ||
result = False | ||
return _check(False, a, b) | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't need to have an else here any longer |
||
result = op(a) | ||
if isinstance(result, np.ndarray): | ||
tmp = np.zeros(mask.shape, dtype=np.bool) | ||
tmp[mask] = result | ||
result = tmp | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this not just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because the shape of the mask can differ to that of the result, since we may select to compare only a subset of a's or b's elements (i.e. some of them may be NAs). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok can you add a comment on what this is doing (simnilar to what you just said) |
||
|
||
if is_scalar(result) and (is_a_array or is_b_array): | ||
type_names = [type(a).__name__, type(b).__name__] | ||
|
||
if is_a_array: | ||
type_names[0] = f"ndarray(dtype={a.dtype})" | ||
|
||
if is_b_array: | ||
type_names[1] = f"ndarray(dtype={b.dtype})" | ||
|
||
raise TypeError( | ||
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" | ||
) | ||
return result | ||
return _check(result, a, b) | ||
|
||
|
||
def _fast_count_smallints(arr): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -241,6 +241,13 @@ def test_replace2(self): | |
assert (ser[6:10] == -1).all() | ||
assert (ser[20:30] == -1).all() | ||
|
||
def test_replace_with_dictlike_and_string_dtype(self): | ||
# GH 32621 | ||
s = pd.Series(["one", "two", np.nan], dtype="string") | ||
expected = pd.Series(["1", "2", np.nan]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The expected result here should also be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Apparently that's not the case (even at the current stable version): >>> import pandas as pd
>>> pd.__version__
'1.0.3'
>>> s = pd.Series(["one", "two"], dtype="string")
>>> expected = pd.Series(["1", "2"], dtype="string")
>>> result = s.replace(to_replace={"one": "1", "two": "2"})
>>> expected
0 1
1 2
dtype: string
>>> result
0 1
1 2
dtype: object I'm not sure if that behaviour is to be expected or it should be tackled within a new issue. What do you think? This is not related to containing NA values. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would say that it is certainly expected for Although of course, there are not guarantees that your replacement value is a string ... For some cases of
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good observation. When not using the argument For more details check here: https://github.com/pandas-dev/pandas/blob/master/pandas/core/generic.py#L6462 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chrispe92 can you open an issue for this |
||
result = s.replace({"one": "1", "two": "2"}) | ||
tm.assert_series_equal(expected, result) | ||
|
||
def test_replace_with_empty_dictlike(self): | ||
# GH 15289 | ||
s = pd.Series(list("abcd")) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you give this a more informative name, maybe _check_comparision_types
and type the input args as much as possible