Skip to content

Commit 3cca07c

Browse files
authored
BUG: Fix replacing in string series with NA (#32621) (#32890)
1 parent a2cdd50 commit 3cca07c

File tree

3 files changed

+61
-20
lines changed

3 files changed

+61
-20
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -470,8 +470,8 @@ Indexing
470470

471471
Missing
472472
^^^^^^^
473-
474473
- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
474+
- Bug in :meth:`replace` where passing a dict or list as ``to_replace`` on a :class:`Series` containing ``<NA>`` raised a ``TypeError``. The method now ignores ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`)
475475
- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable boolean dtype and with ``skipna=False`` (:issue:`33253`)
476476

477477
MultiIndex

pandas/core/internals/managers.py

+53-19
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99

1010
from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib
11-
from pandas._typing import ArrayLike, DtypeObj, Label
11+
from pandas._typing import ArrayLike, DtypeObj, Label, Scalar
1212
from pandas.util._validators import validate_bool_kwarg
1313

1414
from pandas.core.dtypes.cast import (
@@ -1887,7 +1887,9 @@ def _merge_blocks(
18871887
return blocks
18881888

18891889

1890-
def _compare_or_regex_search(a, b, regex=False):
1890+
def _compare_or_regex_search(
1891+
a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], regex: bool = False
1892+
) -> Union[ArrayLike, bool]:
18911893
"""
18921894
Compare two array_like inputs of the same shape or two scalar values
18931895
@@ -1904,35 +1906,67 @@ def _compare_or_regex_search(a, b, regex=False):
19041906
-------
19051907
mask : array_like of bool
19061908
"""
1909+
1910+
def _check_comparison_types(
1911+
result: Union[ArrayLike, bool],
1912+
a: Union[ArrayLike, Scalar],
1913+
b: Union[ArrayLike, Scalar],
1914+
) -> Union[ArrayLike, bool]:
1915+
"""
1916+
Raises an error if the two arrays (a,b) cannot be compared.
1917+
Otherwise, returns the comparison result as expected.
1918+
"""
1919+
if is_scalar(result) and (
1920+
isinstance(a, np.ndarray) or isinstance(b, np.ndarray)
1921+
):
1922+
type_names = [type(a).__name__, type(b).__name__]
1923+
1924+
if isinstance(a, np.ndarray):
1925+
type_names[0] = f"ndarray(dtype={a.dtype})"
1926+
1927+
if isinstance(b, np.ndarray):
1928+
type_names[1] = f"ndarray(dtype={b.dtype})"
1929+
1930+
raise TypeError(
1931+
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
1932+
)
1933+
return result
1934+
19071935
if not regex:
19081936
op = lambda x: operator.eq(x, b)
19091937
else:
19101938
op = np.vectorize(
1911-
lambda x: bool(re.search(b, x)) if isinstance(x, str) else False
1939+
lambda x: bool(re.search(b, x))
1940+
if isinstance(x, str) and isinstance(b, str)
1941+
else False
19121942
)
19131943

1914-
is_a_array = isinstance(a, np.ndarray)
1915-
is_b_array = isinstance(b, np.ndarray)
1944+
# GH#32621 use mask to avoid comparing to NAs
1945+
if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
1946+
mask = np.reshape(~(isna(a)), a.shape)
1947+
elif isinstance(b, np.ndarray) and not isinstance(a, np.ndarray):
1948+
mask = np.reshape(~(isna(b)), b.shape)
1949+
elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray):
1950+
mask = ~(isna(a) | isna(b))
1951+
if isinstance(a, np.ndarray):
1952+
a = a[mask]
1953+
if isinstance(b, np.ndarray):
1954+
b = b[mask]
19161955

19171956
if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
19181957
# GH#29553 avoid deprecation warnings from numpy
1919-
result = False
1920-
else:
1921-
result = op(a)
1922-
1923-
if is_scalar(result) and (is_a_array or is_b_array):
1924-
type_names = [type(a).__name__, type(b).__name__]
1958+
return _check_comparison_types(False, a, b)
19251959

1926-
if is_a_array:
1927-
type_names[0] = f"ndarray(dtype={a.dtype})"
1960+
result = op(a)
19281961

1929-
if is_b_array:
1930-
type_names[1] = f"ndarray(dtype={b.dtype})"
1962+
if isinstance(result, np.ndarray):
1963+
# The shape of the mask can differ to that of the result
1964+
# since we may compare only a subset of a's or b's elements
1965+
tmp = np.zeros(mask.shape, dtype=np.bool_)
1966+
tmp[mask] = result
1967+
result = tmp
19311968

1932-
raise TypeError(
1933-
f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
1934-
)
1935-
return result
1969+
return _check_comparison_types(result, a, b)
19361970

19371971

19381972
def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:

pandas/tests/series/methods/test_replace.py

+7
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,13 @@ def test_replace2(self):
241241
assert (ser[6:10] == -1).all()
242242
assert (ser[20:30] == -1).all()
243243

244+
def test_replace_with_dictlike_and_string_dtype(self):
245+
# GH 32621
246+
s = pd.Series(["one", "two", np.nan], dtype="string")
247+
expected = pd.Series(["1", "2", np.nan])
248+
result = s.replace({"one": "1", "two": "2"})
249+
tm.assert_series_equal(expected, result)
250+
244251
def test_replace_with_empty_dictlike(self):
245252
# GH 15289
246253
s = pd.Series(list("abcd"))

0 commit comments

Comments
 (0)