BUG: Fix replacing in string series with NA (#32621) (#32890)

chrispe92 · web-flow · commit 3cca07c8a590 · 2020-04-10T13:40:50.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -470,8 +470,8 @@ Indexing
 
 Missing
 ^^^^^^^
-
 - Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
+- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`)
 - Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nulllable boolean dtype and with ``skipna=False`` (:issue:`33253`)
 
 MultiIndex
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib
-from pandas._typing import ArrayLike, DtypeObj, Label
+from pandas._typing import ArrayLike, DtypeObj, Label, Scalar
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -1887,7 +1887,9 @@ def _merge_blocks(
     return blocks
 
 
-def _compare_or_regex_search(a, b, regex=False):
+def _compare_or_regex_search(
+    a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], regex: bool = False
+) -> Union[ArrayLike, bool]:
     """
     Compare two array_like inputs of the same shape or two scalar values
 
@@ -1904,35 +1906,67 @@ def _compare_or_regex_search(a, b, regex=False):
     -------
     mask : array_like of bool
     """
+
+    def _check_comparison_types(
+        result: Union[ArrayLike, bool],
+        a: Union[ArrayLike, Scalar],
+        b: Union[ArrayLike, Scalar],
+    ) -> Union[ArrayLike, bool]:
+        """
+        Raises an error if the two arrays (a,b) cannot be compared.
+        Otherwise, returns the comparison result as expected.
+        """
+        if is_scalar(result) and (
+            isinstance(a, np.ndarray) or isinstance(b, np.ndarray)
+        ):
+            type_names = [type(a).__name__, type(b).__name__]
+
+            if isinstance(a, np.ndarray):
+                type_names[0] = f"ndarray(dtype={a.dtype})"
+
+            if isinstance(b, np.ndarray):
+                type_names[1] = f"ndarray(dtype={b.dtype})"
+
+            raise TypeError(
+                f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
+            )
+        return result
+
     if not regex:
         op = lambda x: operator.eq(x, b)
     else:
         op = np.vectorize(
-            lambda x: bool(re.search(b, x)) if isinstance(x, str) else False
+            lambda x: bool(re.search(b, x))
+            if isinstance(x, str) and isinstance(b, str)
+            else False
         )
 
-    is_a_array = isinstance(a, np.ndarray)
-    is_b_array = isinstance(b, np.ndarray)
+    # GH#32621 use mask to avoid comparing to NAs
+    if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
+        mask = np.reshape(~(isna(a)), a.shape)
+    elif isinstance(b, np.ndarray) and not isinstance(a, np.ndarray):
+        mask = np.reshape(~(isna(b)), b.shape)
+    elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray):
+        mask = ~(isna(a) | isna(b))
+    if isinstance(a, np.ndarray):
+        a = a[mask]
+    if isinstance(b, np.ndarray):
+        b = b[mask]
 
     if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b):
         # GH#29553 avoid deprecation warnings from numpy
-        result = False
-    else:
-        result = op(a)
-
-    if is_scalar(result) and (is_a_array or is_b_array):
-        type_names = [type(a).__name__, type(b).__name__]
+        return _check_comparison_types(False, a, b)
 
-        if is_a_array:
-            type_names[0] = f"ndarray(dtype={a.dtype})"
+    result = op(a)
 
-        if is_b_array:
-            type_names[1] = f"ndarray(dtype={b.dtype})"
+    if isinstance(result, np.ndarray):
+        # The shape of the mask can differ to that of the result
+        # since we may compare only a subset of a's or b's elements
+        tmp = np.zeros(mask.shape, dtype=np.bool)
+        tmp[mask] = result
+        result = tmp
 
-        raise TypeError(
-            f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
-        )
-    return result
+    return _check_comparison_types(result, a, b)
 
 
 def _fast_count_smallints(arr: np.ndarray) -> np.ndarray:
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -241,6 +241,13 @@ def test_replace2(self):
         assert (ser[6:10] == -1).all()
         assert (ser[20:30] == -1).all()
 
+    def test_replace_with_dictlike_and_string_dtype(self):
+        # GH 32621
+        s = pd.Series(["one", "two", np.nan], dtype="string")
+        expected = pd.Series(["1", "2", np.nan])
+        result = s.replace({"one": "1", "two": "2"})
+        tm.assert_series_equal(expected, result)
+
     def test_replace_with_empty_dictlike(self):
         # GH 15289
         s = pd.Series(list("abcd"))