[backport 2.3.x] BUG (string dtype): replace with non-string to fall back to object dtype (#60285) (#60292)

jorisvandenbossche · web-flow · commit 54b47df88199 · 2024-11-14T09:22:39.000-08:00
* BUG (string dtype): replace with non-string to fall back to object dtype (#60285) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 938832b) * updates for 2.3 * fix inplace modification for 2.3.x branch with python storage
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -107,10 +107,10 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
+- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
--
 
 Interval
 ^^^^^^^^
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:
 
         return arr, self.dtype.na_value
 
-    def __setitem__(self, key, value) -> None:
-        value = extract_array(value, extract_numpy=True)
-        if isinstance(value, type(self)):
-            # extract_array doesn't extract NumpyExtensionArray subclasses
-            value = value._ndarray
-
-        key = check_array_indexer(self, key)
-        scalar_key = lib.is_scalar(key)
-        scalar_value = lib.is_scalar(value)
-        if scalar_key and not scalar_value:
-            raise ValueError("setting an array element with a sequence.")
-
-        # validate new items
-        if scalar_value:
+    def _maybe_convert_setitem_value(self, value):
+        """Maybe convert value to be pyarrow compatible."""
+        if lib.is_scalar(value):
             if isna(value):
                 value = self.dtype.na_value
             elif not isinstance(value, str):
@@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None:
                     "instead."
                 )
         else:
+            value = extract_array(value, extract_numpy=True)
             if not is_array_like(value):
                 value = np.asarray(value, dtype=object)
+            elif isinstance(value.dtype, type(self.dtype)):
+                return value
             else:
                 # cast categories and friends to arrays to see if values are
                 # compatible, compatibility with arrow backed strings
@@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None:
                     "Invalid value for dtype 'str'. Value should be a "
                     "string or missing value (or array of those)."
                 )
+        return value
 
-            mask = isna(value)
-            if mask.any():
-                value = value.copy()
-                value[isna(value)] = self.dtype.na_value
+    def __setitem__(self, key, value) -> None:
+        value = self._maybe_convert_setitem_value(value)
+
+        key = check_array_indexer(self, key)
+        scalar_key = lib.is_scalar(key)
+        scalar_value = lib.is_scalar(value)
+        if scalar_key and not scalar_value:
+            raise ValueError("setting an array element with a sequence.")
+
+        if not scalar_value:
+            if value.dtype == self.dtype:
+                value = value._ndarray
+            else:
+                value = np.asarray(value)
+                mask = isna(value)
+                if mask.any():
+                    value = value.copy()
+                    value[isna(value)] = self.dtype.na_value
 
         super().__setitem__(key, value)
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
             except (ValueError, TypeError):
                 return False
 
+        if dtype == "string":
+            try:
+                arr._maybe_convert_setitem_value(element)  # type: ignore[union-attr]
+                return True
+            except (ValueError, TypeError):
+                return False
+
         # This is technically incorrect, but maintains the behavior of
         # ExtensionBlock._can_hold_element
         return True
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -84,6 +84,7 @@
     ABCNumpyExtensionArray,
     ABCSeries,
 )
+from pandas.core.dtypes.inference import is_re
 from pandas.core.dtypes.missing import (
     is_valid_na_for_dtype,
     isna,
@@ -115,6 +116,7 @@
     PeriodArray,
     TimedeltaArray,
 )
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.base import PandasObject
 import pandas.core.common as com
 from pandas.core.computation import expressions
@@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
     # Up/Down-casting
 
     @final
-    def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
+    def coerce_to_target_dtype(
+        self, other, warn_on_upcast: bool = False, using_cow: bool = False
+    ) -> Block:
         """
         coerce the current block to a dtype compat for other
         we will return a block, possibly object, and not raise
@@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
                 f"{self.values.dtype}. Please report a bug at "
                 "https://github.com/pandas-dev/pandas/issues."
             )
-        return self.astype(new_dtype, copy=False)
+        copy = False
+        if (
+            not using_cow
+            and isinstance(self.dtype, StringDtype)
+            and self.dtype.storage == "python"
+        ):
+            copy = True
+        return self.astype(new_dtype, copy=copy, using_cow=using_cow)
 
     @final
     def _maybe_downcast(
@@ -879,7 +890,7 @@ def replace(
             else:
                 return [self] if inplace else [self.copy()]
 
-        elif self._can_hold_element(value):
+        elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)):
             # TODO(CoW): Maybe split here as well into columns where mask has True
             # and rest?
             blk = self._maybe_copy(using_cow, inplace)
@@ -926,12 +937,13 @@ def replace(
             if value is None or value is NA:
                 blk = self.astype(np.dtype(object))
             else:
-                blk = self.coerce_to_target_dtype(value)
+                blk = self.coerce_to_target_dtype(value, using_cow=using_cow)
             return blk.replace(
                 to_replace=to_replace,
                 value=value,
                 inplace=True,
                 mask=mask,
+                using_cow=using_cow,
             )
 
         else:
@@ -980,16 +992,26 @@ def _replace_regex(
         -------
         List[Block]
         """
-        if not self._can_hold_element(to_replace):
+        if not is_re(to_replace) and not self._can_hold_element(to_replace):
             # i.e. only if self.is_object is True, but could in principle include a
             #  String ExtensionBlock
             if using_cow:
                 return [self.copy(deep=False)]
             return [self] if inplace else [self.copy()]
 
-        rx = re.compile(to_replace)
+        if is_re(to_replace) and self.dtype not in [object, "string"]:
+            # only object or string dtype can hold strings, and a regex object
+            # will only match strings
+            return [self.copy(deep=False)]
 
-        block = self._maybe_copy(using_cow, inplace)
+        if not (
+            self._can_hold_element(value) or (self.dtype == "string" and is_re(value))
+        ):
+            block = self.astype(np.dtype(object))
+        else:
+            block = self._maybe_copy(using_cow, inplace)
+
+        rx = re.compile(to_replace)
 
         replace_regex(block.values, rx, value, mask)
 
@@ -1048,7 +1070,9 @@ def replace_list(
 
         # Exclude anything that we know we won't contain
         pairs = [
-            (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
+            (x, y)
+            for x, y in zip(src_list, dest_list)
+            if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
         ]
         if not len(pairs):
             if using_cow:
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
@@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character(
             expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_regex_replace_dict_nested_gh4115(self):
         df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
         expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
@@ -556,7 +555,6 @@ def test_replace_series_dict(self):
         result = df.replace(s, df.mean())
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_convert(self):
         # gh 3907
         df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
@@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self):
         with pytest.raises(ValueError, match=msg):
             df.replace(to_rep, values[1:])
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_input_formats_scalar(self):
         df = DataFrame(
             {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
@@ -961,7 +958,6 @@ def test_replace_limit(self):
         # TODO
         pass
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_dict_no_regex(self):
         answer = Series(
             {
@@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self):
             result = answer.replace(weights)
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_series_no_regex(self):
         answer = Series(
             {
@@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string):
         expect = DataFrame({"a": ["Y", "N", "Y"]})
         tm.assert_frame_equal(res, expect)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_period(self):
         d = {
             "fname": {
@@ -1141,7 +1135,6 @@ def test_replace_period(self):
             result = df.replace(d)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     def test_replace_datetime(self):
         d = {
             "fname": {
@@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp):
         result = df.replace(to_replace)
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     @pytest.mark.parametrize(
         "replacer",
         [
@@ -1644,7 +1636,6 @@ def test_regex_replace_scalar(
         expected.loc[expected["a"] == ".", "a"] = expected_replace_val
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
     @pytest.mark.parametrize("regex", [False, True])
     def test_replace_regex_dtype_frame(self, regex):
         # GH-48644
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
 
-        if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
-            with pytest.raises(TypeError, match="Invalid value"):
-                Index(obj, dtype=obj.dtype).where(~mask, val)
-        else:
-            res = Index(obj, dtype=obj.dtype).where(~mask, val)
-            expected_idx = Index(expected, dtype=expected.dtype)
-            tm.assert_index_equal(res, expected_idx)
+        res = Index(obj, dtype=obj.dtype).where(~mask, val)
+        expected_idx = Index(expected, dtype=expected.dtype)
+        tm.assert_index_equal(res, expected_idx)
 
     def test_index_putmask(self, obj, key, expected, warn, val):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
 
-        if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
-            with pytest.raises(TypeError, match="Invalid value"):
-                Index(obj, dtype=obj.dtype).putmask(mask, val)
-        else:
-            res = Index(obj, dtype=obj.dtype).putmask(mask, val)
-            tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
+        res = Index(obj, dtype=obj.dtype).putmask(mask, val)
+        tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string):
         ser = pd.Series(categorical)
         msg = "Downcasting behavior in `replace`"
         msg = "with CategoricalDtype is deprecated"
-        if using_infer_string:
-            with pytest.raises(TypeError, match="Invalid value"):
-                ser.replace({"A": 1, "B": 2})
-            return
         with tm.assert_produces_warning(FutureWarning, match=msg):
             result = ser.replace({"A": 1, "B": 2})
         expected = pd.Series(numeric).astype("category")
@@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex):
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("regex", [False, True])
-    def test_replace_regex_dtype_series_string(self, regex, using_infer_string):
-        if not using_infer_string:
-            # then this is object dtype which is already tested above
-            return
+    def test_replace_regex_dtype_series_string(self, regex):
         series = pd.Series(["0"], dtype="str")
-        with pytest.raises(TypeError, match="Invalid value"):
-            series.replace(to_replace="0", value=1, regex=regex)
+        expected = pd.Series([1], dtype="int64")
+        msg = "Downcasting behavior in `replace`"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = series.replace(to_replace="0", value=1, regex=regex)
+        tm.assert_series_equal(result, expected)
 
     def test_replace_different_int_types(self, any_int_numpy_dtype):
         # GH#45311