diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 6ee140f59e096..6bd7378e05404 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) +- Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c9abef226770c..c68ffec600c8a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -14,6 +14,7 @@ from pandas._typing import ( Dtype, Scalar, + npt, type_t, ) from pandas.compat import pa_version_under1p01 @@ -410,6 +411,12 @@ def __setitem__(self, key, value): super().__setitem__(key, value) + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + # the super() method NDArrayBackedExtensionArray._putmask uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + ExtensionArray._putmask(self, mask, value) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a5eb6189db6f1..4376a0de37a8c 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -588,3 +588,23 @@ def test_isin(dtype, fixed_now_ts): result = s.isin(["a", fixed_now_ts]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + + +def test_setitem_scalar_with_mask_validation(dtype): + # https://github.com/pandas-dev/pandas/issues/47628 + # setting None with a boolean mask (through _putmask) should still 
result + # in pd.NA values in the underlying array + ser = pd.Series(["a", "b", "c"], dtype=dtype) + mask = np.array([False, True, False]) + + ser[mask] = None + assert ser.array[1] is pd.NA + + # for other non-string values we should also raise an error + ser = pd.Series(["a", "b", "c"], dtype=dtype) + if type(ser.array) is pd.arrays.StringArray: + msg = "Cannot set non-string value" + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + ser[mask] = 1