From 5be91f086c6bfab4ea59319a582aacfa3af27282 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 19 Jul 2022 00:29:08 +0200 Subject: [PATCH] Backport PR #47763: BUG: fix regression in Series[string] setitem setting a scalar with a mask --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/arrays/string_.py | 7 +++++++ pandas/tests/arrays/string_/test_string.py | 20 ++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 6ee140f59e096..6bd7378e05404 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) +- Fixed regression in setting ``None`` or non-string value into a ``string``-dtype Series using a mask (:issue:`47628`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 919b882f22ecb..655ccb3a474ae 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -17,6 +17,7 @@ from pandas._typing import ( Dtype, Scalar, + npt, type_t, ) from pandas.compat import pa_version_under1p01 @@ -413,6 +414,12 @@ def __setitem__(self, key, value): super().__setitem__(key, value) + def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: + # the super() method NDArrayBackedExtensionArray._putmask uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + ExtensionArray._putmask(self, mask, value) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b5b4007798135..24bb9df296a03 100644 --- 
a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -553,3 +553,23 @@ def test_isin(dtype, request, fixed_now_ts): result = s.isin(["a", fixed_now_ts]) expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + + +def test_setitem_scalar_with_mask_validation(dtype): + # https://github.com/pandas-dev/pandas/issues/47628 + # setting None with a boolean mask (through _putmask) should still result + # in pd.NA values in the underlying array + ser = pd.Series(["a", "b", "c"], dtype=dtype) + mask = np.array([False, True, False]) + + ser[mask] = None + assert ser.array[1] is pd.NA + + # for other non-string values we should also raise an error + ser = pd.Series(["a", "b", "c"], dtype=dtype) + if type(ser.array) is pd.arrays.StringArray: + msg = "Cannot set non-string value" + else: + msg = "Scalar must be NA or str" + with pytest.raises(ValueError, match=msg): + ser[mask] = 1