diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 719db5c03f07f..acd74591134bc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) + cdef bint is_valid_null(self, object value) except -1: + # We deliberately exclude None / NaN here since StringArray uses NA + return value is C_NA + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 84130132de4dc..c485d1f50dc9d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -93,9 +93,6 @@ class StringArray(PandasArray): StringArray is considered experimental. The implementation and parts of the API may change without warning. - In particular, the NA value used may change to no longer be - ``numpy.nan``. - Parameters ---------- values : array-like @@ -104,8 +101,11 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings. This may - change without warning in the future. + where the elements are Python strings or :attr:`pandas.NA`. + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + copy : bool, default False Whether to copy the array of data. @@ -119,6 +119,8 @@ class StringArray(PandasArray): See Also -------- + array + The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by a StringArray. @@ -165,12 +167,10 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "StringArray requires a sequence of strings or missing values." - ) + raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires a sequence of strings. Got " + "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) @@ -178,12 +178,22 @@ def _validate(self): def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = super()._from_sequence(scalars, dtype=object, copy=copy) + + result = np.asarray(scalars, dtype="object") + if copy and result is scalars: + result = result.copy() + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - result[result.isna()] = StringDtype.na_value - return result + na_values = isna(result) + if na_values.any(): + if result is scalars: + # force a copy now, if we haven't already + result = result.copy() + result[na_values] = StringDtype.na_value + + return cls(result) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f8d9eeb211a1e..0323eafff8dee 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ import numpy as np import pandas._libs.lib as lib +import pandas._libs.missing as libmissing import pandas._libs.ops as libops from pandas._typing import ArrayLike, Dtype from pandas.util._decorators import Appender @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str): return result -def _na_map(f, arr, na_result=np.nan, dtype=object): - # should really _check_ for NA +def _na_map(f, arr, na_result=None, dtype=object): if is_extension_array_dtype(arr.dtype): + if na_result is None: + na_result = libmissing.NA # just StringDtype arr = extract_array(arr) return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + if na_result is None: + na_result = np.nan return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 33e68f029922e..5e2f14af341ab 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -194,6 +194,25 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", None], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_from_sequence_no_mutate(copy): + a = np.array(["a", np.nan], dtype=object) + original = a.copy() + result = pd.arrays.StringArray._from_sequence(a, copy=copy) + expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(a, original) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d022b0e97877a..5eb85de2b90f5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1114,11 +1114,16 @@ def test_is_string_array(self): assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(["foo", "bar", np.nan], dtype=object), skipna=False + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False ) assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # NaN is not valid for string array, just NA + assert not lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a92f917820bd0..c37c78f3b9235 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method): if isinstance(expected, Series): if expected.dtype == "object" and lib.is_string_array( - expected.values, skipna=True + expected.dropna().values, ): assert result.dtype == "string" result = result.astype(object)