Skip to content

Commit 8140466

Browse files
meeseeksmachine and TomAugspurger
authored and committed
Backport PR #30980: API: Disallow NaN in StringArray constructor (#31000)
Co-authored-by: Tom Augspurger <[email protected]>
1 parent 5bb0400 commit 8140466

File tree

6 files changed

+58
-16
lines changed

6 files changed

+58
-16
lines changed

pandas/_libs/lib.pyx

+4
Original file line number | Diff line number | Diff line change
@@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator):
16241624
cdef inline bint is_array_typed(self) except -1:
16251625
return issubclass(self.dtype.type, np.str_)
16261626

1627+
cdef bint is_valid_null(self, object value) except -1:
1628+
# We deliberately exclude None / NaN here since StringArray uses NA
1629+
return value is C_NA
1630+
16271631

16281632
cpdef bint is_string_array(ndarray values, bint skipna=False):
16291633
cdef:

pandas/core/arrays/string_.py

+22 -12
Original file line number | Diff line number | Diff line change
@@ -93,9 +93,6 @@ class StringArray(PandasArray):
9393
StringArray is considered experimental. The implementation and
9494
parts of the API may change without warning.
9595
96-
In particular, the NA value used may change to no longer be
97-
``numpy.nan``.
98-
9996
Parameters
10097
----------
10198
values : array-like
@@ -104,8 +101,11 @@ class StringArray(PandasArray):
104101
.. warning::
105102
106103
Currently, this expects an object-dtype ndarray
107-
where the elements are Python strings. This may
108-
change without warning in the future.
104+
where the elements are Python strings or :attr:`pandas.NA`.
105+
This may change without warning in the future. Use
106+
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
107+
creating a `StringArray` from any sequence.
108+
109109
copy : bool, default False
110110
Whether to copy the array of data.
111111
@@ -119,6 +119,8 @@ class StringArray(PandasArray):
119119
120120
See Also
121121
--------
122+
array
123+
The recommended function for creating a StringArray.
122124
Series.str
123125
The string methods are available on Series backed by
124126
a StringArray.
@@ -165,25 +167,33 @@ def __init__(self, values, copy=False):
165167
def _validate(self):
166168
"""Validate that we only store NA or strings."""
167169
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
168-
raise ValueError(
169-
"StringArray requires a sequence of strings or missing values."
170-
)
170+
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
171171
if self._ndarray.dtype != "object":
172172
raise ValueError(
173-
"StringArray requires a sequence of strings. Got "
173+
"StringArray requires a sequence of strings or pandas.NA. Got "
174174
f"'{self._ndarray.dtype}' dtype instead."
175175
)
176176

177177
@classmethod
178178
def _from_sequence(cls, scalars, dtype=None, copy=False):
179179
if dtype:
180180
assert dtype == "string"
181-
result = super()._from_sequence(scalars, dtype=object, copy=copy)
181+
182+
result = np.asarray(scalars, dtype="object")
183+
if copy and result is scalars:
184+
result = result.copy()
185+
182186
# Standardize all missing-like values to NA
183187
# TODO: it would be nice to do this in _validate / lib.is_string_array
184188
# We are already doing a scan over the values there.
185-
result[result.isna()] = StringDtype.na_value
186-
return result
189+
na_values = isna(result)
190+
if na_values.any():
191+
if result is scalars:
192+
# force a copy now, if we haven't already
193+
result = result.copy()
194+
result[na_values] = StringDtype.na_value
195+
196+
return cls(result)
187197

188198
@classmethod
189199
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):

pandas/core/strings.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99

1010
import pandas._libs.lib as lib
11+
import pandas._libs.missing as libmissing
1112
import pandas._libs.ops as libops
1213
from pandas._typing import ArrayLike, Dtype
1314
from pandas.util._decorators import Appender
@@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str):
118119
return result
119120

120121

121-
def _na_map(f, arr, na_result=np.nan, dtype=object):
122-
# should really _check_ for NA
122+
def _na_map(f, arr, na_result=None, dtype=object):
123123
if is_extension_array_dtype(arr.dtype):
124+
if na_result is None:
125+
na_result = libmissing.NA
124126
# just StringDtype
125127
arr = extract_array(arr)
126128
return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
129+
if na_result is None:
130+
na_result = np.nan
127131
return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
128132

129133

pandas/tests/arrays/string_/test_string.py

+19
Original file line number | Diff line number | Diff line change
@@ -194,6 +194,25 @@ def test_constructor_raises():
194194
with pytest.raises(ValueError, match="sequence of strings"):
195195
pd.arrays.StringArray(np.array([]))
196196

197+
with pytest.raises(ValueError, match="strings or pandas.NA"):
198+
pd.arrays.StringArray(np.array(["a", np.nan], dtype=object))
199+
200+
with pytest.raises(ValueError, match="strings or pandas.NA"):
201+
pd.arrays.StringArray(np.array(["a", None], dtype=object))
202+
203+
with pytest.raises(ValueError, match="strings or pandas.NA"):
204+
pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object))
205+
206+
207+
@pytest.mark.parametrize("copy", [True, False])
208+
def test_from_sequence_no_mutate(copy):
209+
a = np.array(["a", np.nan], dtype=object)
210+
original = a.copy()
211+
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
212+
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
213+
tm.assert_extension_array_equal(result, expected)
214+
tm.assert_numpy_array_equal(a, original)
215+
197216

198217
@pytest.mark.parametrize("skipna", [True, False])
199218
@pytest.mark.xfail(reason="Not implemented StringArray.sum")

pandas/tests/dtypes/test_inference.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -1114,11 +1114,16 @@ def test_is_string_array(self):
11141114

11151115
assert lib.is_string_array(np.array(["foo", "bar"]))
11161116
assert not lib.is_string_array(
1117-
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
1117+
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
11181118
)
11191119
assert lib.is_string_array(
1120+
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
1121+
)
1122+
# NaN is not valid for string array, just NA
1123+
assert not lib.is_string_array(
11201124
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
11211125
)
1126+
11221127
assert not lib.is_string_array(np.array([1, 2]))
11231128

11241129
def test_to_object_array_tuples(self):

pandas/tests/test_strings.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -3521,7 +3521,7 @@ def test_string_array(any_string_method):
35213521

35223522
if isinstance(expected, Series):
35233523
if expected.dtype == "object" and lib.is_string_array(
3524-
expected.values, skipna=True
3524+
expected.dropna().values,
35253525
):
35263526
assert result.dtype == "string"
35273527
result = result.astype(object)

0 commit comments

Comments
 (0)