Skip to content

Commit e6a20bd

Browse files
authored
API: Allow other na values in StringArray Constructor (#45168)
1 parent 23f12a1 commit e6a20bd

File tree

7 files changed

+96
-15
lines changed

7 files changed

+96
-15
lines changed

asv_bench/benchmarks/strings.py

+17
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
import numpy as np
44

55
from pandas import (
6+
NA,
67
Categorical,
78
DataFrame,
89
Series,
910
)
11+
from pandas.arrays import StringArray
1012

1113
from .pandas_vb_common import tm
1214

@@ -285,3 +287,18 @@ class Iter(Dtypes):
285287
def time_iter(self, dtype):
286288
for i in self.s:
287289
pass
290+
291+
292+
class StringArrayConstruction:
293+
def setup(self):
294+
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
295+
self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
296+
297+
def time_string_array_construction(self):
298+
StringArray(self.series_arr)
299+
300+
def time_string_array_with_nan_construction(self):
301+
StringArray(self.series_arr_nan)
302+
303+
def peakmem_stringarray_construction(self):
304+
StringArray(self.series_arr)

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ enhancement2
3030

3131
Other enhancements
3232
^^^^^^^^^^^^^^^^^^
33+
- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
3334
- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
3435
-
3536

pandas/_libs/lib.pyi

+3
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ def astype_intsafe(
161161
arr: npt.NDArray[np.object_],
162162
new_dtype: np.dtype,
163163
) -> np.ndarray: ...
164+
def convert_nans_to_NA(
165+
arr: npt.NDArray[np.object_],
166+
) -> npt.NDArray[np.object_]: ...
164167
def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...
165168

166169
# TODO: can we be more specific about rows?

pandas/_libs/lib.pyx

+34-4
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,40 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
669669

670670
return result
671671

672+
ctypedef fused ndarr_object:
673+
ndarray[object, ndim=1]
674+
ndarray[object, ndim=2]
675+
676+
# TODO: get rid of this helper and modify StringArray
677+
# to go through ensure_string_array instead
678+
@cython.wraparound(False)
679+
@cython.boundscheck(False)
680+
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
681+
"""
682+
Helper for StringArray that converts null values that
683+
are not pd.NA (e.g. np.nan, None) to pd.NA. Assumes elements
684+
have already been validated as null.
685+
"""
686+
cdef:
687+
Py_ssize_t i, m, n
688+
object val
689+
ndarr_object result
690+
result = np.asarray(arr, dtype="object")
691+
if arr.ndim == 2:
692+
m, n = arr.shape[0], arr.shape[1]
693+
for i in range(m):
694+
for j in range(n):
695+
val = arr[i, j]
696+
if not isinstance(val, str):
697+
result[i, j] = <object>C_NA
698+
else:
699+
n = len(arr)
700+
for i in range(n):
701+
val = arr[i]
702+
if not isinstance(val, str):
703+
result[i] = <object>C_NA
704+
return result
705+
672706

673707
@cython.wraparound(False)
674708
@cython.boundscheck(False)
@@ -1880,10 +1914,6 @@ cdef class StringValidator(Validator):
18801914
cdef inline bint is_array_typed(self) except -1:
18811915
return issubclass(self.dtype.type, np.str_)
18821916

1883-
cdef bint is_valid_null(self, object value) except -1:
1884-
# We deliberately exclude None / NaN here since StringArray uses NA
1885-
return value is C_NA
1886-
18871917

18881918
cpdef bint is_string_array(ndarray values, bint skipna=False):
18891919
cdef:

pandas/core/arrays/string_.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
246246
.. warning::
247247
248248
Currently, this expects an object-dtype ndarray
249-
where the elements are Python strings or :attr:`pandas.NA`.
249+
where the elements are Python strings
250+
or nan-likes (``None``, ``np.nan``, ``NA``).
250251
This may change without warning in the future. Use
251252
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
252253
creating a `StringArray` from any sequence.
253254
255+
.. versionchanged:: 1.5.0
256+
257+
StringArray now accepts array-likes containing
258+
nan-likes (``None``, ``np.nan``) for the ``values`` parameter
259+
in addition to strings and :attr:`pandas.NA`.
260+
254261
copy : bool, default False
255262
Whether to copy the array of data.
256263
@@ -310,11 +317,11 @@ def __init__(self, values, copy=False):
310317
values = extract_array(values)
311318

312319
super().__init__(values, copy=copy)
320+
if not isinstance(values, type(self)):
321+
self._validate()
313322
# error: Incompatible types in assignment (expression has type "StringDtype",
314323
# variable has type "PandasDtype")
315324
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
316-
if not isinstance(values, type(self)):
317-
self._validate()
318325

319326
def _validate(self):
320327
"""Validate that we only store NA or strings."""
@@ -325,6 +332,12 @@ def _validate(self):
325332
"StringArray requires a sequence of strings or pandas.NA. Got "
326333
f"'{self._ndarray.dtype}' dtype instead."
327334
)
335+
# Check to see if we need to convert NA-like values to pd.NA
336+
if self._ndarray.ndim > 2:
337+
# Ravel if ndims > 2 b/c no cythonized version available
338+
lib.convert_nans_to_NA(self._ndarray.ravel("K"))
339+
else:
340+
lib.convert_nans_to_NA(self._ndarray)
328341

329342
@classmethod
330343
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

pandas/tests/arrays/string_/test_string.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -267,15 +267,20 @@ def test_constructor_raises(cls):
267267
cls(np.array([]))
268268

269269
with pytest.raises(ValueError, match=msg):
270-
cls(np.array(["a", np.nan], dtype=object))
271-
272-
with pytest.raises(ValueError, match=msg):
273-
cls(np.array(["a", None], dtype=object))
270+
cls(np.array(["a", np.datetime64("nat")], dtype=object))
274271

275272
with pytest.raises(ValueError, match=msg):
276273
cls(np.array(["a", pd.NaT], dtype=object))
277274

278275

276+
@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
277+
def test_constructor_nan_like(na):
278+
expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
279+
tm.assert_extension_array_equal(
280+
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
281+
)
282+
283+
279284
@pytest.mark.parametrize("copy", [True, False])
280285
def test_from_sequence_no_mutate(copy, cls, request):
281286
if cls is ArrowStringArray and copy is False:

pandas/tests/dtypes/test_inference.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -1534,19 +1534,31 @@ def test_is_numeric_array(self):
15341534
assert not lib.is_integer_array(np.array([1, 2.0]))
15351535

15361536
def test_is_string_array(self):
1537-
1537+
# We should only be accepting pd.NA, None, np.nan, and
1538+
# other floating point nans (e.g. float('nan'))
1539+
# when skipna is True.
15381540
assert lib.is_string_array(np.array(["foo", "bar"]))
15391541
assert not lib.is_string_array(
15401542
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
15411543
)
15421544
assert lib.is_string_array(
15431545
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
15441546
)
1545-
# NaN is not valid for string array, just NA
1546-
assert not lib.is_string_array(
1547+
assert lib.is_string_array(
1548+
np.array(["foo", "bar", None], dtype=object), skipna=True
1549+
)
1550+
assert lib.is_string_array(
15471551
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
15481552
)
1549-
1553+
assert not lib.is_string_array(
1554+
np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
1555+
)
1556+
assert not lib.is_string_array(
1557+
np.array(["foo", "bar", None], dtype=object), skipna=False
1558+
)
1559+
assert not lib.is_string_array(
1560+
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
1561+
)
15501562
assert not lib.is_string_array(np.array([1, 2]))
15511563

15521564
def test_to_object_array_tuples(self):

0 commit comments

Comments
 (0)