Skip to content

API: Disallow NaN in StringArray constructor #30980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jan 14, 2020
4 changes: 4 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1624,6 +1624,10 @@ cdef class StringValidator(Validator):
cdef inline bint is_array_typed(self) except -1:
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA


cpdef bint is_string_array(ndarray values, bint skipna=False):
cdef:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
result = np.asarray(scalars, dtype=dtype)
if copy and result is scalars:
result = result.copy()
# _coerce_from_sequence_values is useful for StringArray
result = cls._coerce_from_sequence_values(result, copy=copy)
return cls(result)

@classmethod
Expand Down
34 changes: 22 additions & 12 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,6 @@ class StringArray(PandasArray):
StringArray is considered experimental. The implementation and
parts of the API may change without warning.

In particular, the NA value used may change to no longer be
``numpy.nan``.

Parameters
----------
values : array-like
Expand All @@ -104,8 +101,11 @@ class StringArray(PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings. This may
change without warning in the future.
where the elements are Python strings or :attr:`pandas.NA`.
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

copy : bool, default False
Whether to copy the array of data.

Expand All @@ -119,6 +119,8 @@ class StringArray(PandasArray):

See Also
--------
array
The recommended function for creating a StringArray.
Series.str
The string methods are available on Series backed by
a StringArray.
Expand Down Expand Up @@ -165,25 +167,33 @@ def __init__(self, values, copy=False):
def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError(
"StringArray requires a sequence of strings or missing values."
)
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings. Got "
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)

@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
if dtype:
assert dtype == "string"
result = super()._from_sequence(scalars, dtype=object, copy=copy)

result = np.asarray(scalars, dtype="object")
if copy and result is scalars:
result = result.copy()

# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
result[result.isna()] = StringDtype.na_value
return result
na_values = isna(result)
if na_values.any():
if result is scalars:
# force a copy now, if we haven't already
result = result.copy()
result[na_values] = StringDtype.na_value

return cls(result)

@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np

import pandas._libs.lib as lib
import pandas._libs.missing as libmissing
import pandas._libs.ops as libops
from pandas._typing import ArrayLike, Dtype
from pandas.util._decorators import Appender
Expand Down Expand Up @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str):
return result


def _na_map(f, arr, na_result=np.nan, dtype=object):
# should really _check_ for NA
def _na_map(f, arr, na_result=None, dtype=object):
if is_extension_array_dtype(arr.dtype):
if na_result is None:
na_result = libmissing.NA
# just StringDtype
arr = extract_array(arr)
return _map_stringarray(f, arr, na_value=na_result, dtype=dtype)
if na_result is None:
na_result = np.nan
return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,25 @@ def test_constructor_raises():
with pytest.raises(ValueError, match="sequence of strings"):
pd.arrays.StringArray(np.array([]))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", np.nan], dtype=object))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", None], dtype=object))

with pytest.raises(ValueError, match="strings or pandas.NA"):
pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object))


@pytest.mark.parametrize("copy", [True, False])
def test_from_sequnce_no_mutate(copy):
a = np.array(["a", np.nan], dtype=object)
original = a.copy()
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
tm.assert_extension_array_equal(result, expected)
tm.assert_numpy_array_equal(a, original)


@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.xfail(reason="Not implemented StringArray.sum")
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,11 +1114,16 @@ def test_is_string_array(self):

assert lib.is_string_array(np.array(["foo", "bar"]))
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=False
np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
)
assert lib.is_string_array(
np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
)
# NaN is not valid for string array, just NA
assert not lib.is_string_array(
np.array(["foo", "bar", np.nan], dtype=object), skipna=True
)

assert not lib.is_string_array(np.array([1, 2]))

def test_to_object_array_tuples(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method):

if isinstance(expected, Series):
if expected.dtype == "object" and lib.is_string_array(
expected.values, skipna=True
expected.dropna().values,
):
assert result.dtype == "string"
result = result.astype(object)
Expand Down