-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
API: allow nan-likes in StringArray constructor #41412
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 36 commits
3e1784d
96ff1da
418e1d2
25a6c4d
47d68f7
8257dbd
922436a
2f28086
3ee2198
9426a52
3ee55f2
fe4981a
a66948a
e852719
91b73bb
42ec3e4
41f49d2
62cc5be
29909f3
153b6b4
b27a839
ed5b953
caa5705
2d75031
52a00d1
8dc0b66
1bacaed
66be087
7b058cd
1be1bdf
3c57094
03738a9
12351de
889829a
358000f
c649b1d
5e5aa9c
eb7d8f2
2426319
20817a7
33d8f9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,6 +99,7 @@ from pandas._libs.missing cimport ( | |
is_null_timedelta64, | ||
isnaobj, | ||
) | ||
from pandas._libs.missing import checknull | ||
from pandas._libs.tslibs.conversion cimport convert_to_tsobject | ||
from pandas._libs.tslibs.nattype cimport ( | ||
NPY_NAT, | ||
|
@@ -675,7 +676,7 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray: | |
cpdef ndarray[object] ensure_string_array( | ||
arr, | ||
object na_value=np.nan, | ||
bint convert_na_value=True, | ||
coerce="all", | ||
bint copy=True, | ||
bint skipna=True, | ||
): | ||
|
@@ -688,8 +689,16 @@ cpdef ndarray[object] ensure_string_array( | |
The values to be converted to str, if needed. | ||
na_value : Any, default np.nan | ||
The value to use for na. For example, np.nan or pd.NA. | ||
convert_na_value : bool, default True | ||
If False, existing na values will be used unchanged in the new array. | ||
coerce : {'all', 'null', 'non-null', None}, default 'all' | ||
Whether to coerce non-string elements to strings. | ||
- 'all' will convert null values and non-null non-string values. | ||
- 'strict-null' will only convert pd.NA, np.nan, or None to na_value | ||
without converting other non-strings. | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
- 'null' will convert nulls to na_value w/out converting other non-strings. | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
- 'non-null' will only convert non-null non-string elements to string. | ||
- None will not convert anything. | ||
If coerce is not 'all', a ValueError will be raised for values | ||
that are not strings or na_value. | ||
copy : bool, default True | ||
Whether to ensure that a new array is returned. | ||
skipna : bool, default True | ||
|
@@ -699,10 +708,20 @@ cpdef ndarray[object] ensure_string_array( | |
Returns | ||
------- | ||
np.ndarray[object] | ||
An array with the input array's elements casted to str or nan-like. | ||
An array of strings and na_value. | ||
|
||
Raises | ||
------ | ||
ValueError | ||
If an element is encountered that is not a string or valid NA value | ||
and element is not coerced. | ||
""" | ||
if coerce not in {"all", "strict-null", "null", "non-null", None}: | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
raise ValueError("coerce argument must be one of " | ||
f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}") | ||
cdef: | ||
Py_ssize_t i = 0, n = len(arr) | ||
set strict_na_values = {C_NA, np.nan, None} | ||
|
||
if hasattr(arr, "to_numpy"): | ||
|
||
|
@@ -722,21 +741,34 @@ cpdef ndarray[object] ensure_string_array( | |
if copy and result is arr: | ||
result = result.copy() | ||
|
||
if coerce == 'strict-null': | ||
# We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid | ||
# If they are present, they are treated like a regular Python object | ||
# and will either cause an exception to be raised or be coerced. | ||
check_null = strict_na_values.__contains__ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this always work with np.nan or should this be a function using np.isnan? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i recall seeing something somewhere about np.nan not guaranteed to be a singleton. maybe untrue or not relevant these days. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tentatively resolving since CI is green. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
nope.
|
||
else: | ||
check_null = checknull | ||
|
||
for i in range(n): | ||
val = arr[i] | ||
|
||
if isinstance(val, str): | ||
continue | ||
|
||
if not checknull(val): | ||
if not isinstance(val, np.floating): | ||
# f"{val}" is faster than str(val) | ||
result[i] = f"{val}" | ||
if not check_null(val): | ||
if coerce =="all" or coerce == "non-null": | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not isinstance(val, np.floating): | ||
# f"{val}" is faster than str(val) | ||
result[i] = f"{val}" | ||
else: | ||
# f"{val}" is not always equivalent to str(val) for floats | ||
result[i] = str(val) | ||
else: | ||
# f"{val}" is not always equivalent to str(val) for floats | ||
result[i] = str(val) | ||
raise ValueError(f"Element {val} is not a string or valid null." | ||
"If you want it to be coerced to a string," | ||
"specify coerce='all'") | ||
else: | ||
if convert_na_value: | ||
if coerce=="all" or coerce == "null" or coerce == 'strict-null': | ||
val = na_value | ||
if skipna: | ||
result[i] = val | ||
|
@@ -1861,8 +1893,8 @@ cdef class StringValidator(Validator): | |
return issubclass(self.dtype.type, np.str_) | ||
|
||
cdef bint is_valid_null(self, object value) except -1: | ||
# We deliberately exclude None / NaN here since StringArray uses NA | ||
return value is C_NA | ||
# Override to exclude float('Nan') and complex NaN | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return value is None or value is C_NA or value is np.nan | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
cpdef bint is_string_array(ndarray values, bint skipna=False): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -247,11 +247,18 @@ class StringArray(BaseStringArray, PandasArray): | |
.. warning:: | ||
|
||
Currently, this expects an object-dtype ndarray | ||
where the elements are Python strings or :attr:`pandas.NA`. | ||
where the elements are Python strings | ||
or nan-likes(``None``, ``np.nan``, ``NA``). | ||
This may change without warning in the future. Use | ||
:meth:`pandas.array` with ``dtype="string"`` for a stable way of | ||
creating a `StringArray` from any sequence. | ||
|
||
.. versionchanged:: 1.3 | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
StringArray now accepts nan-likes(``None``, ``np.nan``) for the | ||
``values`` parameter in its constructor | ||
in addition to strings and :attr:`pandas.NA` | ||
|
||
copy : bool, default False | ||
Whether to copy the array of data. | ||
|
||
|
@@ -311,6 +318,8 @@ def __init__(self, values, copy=False): | |
values = extract_array(values) | ||
|
||
super().__init__(values, copy=copy) | ||
if not isinstance(values, type(self)): | ||
self._validate() | ||
# error: Incompatible types in assignment (expression has type "StringDtype", | ||
# variable has type "PandasDtype") | ||
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) | ||
|
@@ -319,18 +328,25 @@ def __init__(self, values, copy=False): | |
|
||
def _validate(self): | ||
"""Validate that we only store NA or strings.""" | ||
if len(self._ndarray) and not lib.is_string_array( | ||
self._ndarray.ravel("K"), skipna=True | ||
): | ||
raise ValueError("StringArray requires a sequence of strings or pandas.NA") | ||
if self._ndarray.dtype != "object": | ||
raise ValueError( | ||
"StringArray requires a sequence of strings or pandas.NA. Got " | ||
f"'{self._ndarray.dtype}' dtype instead." | ||
) | ||
try: | ||
lib.ensure_string_array( | ||
lithomas1 marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this going through ensure_string_array instead of e.g. is_string_array? For the latter, the ravel would be unnecessary. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is_string_array will not convert the other nans(None, np.nan, etc.) to the correct na_value of pd.NA. FWIW, switching to ensure_string_array will also align us with _from_sequence. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does any of this become simpler if done in conjunction with the edits in #45057 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will try to split this one up. I'm thinking of maybe sticking to is_string_array and then doing another pass over the data to convert the not pd.NA nans, as a quick short term fix to unblock you. This will probably give a perf regression, but since is_string_array got sped up in #44495 on master(not 1.3.x), all I have to do is make the perf regression less than the perf improvement there, so that the regression is not user visible. |
||
self._ndarray.ravel("K"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm pretty sure this is broken for 2D EAs even with the ravel. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not sure I understand the question. The 2D EA has a working
I'm still getting the hang of this (intending to make array_to_timedelta64 handle 2D directly to avoid ravels), would advise NOT trying to handle it in the cython code. |
||
na_value=StringDtype.na_value, | ||
coerce="strict-null", | ||
copy=False, | ||
) | ||
except ValueError: | ||
raise ValueError("StringArray requires a sequence of strings or pandas.NA") | ||
|
||
@classmethod | ||
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): | ||
def _from_sequence( | ||
cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this is like but I still think the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will take a look soon-ish. I'm wary of adding keywords here |
||
): | ||
if dtype and not (isinstance(dtype, str) and dtype == "string"): | ||
dtype = pandas_dtype(dtype) | ||
assert isinstance(dtype, StringDtype) and dtype.storage == "python" | ||
|
@@ -339,15 +355,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): | |
|
||
if isinstance(scalars, BaseMaskedArray): | ||
# avoid costly conversion to object dtype | ||
if coerce: | ||
coerce = "non-null" | ||
else: | ||
coerce = None | ||
na_values = scalars._mask | ||
result = scalars._data | ||
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) | ||
result = lib.ensure_string_array(result, copy=copy, coerce=coerce) | ||
result[na_values] = StringDtype.na_value | ||
|
||
else: | ||
# convert non-na-likes to str, and nan-likes to StringDtype.na_value | ||
if coerce: | ||
coerce = "all" | ||
else: | ||
coerce = "strict-null" | ||
result = lib.ensure_string_array( | ||
scalars, na_value=StringDtype.na_value, copy=copy | ||
scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce | ||
) | ||
|
||
# Manually creating new array avoids the validation step in the __init__, so is | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,12 +5,14 @@ | |
import numpy as np | ||
import pytest | ||
|
||
import pandas._libs.lib as lib | ||
import pandas.util._test_decorators as td | ||
|
||
from pandas.core.dtypes.common import is_dtype_equal | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
from pandas.core.arrays import BaseMaskedArray | ||
from pandas.core.arrays.string_arrow import ArrowStringArray | ||
|
||
|
||
|
@@ -269,13 +271,61 @@ def test_constructor_raises(cls): | |
cls(np.array([])) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", np.nan], dtype=object)) | ||
cls(np.array(["a", pd.NaT], dtype=object)) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", None], dtype=object)) | ||
|
||
with pytest.raises(ValueError, match=msg): | ||
cls(np.array(["a", pd.NaT], dtype=object)) | ||
@pytest.mark.parametrize("na", [np.nan, None, pd.NA]) | ||
def test_constructor_nan_like(na): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you parameterize over list as well (assume same result). |
||
expected = pd.arrays.StringArray(np.array(["a", pd.NA])) | ||
tm.assert_extension_array_equal( | ||
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected | ||
) | ||
|
||
|
||
def test_invalid_coerce_raises(): | ||
data = np.array(["a", "b'"], dtype=object) | ||
with pytest.raises( | ||
ValueError, | ||
match="coerce argument must be one of " | ||
"'all'|'strict-null'|'null'|'non-null'|None, not abcd", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do the pipes here need backslashes? |
||
): | ||
lib.ensure_string_array(data, coerce="abcd") | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"values", | ||
[ | ||
np.array(["foo", "bar", pd.NA], dtype=object), | ||
np.array(["foo", "bar", np.nan], dtype=object), | ||
np.array(["foo", "bar", None], dtype=object), | ||
BaseMaskedArray( | ||
np.array(["foo", "bar", "garbage"]), np.array([False, False, True]) | ||
), | ||
], | ||
) | ||
def test_from_sequence_no_coerce(cls, values): | ||
expected = pd.arrays.StringArray(np.array(["foo", "bar", pd.NA], dtype=object)) | ||
result = cls._from_sequence(values, coerce=False) | ||
# Use bare assert since classes are different | ||
assert (result == expected).all() | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"values", | ||
[ | ||
np.array(["foo", "bar", pd.NaT], dtype=object), | ||
np.array(["foo", "bar", np.datetime64("nat")], dtype=object), | ||
np.array(["foo", "bar", float("nan")], dtype=object), | ||
], | ||
) | ||
def test_from_sequence_no_coerce_invalid(cls, values): | ||
with pytest.raises( | ||
ValueError, | ||
match="Element .* is not a string or valid null." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing whitespace at the end of this line (and the next)? |
||
"If you want it to be coerced to a string," | ||
"specify coerce='all'", | ||
): | ||
cls._from_sequence(values, coerce=False) | ||
|
||
|
||
@pytest.mark.parametrize("copy", [True, False]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what does 'all' do for null values?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
converts them to
na_value
.