Skip to content

API: allow nan-likes in StringArray constructor #41412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 41 commits into from
Closed
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
3e1784d
API: allow nan-likes in StringArray constructor
lithomas1 May 10, 2021
96ff1da
Revert weird changes & Fix stuff
lithomas1 May 11, 2021
418e1d2
Remove failing test
lithomas1 May 11, 2021
25a6c4d
Changes from code review
lithomas1 May 19, 2021
47d68f7
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 19, 2021
8257dbd
typo
lithomas1 May 20, 2021
922436a
Update lib.pyi
lithomas1 May 21, 2021
2f28086
Update lib.pyx
lithomas1 May 29, 2021
3ee2198
Update lib.pyx
lithomas1 May 29, 2021
9426a52
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 30, 2021
3ee55f2
Updates
lithomas1 May 30, 2021
fe4981a
Update lib.pyx
lithomas1 May 30, 2021
a66948a
Update lib.pyx
lithomas1 May 30, 2021
e852719
Update lib.pyx
lithomas1 May 31, 2021
91b73bb
disallow invalid nans in stringarray constructor
lithomas1 Jun 4, 2021
42ec3e4
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
41f49d2
add to _from_sequence and fixes
lithomas1 Jun 4, 2021
62cc5be
address code review
lithomas1 Jun 4, 2021
29909f3
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
153b6b4
Fix failures
lithomas1 Jun 5, 2021
b27a839
maybe fix benchmarks?
lithomas1 Jun 5, 2021
ed5b953
Partially address code review
lithomas1 Jun 5, 2021
caa5705
Test coerce=False
lithomas1 Jun 6, 2021
2d75031
move benchmarks
lithomas1 Jun 7, 2021
52a00d1
accidental formatting changes
lithomas1 Jun 7, 2021
8dc0b66
Fix
lithomas1 Jun 8, 2021
1bacaed
Merge branch 'master' into stringarray-nan
lithomas1 Jun 8, 2021
66be087
missing import from conflict
lithomas1 Jun 8, 2021
7b058cd
Merge branch 'master' into stringarray-nan
lithomas1 Jun 9, 2021
1be1bdf
Merge branch 'pandas-dev:master' into stringarray-nan
lithomas1 Jun 22, 2021
3c57094
remove old whatsnew
lithomas1 Jul 21, 2021
03738a9
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 Jul 21, 2021
12351de
move whatsnew
lithomas1 Jul 21, 2021
889829a
Merge branch 'master' into stringarray-nan
lithomas1 Oct 4, 2021
358000f
typo
lithomas1 Oct 4, 2021
c649b1d
Merge branch 'master' into stringarray-nan
lithomas1 Oct 16, 2021
5e5aa9c
Merge branch 'master' into stringarray-nan
lithomas1 Nov 27, 2021
eb7d8f2
Merge branch 'master' into stringarray-nan
lithomas1 Dec 18, 2021
2426319
Merge branch 'master' into stringarray-nan
lithomas1 Dec 27, 2021
20817a7
address comments
lithomas1 Dec 27, 2021
33d8f9a
accept any float nan w/ util.is_nan
lithomas1 Dec 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions asv_bench/benchmarks/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import numpy as np

from pandas import (
NA,
Categorical,
DataFrame,
Series,
)
from pandas.core.arrays import StringArray

from .pandas_vb_common import tm

Expand Down Expand Up @@ -285,3 +287,18 @@ class Iter(Dtypes):
def time_iter(self, dtype):
for i in self.s:
pass


class StringArrayConstruction:
def setup(self):
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])

def time_string_array_construction(self):
StringArray(self.series_arr)

def time_string_array_with_nan_construction(self):
StringArray(self.series_arr_nan)

def peakmem_stringarray_construction(self):
StringArray(self.series_arr)
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
Other API changes
^^^^^^^^^^^^^^^^^
- :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`)
-
- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)

.. ---------------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def maybe_convert_numeric(
def ensure_string_array(
arr,
na_value: object = ...,
convert_na_value: bool = ...,
coerce: str = ...,
copy: bool = ...,
skipna: bool = ...,
) -> npt.NDArray[np.object_]: ...
Expand Down
58 changes: 45 additions & 13 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ from pandas._libs.missing cimport (
is_null_timedelta64,
isnaobj,
)
from pandas._libs.missing import checknull
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
Expand Down Expand Up @@ -675,7 +676,7 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
coerce="all",
bint copy=True,
bint skipna=True,
):
Expand All @@ -688,8 +689,16 @@ cpdef ndarray[object] ensure_string_array(
The values to be converted to str, if needed.
na_value : Any, default np.nan
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
coerce : {'all', 'null', 'non-null', None}, default 'all'
Whether to coerce non-string elements to strings.
- 'all' will convert null values and non-null non-string values.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does 'all' do for null values?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

converts them to na_value.

- 'strict-null' will only convert pd.NA, np.nan, or None to na_value
without converting other non-strings.
- 'null' will convert nulls to na_value w/out converting other non-strings.
- 'non-null' will only convert non-null non-string elements to string.
- None will not convert anything.
If coerce is not 'all', a ValueError will be raised for values
that are not strings or na_value.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Expand All @@ -699,10 +708,20 @@ cpdef ndarray[object] ensure_string_array(
Returns
-------
np.ndarray[object]
An array with the input array's elements casted to str or nan-like.
An array of strings and na_value.

Raises
------
ValueError
If an element is encountered that is not a string or valid NA value
and element is not coerced.
"""
if coerce not in {"all", "strict-null", "null", "non-null", None}:
raise ValueError("coerce argument must be one of "
f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}")
cdef:
Py_ssize_t i = 0, n = len(arr)
set strict_na_values = {C_NA, np.nan, None}

if hasattr(arr, "to_numpy"):

Expand All @@ -722,21 +741,34 @@ cpdef ndarray[object] ensure_string_array(
if copy and result is arr:
result = result.copy()

if coerce == 'strict-null':
# We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
# If they are present, they are treated like a regular Python object
# and will either cause an exception to be raised or be coerced.
check_null = strict_na_values.__contains__
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this always work with np.nan or should this be a function using np.isnan?

Copy link
Member Author

@lithomas1 lithomas1 Jun 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

__contains__() checks for identity(is) so should work since nan is singleton.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i recall seeing something somewhere about np.nan not guaranteed to be a singleton. maybe untrue or not relevant these days.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tentatively resolving since CI is green.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since nan is singleton

nope.

>>> np.float64("nan") is np.float64("nan")
False

else:
check_null = checknull

for i in range(n):
val = arr[i]

if isinstance(val, str):
continue

if not checknull(val):
if not isinstance(val, np.floating):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
if not check_null(val):
if coerce =="all" or coerce == "non-null":
if not isinstance(val, np.floating):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
else:
# f"{val}" is not always equivalent to str(val) for floats
result[i] = str(val)
else:
# f"{val}" is not always equivalent to str(val) for floats
result[i] = str(val)
raise ValueError(f"Element {val} is not a string or valid null."
"If you want it to be coerced to a string,"
"specify coerce='all'")
else:
if convert_na_value:
if coerce=="all" or coerce == "null" or coerce == 'strict-null':
val = na_value
if skipna:
result[i] = val
Expand Down Expand Up @@ -1861,8 +1893,8 @@ cdef class StringValidator(Validator):
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA
# Override to exclude float('Nan') and complex NaN
return value is None or value is C_NA or value is np.nan


cpdef bint is_string_array(ndarray values, bint skipna=False):
Expand Down
40 changes: 32 additions & 8 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,18 @@ class StringArray(BaseStringArray, PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
where the elements are Python strings
or nan-likes(``None``, ``np.nan``, ``NA``).
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

.. versionchanged:: 1.3

StringArray now accepts nan-likes(``None``, ``np.nan``) for the
``values`` parameter in its constructor
in addition to strings and :attr:`pandas.NA`

copy : bool, default False
Whether to copy the array of data.

Expand Down Expand Up @@ -311,6 +318,8 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
Expand All @@ -319,18 +328,25 @@ def __init__(self, values, copy=False):

def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(
self._ndarray.ravel("K"), skipna=True
):
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
try:
lib.ensure_string_array(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this going through ensure_string_array instead of e.g. is_string_array? For the latter, the ravel would be unnecessary.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is_string_array will not convert the other nans(None, np.nan, etc.) to the correct na_value of pd.NA. FWIW, switching to ensure_string_array will also align us with _from_sequence.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does any of this become simpler if done in conjunction with the edits in #45057

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will try to split this one up.

I'm thinking of maybe sticking to is_string_array and then doing another pass over the data to convert the not pd.NA nans, as a quick short term fix to unblock you. This will probably give a perf regression, but since is_string_array got sped up in #44495 on master(not 1.3.x), all I have to do is make the perf regression less than the perf improvement there, so that the regression is not user visible.

self._ndarray.ravel("K"),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure this is broken for 2D EAs even with the ravel.
@jbrockmendel Do you know how I can create a 2D StringArray to test this? Would the correct way to iterate over a 2D EA be using the PyArray_GETITEM and PyArray_ITER_DATA combo like the Validator in libs does? Then I would replace values by PyArray_SETITEMing on the array?
(Not familiar at all with Numpy C API, so going to need some help here)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know how I can create a 2D StringArray to test this?

pd.array(["foo", "bar"] * 5).reshape(5, 2)

Would the correct way to iterate over a 2D EA be using the PyArray_GETITEM and PyArray_ITER_DATA combo like the Validator in libs does?

Not sure I understand the question. The 2D EA has a working __iter__ method, but you shouldn't be passing any EA to any of the cython functions.

Then I would replace values by PyArray_SETITEM ing on the array?

I'm still getting the hang of this (intending to make array_to_timedelta64 handle 2D directly to avoid ravels), would advise NOT trying to handle it in the cython code.

na_value=StringDtype.na_value,
coerce="strict-null",
copy=False,
)
except ValueError:
raise ValueError("StringArray requires a sequence of strings or pandas.NA")

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is coerce: bool enough here?

this is like errors='coerce' for coerce=True and errors='raise' for coerce=False, i guess 'ignore' would be meaningless.

but I still think the errors= keyword is better for flexiblity.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will take a look soon-ish. I'm wary of adding keywords here

):
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "python"
Expand All @@ -339,15 +355,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype
if coerce:
coerce = "non-null"
else:
coerce = None
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
result[na_values] = StringDtype.na_value

else:
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(
scalars, na_value=StringDtype.na_value, copy=copy
scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
)

# Manually creating new array avoids the validation step in the __init__, so is
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,9 @@ def __init__(self, values):
)

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
def _from_sequence(
cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
):
from pandas.core.arrays.masked import BaseMaskedArray

_chk_pyarrow_available()
Expand All @@ -172,11 +174,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
if coerce:
coerce = "non-null"
else:
coerce = None
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
return cls(pa.array(result, mask=na_values, type=pa.string()))

# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
return cls(pa.array(result, type=pa.string(), from_pandas=True))

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,7 @@ def _try_cast(

elif dtype.kind == "U":
# TODO: test cases with arr.dtype.kind in ["m", "M"]
return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
return lib.ensure_string_array(arr, coerce="non-null", copy=copy)

elif dtype.kind in ["m", "M"]:
return maybe_cast_to_datetime(arr, dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,7 +1121,7 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)

if issubclass(dtype.type, str):
return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")

elif is_datetime64_dtype(arr):
# Non-overlapping equality check (left operand type: "dtype[Any]", right
Expand Down
60 changes: 55 additions & 5 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
import numpy as np
import pytest

import pandas._libs.lib as lib
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_dtype_equal

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import BaseMaskedArray
from pandas.core.arrays.string_arrow import ArrowStringArray


Expand Down Expand Up @@ -269,13 +271,61 @@ def test_constructor_raises(cls):
cls(np.array([]))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", np.nan], dtype=object))
cls(np.array(["a", pd.NaT], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", None], dtype=object))

with pytest.raises(ValueError, match=msg):
cls(np.array(["a", pd.NaT], dtype=object))
@pytest.mark.parametrize("na", [np.nan, None, pd.NA])
def test_constructor_nan_like(na):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you parameterize over list as well (assume same result).

expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
tm.assert_extension_array_equal(
pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
)


def test_invalid_coerce_raises():
data = np.array(["a", "b'"], dtype=object)
with pytest.raises(
ValueError,
match="coerce argument must be one of "
"'all'|'strict-null'|'null'|'non-null'|None, not abcd",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do the pipes here need backslashes?

):
lib.ensure_string_array(data, coerce="abcd")


@pytest.mark.parametrize(
"values",
[
np.array(["foo", "bar", pd.NA], dtype=object),
np.array(["foo", "bar", np.nan], dtype=object),
np.array(["foo", "bar", None], dtype=object),
BaseMaskedArray(
np.array(["foo", "bar", "garbage"]), np.array([False, False, True])
),
],
)
def test_from_sequence_no_coerce(cls, values):
expected = pd.arrays.StringArray(np.array(["foo", "bar", pd.NA], dtype=object))
result = cls._from_sequence(values, coerce=False)
# Use bare assert since classes are different
assert (result == expected).all()


@pytest.mark.parametrize(
"values",
[
np.array(["foo", "bar", pd.NaT], dtype=object),
np.array(["foo", "bar", np.datetime64("nat")], dtype=object),
np.array(["foo", "bar", float("nan")], dtype=object),
],
)
def test_from_sequence_no_coerce_invalid(cls, values):
with pytest.raises(
ValueError,
match="Element .* is not a string or valid null."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing whitespace at the end of this line (and the next)?

"If you want it to be coerced to a string,"
"specify coerce='all'",
):
cls._from_sequence(values, coerce=False)


@pytest.mark.parametrize("copy", [True, False])
Expand Down
Loading