From 6f3e367bd50457fbe91b7fc9aa13c9b0648a9803 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 13:19:03 -0600 Subject: [PATCH 01/10] Disallow NaN in StringArray constructor Closes https://github.com/pandas-dev/pandas/issues/30966 --- pandas/_libs/lib.pyx | 30 +++++++++++++++++++--- pandas/core/arrays/string_.py | 22 ++++++++-------- pandas/tests/arrays/string_/test_string.py | 6 +++++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 719db5c03f07f..1ac63adbde361 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1472,12 +1472,30 @@ cdef class Validator: Py_ssize_t n dtype dtype bint skipna + bint na_only def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), - bint skipna=False): + bint skipna=False, + bint na_only=False): + """ + + Parameters + ---------- + n + dtype + skipna + na_only : bool, default False + Whether to only treat pandas.NA as NA. Values like None + and NaN won't be treated as NA. + + Returns + ------- + + """ self.n = n self.dtype = dtype self.skipna = skipna + self.na_only = na_only cdef bint validate(self, ndarray values) except -1: if not self.n: @@ -1530,7 +1548,10 @@ cdef class Validator: "must define is_value_typed") cdef bint is_valid_null(self, object value) except -1: - return value is None or value is C_NA or util.is_nan(value) + if self.na_only: + return value is C_NA + else: + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False @@ -1625,11 +1646,12 @@ cdef class StringValidator(Validator): return issubclass(self.dtype.type, np.str_) -cpdef bint is_string_array(ndarray values, bint skipna=False): +cpdef bint is_string_array(ndarray values, bint skipna=False, na_only=False): cdef: StringValidator validator = StringValidator(len(values), values.dtype, - skipna=skipna) + skipna=skipna, + na_only=na_only) return validator.validate(values) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 84130132de4dc..305031c7f52b0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -93,9 +93,6 @@ class StringArray(PandasArray): StringArray is considered experimental. The implementation and parts of the API may change without warning. - In particular, the NA value used may change to no longer be - ``numpy.nan``. - Parameters ---------- values : array-like @@ -104,8 +101,11 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings. This may - change without warning in the future. + where the elements are Python strings or :attr:`pandas.NA`. + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + copy : bool, default False Whether to copy the array of data. @@ -119,6 +119,8 @@ class StringArray(PandasArray): See Also -------- + pandas.array + The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by a StringArray. @@ -164,13 +166,13 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "StringArray requires a sequence of strings or missing values." - ) + if len(self._ndarray) and not lib.is_string_array( + self._ndarray, skipna=True, na_only=True + ): + raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires a sequence of strings. Got " + "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 33e68f029922e..b15e56b18d73f 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -194,6 +194,12 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", None], dtype=object)) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") From 1e62f26e61860af8492797577856b741a637afdb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 13:43:17 -0600 Subject: [PATCH 02/10] change it --- pandas/_libs/lib.pyx | 34 +++++----------------- pandas/core/arrays/numpy_.py | 6 ++++ pandas/core/arrays/string_.py | 18 ++++++++---- pandas/tests/arrays/string_/test_string.py | 9 ++++++ pandas/tests/dtypes/test_inference.py | 7 ++++- 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1ac63adbde361..acd74591134bc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1472,30 +1472,12 @@ cdef class Validator: Py_ssize_t n dtype dtype bint skipna - bint na_only def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), - bint skipna=False, - bint na_only=False): - """ - - Parameters - ---------- - n - dtype - skipna - na_only : bool, default False - Whether to only treat pandas.NA as NA. Values like None - and NaN won't be treated as NA. - - Returns - ------- - - """ + bint skipna=False): self.n = n self.dtype = dtype self.skipna = skipna - self.na_only = na_only cdef bint validate(self, ndarray values) except -1: if not self.n: @@ -1548,10 +1530,7 @@ cdef class Validator: "must define is_value_typed") cdef bint is_valid_null(self, object value) except -1: - if self.na_only: - return value is C_NA - else: - return value is None or value is C_NA or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False @@ -1645,13 +1624,16 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) + cdef bint is_valid_null(self, object value) except -1: + # We deliberately exclude None / NaN here since StringArray uses NA + return value is C_NA + -cpdef bint is_string_array(ndarray values, bint skipna=False, na_only=False): +cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: StringValidator validator = StringValidator(len(values), values.dtype, - skipna=skipna, - na_only=na_only) + skipna=skipna) return validator.validate(values) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 4db3d3010adaf..8e63032e18b0c 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -162,8 +162,14 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = np.asarray(scalars, dtype=dtype) if copy and result is scalars: result = result.copy() + # Primarily for StringArray + result = cls._from_sequence_finalize(result, copy=copy) return cls(result) + @staticmethod + def _from_sequence_finalize(values, copy): + return values + @classmethod def _from_factorized(cls, values, original): return cls(values) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 305031c7f52b0..0d0ddf9759551 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -166,9 +166,7 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" - if len(self._ndarray) and not lib.is_string_array( - self._ndarray, skipna=True, na_only=True - ): + if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( @@ -181,11 +179,21 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) + return result + + @staticmethod + def _from_sequence_finalize(values, copy): # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - result[result.isna()] = StringDtype.na_value - return result + na_values = isna(values) + if na_values.any(): + if not copy: + # force a copy now, if we haven't already + values = values.copy() + values[na_values] = StringDtype.na_value + + return values @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b15e56b18d73f..9b6d76f151001 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -201,6 +201,15 @@ def test_constructor_raises(): pd.arrays.StringArray(np.array(["a", None], dtype=object)) +def test_from_sequnce_no_mutate(): + a = np.array(["a", pd.NA], dtype=object) + original = a.copy() + result = pd.arrays.StringArray._from_sequence(a) + expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(a, original) + + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") def test_reduce(skipna): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d022b0e97877a..5eb85de2b90f5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1114,11 +1114,16 @@ def test_is_string_array(self): assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(["foo", "bar", np.nan], dtype=object), skipna=False + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False ) assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # NaN is not valid for string array, just NA + assert not lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): From 5e720ce625c1b46bfbe516406be154e9b9d9edf8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 13:45:25 -0600 Subject: [PATCH 03/10] update test --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index a92f917820bd0..c37c78f3b9235 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method): if isinstance(expected, Series): if expected.dtype == "object" and lib.is_string_array( - expected.values, skipna=True + expected.dropna().values, ): assert result.dtype == "string" result = result.astype(object) From 17b6f1027bb1b9ff6dff5ffa82386ab17537559d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 13:48:45 -0600 Subject: [PATCH 04/10] update test --- pandas/core/strings.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f8d9eeb211a1e..0323eafff8dee 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ import numpy as np import pandas._libs.lib as lib +import pandas._libs.missing as libmissing import pandas._libs.ops as libops from pandas._typing import ArrayLike, Dtype from pandas.util._decorators import Appender @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str): return result -def _na_map(f, arr, na_result=np.nan, dtype=object): - # should really _check_ for NA +def _na_map(f, arr, na_result=None, dtype=object): if is_extension_array_dtype(arr.dtype): + if na_result is None: + na_result = libmissing.NA # just StringDtype arr = extract_array(arr) return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + if na_result is None: + na_result = np.nan return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) From 0e2468a3f51a248b1d640c639028c2fbee631448 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 13:49:59 -0600 Subject: [PATCH 05/10] test NaT --- pandas/tests/arrays/string_/test_string.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 9b6d76f151001..1ff09d73ebc2d 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -200,6 +200,9 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="strings or pandas.NA"): pd.arrays.StringArray(np.array(["a", None], dtype=object)) + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + def test_from_sequnce_no_mutate(): a = np.array(["a", pd.NA], dtype=object) From 21e6e590888bab8eb6187c7a0d15d60144ae8121 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 14:03:41 -0600 Subject: [PATCH 06/10] fixup --- pandas/tests/arrays/string_/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1ff09d73ebc2d..5674d738889b2 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -205,7 +205,7 @@ def test_constructor_raises(): def test_from_sequnce_no_mutate(): - a = np.array(["a", pd.NA], dtype=object) + a = np.array(["a", np.nan], dtype=object) original = a.copy() result = pd.arrays.StringArray._from_sequence(a) expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) From f680db9f7cf93e04a88489ea97d05bc6a525b772 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 14:09:39 -0600 Subject: [PATCH 07/10] fixup --- pandas/core/arrays/numpy_.py | 26 +++++++++++++++++++--- pandas/core/arrays/string_.py | 2 +- pandas/tests/arrays/string_/test_string.py | 5 +++-- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 8e63032e18b0c..182acd1cd633c 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -162,12 +162,32 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = np.asarray(scalars, dtype=dtype) if copy and result is scalars: result = result.copy() - # Primarily for StringArray - result = cls._from_sequence_finalize(result, copy=copy) + # _coerce_from_sequence_values is useful for StringArray + result = cls._coerce_from_sequence_values(result, copy=copy) return cls(result) @staticmethod - def _from_sequence_finalize(values, copy): + def _coerce_from_sequence_values(values: np.ndarray, copy: bool): + """ + Coerce the values used in _from_sequence before constructing + + This may mutate `values` inplace. + + Parameters + ---------- + values : ndarray + The values to coerce, probably from asarray on the sequence + passed to _from_sequence. + copy : bool + This should be the value of `copy` passed to _from_sequence. + It's used to determine if we potentially haven't done a + copy, and so a copy will be needed to not mutate the user input. + + Returns + ------- + numpy.ndarray + The coerced values. + """ return values @classmethod diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 0d0ddf9759551..e8cf0e84c8c3f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -182,7 +182,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return result @staticmethod - def _from_sequence_finalize(values, copy): + def _coerce_from_sequence_values(values: np.ndarray, copy: bool): # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5674d738889b2..8e17d3c8cd1e0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -204,10 +204,11 @@ def test_constructor_raises(): pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) -def test_from_sequnce_no_mutate(): +@pytest.mark.parametrize("copy", [True, False]) +def test_from_sequnce_no_mutate(copy): a = np.array(["a", np.nan], dtype=object) original = a.copy() - result = pd.arrays.StringArray._from_sequence(a) + result = pd.arrays.StringArray._from_sequence(a, copy=copy) expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) tm.assert_extension_array_equal(result, expected) tm.assert_numpy_array_equal(a, original) From fad5b7b0752295840a5897aea8a65800fe3a82de Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 16:40:06 -0600 Subject: [PATCH 08/10] inline from_sequence --- pandas/core/arrays/numpy_.py | 24 ------------------------ pandas/core/arrays/string_.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 182acd1cd633c..6476614a19ac9 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -166,30 +166,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = cls._coerce_from_sequence_values(result, copy=copy) return cls(result) - @staticmethod - def _coerce_from_sequence_values(values: np.ndarray, copy: bool): - """ - Coerce the values used in _from_sequence before constructing - - This may mutate `values` inplace. - - Parameters - ---------- - values : ndarray - The values to coerce, probably from asarray on the sequence - passed to _from_sequence. - copy : bool - This should be the value of `copy` passed to _from_sequence. - It's used to determine if we potentially haven't done a - copy, and so a copy will be needed to not mutate the user input. - - Returns - ------- - numpy.ndarray - The coerced values. - """ - return values - @classmethod def _from_factorized(cls, values, original): return cls(values) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e8cf0e84c8c3f..8a61885cd0e3b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -178,22 +178,22 @@ def _validate(self): def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = super()._from_sequence(scalars, dtype=object, copy=copy) - return result - @staticmethod - def _coerce_from_sequence_values(values: np.ndarray, copy: bool): + result = np.asarray(scalars, dtype="object") + if copy and result is scalars: + result = result.copy() + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - na_values = isna(values) + na_values = isna(result) if na_values.any(): - if not copy: + if result is scalars: # force a copy now, if we haven't already - values = values.copy() - values[na_values] = StringDtype.na_value + result = result.copy() + result[na_values] = StringDtype.na_value - return values + return cls(result) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): From 4c2416debfd171194bf52c5a379d293317248276 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jan 2020 16:40:56 -0600 Subject: [PATCH 09/10] fixup docstring --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8a61885cd0e3b..c485d1f50dc9d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -119,7 +119,7 @@ class StringArray(PandasArray): See Also -------- - pandas.array + array The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by From 08e049dd95bc2fdbef9939a2bd35920a2ff52f53 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jan 2020 06:05:48 -0600 Subject: [PATCH 10/10] fixups --- pandas/core/arrays/numpy_.py | 2 -- pandas/tests/arrays/string_/test_string.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6476614a19ac9..4db3d3010adaf 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -162,8 +162,6 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): result = np.asarray(scalars, dtype=dtype) if copy and result is scalars: result = result.copy() - # _coerce_from_sequence_values is useful for StringArray - result = cls._coerce_from_sequence_values(result, copy=copy) return cls(result) @classmethod diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8e17d3c8cd1e0..5e2f14af341ab 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -205,7 +205,7 @@ def test_constructor_raises(): @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequnce_no_mutate(copy): +def test_from_sequence_no_mutate(copy): a = np.array(["a", np.nan], dtype=object) original = a.copy() result = pd.arrays.StringArray._from_sequence(a, copy=copy)