pandas-dev · lithomas1 · May 10, 2021 · May 11, 2021 · May 11, 2021 · May 19, 2021
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -3,10 +3,12 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Series,
 )
+from pandas.core.arrays import StringArray
 
 from .pandas_vb_common import tm
 
@@ -285,3 +287,18 @@ class Iter(Dtypes):
     def time_iter(self, dtype):
         for i in self.s:
             pass
+
+
+class StringArrayConstruction:
+    def setup(self):
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
+
+    def time_string_array_construction(self):
+        StringArray(self.series_arr)
+
+    def time_string_array_with_nan_construction(self):
+        StringArray(self.series_arr_nan)
+
+    def peakmem_stringarray_construction(self):
+        StringArray(self.series_arr)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -330,7 +330,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor
 Other API changes
 ^^^^^^^^^^^^^^^^^
 - :meth:`Index.get_indexer_for` no longer accepts keyword arguments (other than 'target'); in the past these would be silently ignored if the index was not unique (:issue:`42310`)
--
+- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -150,7 +150,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = ...,
-    convert_na_value: bool = ...,
+    coerce: str = ...,
     copy: bool = ...,
     skipna: bool = ...,
 ) -> npt.NDArray[np.object_]: ...

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -99,6 +99,7 @@ from pandas._libs.missing cimport (
     is_null_timedelta64,
     isnaobj,
 )
+from pandas._libs.missing import checknull
 from pandas._libs.tslibs.conversion cimport convert_to_tsobject
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -675,7 +676,7 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
 cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
-        bint convert_na_value=True,
+        coerce="all",
         bint copy=True,
         bint skipna=True,
 ):
@@ -688,8 +689,16 @@ cpdef ndarray[object] ensure_string_array(
         The values to be converted to str, if needed.
     na_value : Any, default np.nan
         The value to use for na. For example, np.nan or pd.NA.
-    convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array.
+    coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all'
+        Whether to coerce non-string elements to strings.
+            - 'all' will convert null values and non-null non-string values.
+            - 'strict-null' will only convert pd.NA, np.nan, or None to na_value
+              without converting other non-strings.
+            - 'null' will convert nulls to na_value w/out converting other non-strings.
+            - 'non-null' will only convert non-null non-string elements to string.
+            - None will not convert anything.
+        If coerce is 'strict-null', 'null' or None, a ValueError is raised for
+        values that are neither strings nor recognized as null.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -699,10 +708,20 @@ cpdef ndarray[object] ensure_string_array(
     Returns
     -------
     np.ndarray[object]
-        An array with the input array's elements casted to str or nan-like.
+        An array of strings and na_value.
+
+    Raises
+    ------
+    ValueError
+        If an element is encountered that is not a string or valid NA value
+        and element is not coerced.
     """
+    if coerce not in {"all", "strict-null", "null", "non-null", None}:
+        raise ValueError("coerce argument must be one of "
+                         f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}")
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        set strict_na_values = {C_NA, np.nan, None}
 
     if hasattr(arr, "to_numpy"):
 
@@ -722,21 +741,34 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
+    if coerce == 'strict-null':
+        # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
+        # If they are present, they are treated like a regular Python object
+        # and will either cause an exception to be raised or be coerced.
+        check_null = strict_na_values.__contains__
+    else:
+        check_null = checknull
+
     for i in range(n):
         val = arr[i]
 
         if isinstance(val, str):
             continue
 
-        if not checknull(val):
-            if not isinstance(val, np.floating):
-                # f"{val}" is faster than str(val)
-                result[i] = f"{val}"
+        if not check_null(val):
+            if coerce =="all" or coerce == "non-null":
+                if not isinstance(val, np.floating):
+                    # f"{val}" is faster than str(val)
+                    result[i] = f"{val}"
+                else:
+                    # f"{val}" is not always equivalent to str(val) for floats
+                    result[i] = str(val)
             else:
-                # f"{val}" is not always equivalent to str(val) for floats
-                result[i] = str(val)
+                raise ValueError(f"Element {val} is not a string or valid null."
+                                 "If you want it to be coerced to a string,"
+                                 "specify coerce='all'")
         else:
-            if convert_na_value:
+            if coerce=="all" or coerce == "null" or coerce == 'strict-null':
                 val = na_value
             if skipna:
                 result[i] = val
@@ -1861,8 +1893,8 @@ cdef class StringValidator(Validator):
         return issubclass(self.dtype.type, np.str_)
 
     cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
+        # Only None, NA and the np.nan singleton; other float/complex NaNs are excluded
+        return value is None or value is C_NA or value is np.nan
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -247,11 +247,18 @@ class StringArray(BaseStringArray, PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes (``None``, ``np.nan``, ``NA``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.4.0
+
+           StringArray now accepts nan-likes (``None``, ``np.nan``) for the
+           ``values`` parameter in its constructor
+           in addition to strings and :attr:`pandas.NA`.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -311,6 +318,8 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
@@ -319,18 +328,25 @@ def __init__(self, values, copy=False):
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(
-            self._ndarray.ravel("K"), skipna=True
-        ):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            lib.ensure_string_array(
+                self._ndarray.ravel("K"),
+                na_value=StringDtype.na_value,
+                coerce="strict-null",
+                copy=False,
+            )
+        except ValueError:
+            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
-    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
+    def _from_sequence(
+        cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
+    ):
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
             assert isinstance(dtype, StringDtype) and dtype.storage == "python"
@@ -339,15 +355,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             result[na_values] = StringDtype.na_value
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+            if coerce:
+                coerce = "all"
+            else:
+                coerce = "strict-null"
             result = lib.ensure_string_array(
-                scalars, na_value=StringDtype.na_value, copy=copy
+                scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
             )
 
         # Manually creating new array avoids the validation step in the __init__, so is

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -158,7 +158,9 @@ def __init__(self, values):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
+    def _from_sequence(
+        cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
+    ):
         from pandas.core.arrays.masked import BaseMaskedArray
 
         _chk_pyarrow_available()
@@ -172,11 +174,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
+        if coerce:
+            coerce = "all"
+        else:
+            coerce = "strict-null"
+        result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
 
     @classmethod

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -746,7 +746,7 @@ def _try_cast(
 
     elif dtype.kind == "U":
         # TODO: test cases with arr.dtype.kind in ["m", "M"]
-        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
+        return lib.ensure_string_array(arr, coerce="non-null", copy=copy)
 
     elif dtype.kind in ["m", "M"]:
         return maybe_cast_to_datetime(arr, dtype)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1121,7 +1121,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
+        return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")
 
     elif is_datetime64_dtype(arr):
         # Non-overlapping equality check (left operand type: "dtype[Any]", right

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -5,12 +5,14 @@
 import numpy as np
 import pytest
 
+import pandas._libs.lib as lib
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_dtype_equal
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.core.arrays import BaseMaskedArray
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
 
@@ -269,13 +271,61 @@ def test_constructor_raises(cls):
         cls(np.array([]))
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
+        cls(np.array(["a", pd.NaT], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
+
+
+def test_invalid_coerce_raises():
+    data = np.array(["a", "b'"], dtype=object)
+    with pytest.raises(
+        ValueError,
+        match="coerce argument must be one of "
+        r"'all'\|'strict-null'\|'null'\|'non-null'\|None, not abcd",  # literal '|'
+    ):
+        lib.ensure_string_array(data, coerce="abcd")
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        np.array(["foo", "bar", pd.NA], dtype=object),
+        np.array(["foo", "bar", np.nan], dtype=object),
+        np.array(["foo", "bar", None], dtype=object),
+        BaseMaskedArray(
+            np.array(["foo", "bar", "garbage"]), np.array([False, False, True])
+        ),
+    ],
+)
+def test_from_sequence_no_coerce(cls, values):
+    expected = pd.arrays.StringArray(np.array(["foo", "bar", pd.NA], dtype=object))
+    result = cls._from_sequence(values, coerce=False)
+    # Use bare assert since classes are different
+    assert (result == expected).all()
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        np.array(["foo", "bar", pd.NaT], dtype=object),
+        np.array(["foo", "bar", np.datetime64("nat")], dtype=object),
+        np.array(["foo", "bar", float("nan")], dtype=object),
+    ],
+)
+def test_from_sequence_no_coerce_invalid(cls, values):
+    with pytest.raises(
+        ValueError,
+        match="Element .* is not a string or valid null."
+        "If you want it to be coerced to a string,"
+        "specify coerce='all'",
+    ):
+        cls._from_sequence(values, coerce=False)
 
 
 @pytest.mark.parametrize("copy", [True, False])