ENH: Add nullable dtypes to read_csv #40687

Closed. lithomas1 wants to merge 41 commits from use-nullable-csv into master.
Commits (41), showing changes from all commits:
148fc9d
Add nullable dtypes to read_csv
lithomas1 Mar 29, 2021
053ecd7
Merge branch 'master' into use-nullable-csv
lithomas1 Mar 29, 2021
a70f3a4
Updates
lithomas1 Mar 30, 2021
1726c29
Merge branch 'use-nullable-csv' of github-other.com:lithomas1/pandas …
lithomas1 Mar 30, 2021
2504be6
More thorough testing
lithomas1 Mar 30, 2021
89f4032
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Mar 31, 2021
63733dc
Optimizations & Found a bug!
lithomas1 Mar 31, 2021
ea63fb2
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Apr 1, 2021
738c340
Merge branch 'master' into use-nullable-csv
lithomas1 Apr 2, 2021
8baf120
WIP
lithomas1 Apr 4, 2021
c822fc6
Merge branch 'use-nullable-csv' of github-other.com:lithomas1/pandas …
lithomas1 Apr 6, 2021
c3ee883
float support working
lithomas1 Apr 13, 2021
d135dbb
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Apr 13, 2021
6d49eaf
Fixes
lithomas1 Apr 13, 2021
680ffb1
Address code review
lithomas1 Apr 18, 2021
1e6da47
Merge branch 'master' into use-nullable-csv
lithomas1 Apr 18, 2021
bba1ba9
Merge branch 'master' into use-nullable-csv
lithomas1 May 5, 2021
d075bed
Finish updating code to master
lithomas1 May 5, 2021
dc03d0f
Preliminary StringArray support
lithomas1 May 6, 2021
99afeb4
Fix tests
lithomas1 May 6, 2021
3e1784d
API: allow nan-likes in StringArray constructor
lithomas1 May 10, 2021
96ff1da
Revert weird changes & Fix stuff
lithomas1 May 11, 2021
418e1d2
Remove failing test
lithomas1 May 11, 2021
25a6c4d
Changes from code review
lithomas1 May 19, 2021
47d68f7
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 19, 2021
8257dbd
typo
lithomas1 May 20, 2021
922436a
Update lib.pyi
lithomas1 May 21, 2021
2f28086
Update lib.pyx
lithomas1 May 29, 2021
3ee2198
Update lib.pyx
lithomas1 May 29, 2021
9426a52
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 30, 2021
3ee55f2
Updates
lithomas1 May 30, 2021
fe4981a
Update lib.pyx
lithomas1 May 30, 2021
a66948a
Update lib.pyx
lithomas1 May 30, 2021
e852719
Update lib.pyx
lithomas1 May 31, 2021
91b73bb
disallow invalid nans in stringarray constructor
lithomas1 Jun 4, 2021
42ec3e4
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
41f49d2
add to _from_sequence and fixes
lithomas1 Jun 4, 2021
156d29f
Merge branch 'master' into use-nullable-csv
lithomas1 Jun 4, 2021
033580f
Update to make work
lithomas1 Jun 4, 2021
f437d77
Merge branch 'stringarray-nan' into use-nullable-csv
lithomas1 Jun 4, 2021
e4ed02e
Merge branch 'master' into use-nullable-csv
lithomas1 Jun 5, 2021
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -225,6 +225,7 @@ Other enhancements
- Add support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`)
- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
- :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`)
- :meth:`Series.round` and :meth:`DataFrame.round` now work with nullable integer and floating dtypes (:issue:`38844`)
- :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
@@ -644,8 +645,10 @@ Other API changes
- Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None``) objects will no longer compare as equal to fully initialized dtype objects.
- Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
- Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`)
- :class:`StringArray` now accepts nan-likes (``None``, ``nan``, ``NA``) in its constructor in addition to strings.
- Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`)


Build
=====

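To make the new whatsnew entry concrete, here is a hedged sketch of how `use_nullable_dtypes` would be used, assuming the keyword is plumbed through `read_csv` as described. It only exists on this branch, and the exact resulting dtypes depend on the parsed values:

```python
import io

import pandas as pd

csv = "a,b,c\n1,x,True\n2,y,\n,,False\n"

# Proposed keyword from this PR; not available in released pandas at the time.
# Columns with missing values keep <NA> instead of being upcast to float/object.
df = pd.read_csv(io.StringIO(csv), use_nullable_dtypes=True)
print(df.dtypes)
# Expected per this PR's implementation:
# a      Int64      (instead of float64)
# b     string      (instead of object)
# c    boolean      (instead of object)
```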
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
@@ -157,7 +157,7 @@ def maybe_convert_numeric(
def ensure_string_array(
arr,
na_value: object = np.nan,
convert_na_value: bool = True,
coerce: str = "all",
copy: bool = True,
skipna: bool = True,
) -> np.ndarray: ... # np.ndarray[object]
41 changes: 32 additions & 9 deletions pandas/_libs/lib.pyx
@@ -98,6 +98,7 @@ from pandas._libs.missing cimport (
is_null_timedelta64,
isnaobj,
)
from pandas._libs.missing import checknull
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
@@ -682,21 +683,31 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
coerce="all",
bint copy=True,
bint skipna=True,
):
"""
Returns a new numpy array with object dtype and only strings and na values.
Checks that all elements in the numpy array are strings or null
and, if so, returns a new numpy array with object dtype
and only strings and na values. Otherwise, raises a ValueError.

Parameters
----------
arr : array-like
The values to be converted to str, if needed.
na_value : Any, default np.nan
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all'
Whether to coerce non-string elements to strings.
- 'all' will convert null values and non-null non-string values.
- 'strict-null' will only convert pd.NA, np.nan, or None to na_value,
without converting other non-strings.
- 'null' will convert nulls to na_value without converting other non-strings.
- 'non-null' will only convert non-null non-string elements to string.
- None will not convert anything.
With 'strict-null', 'null', or None, a ValueError is raised if a
non-null value that is not a string is encountered.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Expand All @@ -710,6 +721,7 @@ cpdef ndarray[object] ensure_string_array(
"""
cdef:
Py_ssize_t i = 0, n = len(arr)
set strict_na_values = {C_NA, np.nan, None}

if hasattr(arr, "to_numpy"):
arr = arr.to_numpy()
Expand All @@ -721,16 +733,27 @@ cpdef ndarray[object] ensure_string_array(
if copy and result is arr:
result = result.copy()

if coerce == 'strict-null':
# We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
# If they are present, they are treated like a regular Python object
# and will either cause an exception to be raised or be coerced.
check_null = strict_na_values.__contains__
else:
check_null = checknull

for i in range(n):
val = arr[i]

if isinstance(val, str):
continue

if not checknull(val):
result[i] = str(val)
if not check_null(val):
if coerce =="all" or coerce == "non-null":
result[i] = str(val)
else:
raise ValueError("Non-string element encountered in array.")
else:
if convert_na_value:
if coerce=="all" or coerce == "null" or coerce == 'strict-null':
val = na_value
if skipna:
result[i] = val
@@ -1864,8 +1887,8 @@ cdef class StringValidator(Validator):
return issubclass(self.dtype.type, np.str_)

cdef bint is_valid_null(self, object value) except -1:
# We deliberately exclude None / NaN here since StringArray uses NA
return value is C_NA
# Override to exclude float('Nan') and complex NaN
return value is None or value is C_NA or value is np.nan


cpdef bint is_string_array(ndarray values, bint skipna=False):
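A rough illustration of the new `coerce` modes of `ensure_string_array`, as they read from the diff above. `pandas._libs.lib` is a private module, so treat this as a sketch of the intended semantics on this branch rather than public API:

```python
import numpy as np
import pandas as pd
from pandas._libs import lib

mixed = np.array(["a", None, 1.5], dtype=object)

# coerce="all" (default): nulls become na_value, non-strings are stringified.
print(lib.ensure_string_array(mixed, na_value=pd.NA, coerce="all"))
# ['a' <NA> '1.5']

# coerce="strict-null": only None, np.nan and pd.NA map to na_value;
# any other non-string (here 1.5) raises.
try:
    lib.ensure_string_array(mixed, na_value=pd.NA, coerce="strict-null")
except ValueError as err:
    print(err)  # Non-string element encountered in array.

# coerce="non-null": non-strings are stringified, nulls are left untouched.
print(lib.ensure_string_array(mixed, coerce="non-null"))
# ['a' None '1.5']
```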
63 changes: 54 additions & 9 deletions pandas/_libs/parsers.pyx
@@ -92,6 +92,7 @@ from pandas._libs.khash cimport (
kh_strbox_t,
khiter_t,
)
from pandas._libs.missing cimport C_NA

from pandas.errors import (
EmptyDataError,
@@ -109,6 +110,13 @@ from pandas.core.dtypes.common import (
)
from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
)
from pandas.core.arrays.string_ import StringDtype

cdef:
float64_t INF = <float64_t>np.inf
float64_t NEGINF = -INF
@@ -311,7 +319,7 @@ cdef class TextReader:
object handle
object orig_header
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
bint mangle_dupe_cols, allow_leading_cols
bint mangle_dupe_cols, allow_leading_cols, use_nullable_dtypes
uint64_t parser_start # this is modified after __init__
list clocks
const char *encoding_errors
@@ -369,6 +377,7 @@ cdef class TextReader:
bint mangle_dupe_cols=True,
float_precision=None,
bint skip_blank_lines=True,
bint use_nullable_dtypes=False,
encoding_errors=b"strict"):

# set encoding for native Python and C library
@@ -432,6 +441,7 @@ cdef class TextReader:
# consistent with csv module semantics, cast all to float
dtype_order = dtype_order[1:]
self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
self.use_nullable_dtypes = use_nullable_dtypes

if comment is not None:
if len(comment) > 1:
@@ -1035,7 +1045,7 @@ cdef class TextReader:
# don't try to upcast EAs
try_upcast = upcast_na and na_count > 0
if try_upcast and not is_extension_array_dtype(col_dtype):
col_res = _maybe_upcast(col_res)
col_res = _maybe_upcast(col_res, self.use_nullable_dtypes)

if col_res is None:
raise ParserError(f'Unable to parse column {i}')
@@ -1328,18 +1338,53 @@ STR_NA_VALUES = {
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(arr):
def _maybe_upcast(arr, use_nullable_dtypes=False):
"""
Tries to upcast the array so it can hold missing values, or converts it to a
nullable extension array if ``use_nullable_dtypes`` is True.

Parameters
----------
arr : ndarray
Array to upcast.
use_nullable_dtypes : bool, default False
Whether to use nullable datatypes instead of upcasting.
If True, then:
- int w/ NaN -> IntegerArray
- bool w/ NaN -> BooleanArray
- float w/ NaN -> FloatingArray
- object (strings) w/ NaN -> StringArray

"""
na_value = na_values[arr.dtype]
if issubclass(arr.dtype.type, np.integer):
na_value = na_values[arr.dtype]
arr = arr.astype(float)
np.putmask(arr, arr == na_value, np.nan)
mask = arr == na_value
if use_nullable_dtypes:
# only convert to integer array if not all NAN
if not mask.all():
arr = IntegerArray(arr, mask)
else:
arr = arr.astype(float)
arr = FloatingArray(arr, mask)
else:
arr = arr.astype(float)
np.putmask(arr, mask, np.nan)
elif arr.dtype == np.bool_:
mask = arr.view(np.uint8) == na_values[np.uint8]
arr = arr.astype(object)
np.putmask(arr, mask, np.nan)
mask = arr.view(np.uint8) == na_value
if use_nullable_dtypes:
arr = BooleanArray(arr, mask)
else:
arr = arr.astype(object)
np.putmask(arr, mask, np.nan)
elif use_nullable_dtypes and arr.dtype == np.float64:
mask = np.isnan(arr)
arr = FloatingArray(arr, mask)
elif use_nullable_dtypes and arr.dtype == np.object_:
# Maybe convert to StringArray; catch ValueError for non-string elements
try:
arr = StringDtype.construct_array_type()._from_sequence(arr)
except ValueError:
pass

return arr

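The integer branch of `_maybe_upcast` above is the heart of the feature: rather than upcasting an integer column with missing values to `float64`, the parser keeps the integer data and pairs it with a boolean mask. Below is a small standalone sketch of that idea; the sentinel value and the helper name are illustrative only, not the parser's actual internals:

```python
import numpy as np
from pandas.arrays import FloatingArray, IntegerArray

SENTINEL = np.iinfo(np.int64).min  # stand-in for the parser's integer NA sentinel


def upcast_int_column(parsed: np.ndarray, use_nullable_dtypes: bool):
    """Sketch of the integer branch of _maybe_upcast."""
    mask = parsed == SENTINEL
    if use_nullable_dtypes:
        if not mask.all():
            # keep the integers, track missing values in the mask
            return IntegerArray(parsed, mask)
        # an all-NA column cannot stay integer; fall back to a nullable float
        return FloatingArray(parsed.astype(float), mask)
    # legacy behavior: upcast to float64 and write NaN over the sentinel
    out = parsed.astype(float)
    np.putmask(out, mask, np.nan)
    return out


col = np.array([1, 2, SENTINEL, 4], dtype=np.int64)
print(upcast_int_column(col, use_nullable_dtypes=True).dtype)   # Int64
print(upcast_int_column(col, use_nullable_dtypes=False))        # [ 1.  2. nan  4.]
```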
40 changes: 32 additions & 8 deletions pandas/core/arrays/string_.py
@@ -144,11 +144,18 @@ class StringArray(PandasArray):
.. warning::

Currently, this expects an object-dtype ndarray
where the elements are Python strings or :attr:`pandas.NA`.
where the elements are Python strings
or nan-likes (``None``, ``nan``, ``NA``).
This may change without warning in the future. Use
:meth:`pandas.array` with ``dtype="string"`` for a stable way of
creating a `StringArray` from any sequence.

.. versionchanged:: 1.3

StringArray now accepts nan-likes in the constructor in addition
to strings, whereas it only accepted strings and :attr:`pandas.NA`
before.

copy : bool, default False
Whether to copy the array of data.

@@ -208,40 +215,57 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
if not isinstance(values, type(self)):
self._validate()
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype())
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
"""Validate that we only store NA or strings."""
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
try:
lib.ensure_string_array(
self._ndarray,
na_value=StringDtype.na_value,
coerce="strict-null",
copy=False,
)
except ValueError:
raise ValueError("StringArray requires a sequence of strings or pandas.NA")

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
def _from_sequence(
cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
):
if dtype:
assert dtype == "string"

from pandas.core.arrays.masked import BaseMaskedArray

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype
if coerce:
coerce = "non-null"
else:
coerce = None
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
result[na_values] = StringDtype.na_value

else:
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(
scalars, na_value=StringDtype.na_value, copy=copy
scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
)

# Manually creating new array avoids the validation step in the __init__, so is
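Given the relaxed `_validate` above, constructing a `StringArray` from an object ndarray containing nan-likes would now succeed, with each nan-like normalized to `pandas.NA`. A sketch of the behavior proposed on this branch (released pandas raised for `None`/`np.nan` at the time):

```python
import numpy as np
import pandas as pd
from pandas.arrays import StringArray

values = np.array(["a", None, np.nan, pd.NA, "b"], dtype=object)
arr = StringArray(values)  # previously raised ValueError for None / np.nan
print(arr)
# <StringArray>
# ['a', <NA>, <NA>, <NA>, 'b']
# Length: 5, dtype: string

# Non-string, non-null elements are still rejected:
try:
    StringArray(np.array(["a", 1], dtype=object))
except ValueError as err:
    print(err)  # StringArray requires a sequence of strings or pandas.NA
```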
16 changes: 13 additions & 3 deletions pandas/core/arrays/string_arrow.py
@@ -237,7 +237,9 @@ def __init__(self, values):
)

@classmethod
def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
def _from_sequence(
cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
):
from pandas.core.arrays.masked import BaseMaskedArray

_chk_pyarrow_available()
@@ -247,11 +249,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
# numerical issues with Float32Dtype
na_values = scalars._mask
result = scalars._data
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
if coerce:
coerce = "non-null"
else:
coerce = None
result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
return cls(pa.array(result, mask=na_values, type=pa.string()))

# convert non-na-likes to str
result = lib.ensure_string_array(scalars, copy=copy)
if coerce:
coerce = "all"
else:
coerce = "strict-null"
result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
return cls(pa.array(result, type=pa.string(), from_pandas=True))

@classmethod
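The arrow-backed string array gets the same `coerce` plumbing. Assuming `pyarrow` is installed and the class in `pandas/core/arrays/string_arrow.py` is `ArrowStringArray` (as on master at the time), something like the following would exercise it; `_from_sequence` is private API, so this is illustrative only:

```python
import numpy as np
from pandas.core.arrays.string_arrow import ArrowStringArray

# Requires pyarrow. With the default coerce=True, nan-likes become nulls in the
# backing pyarrow array, mirroring the object-backed StringArray behavior.
arr = ArrowStringArray._from_sequence(np.array(["a", None, np.nan], dtype=object))
print(list(arr))
# ['a', <NA>, <NA>]
```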
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -1105,7 +1105,7 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)

if issubclass(dtype.type, str):
return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")

elif is_datetime64_dtype(arr):
if dtype == np.int64:
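For context on the `coerce="non-null"` replacement of `convert_na_value=False` just above: it preserves the long-standing `astype(str)` behavior, where non-null values are stringified and missing-value handling is governed by `skipna` rather than a forced swap to `na_value`. A quick sanity check (this is released-pandas behavior, not something new on this branch):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.5, np.nan, 3.0])
print(s.astype(str).tolist())
# ['1.5', 'nan', '3.0']
```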