ENH: Add nullable dtypes to read_csv #40687


Closed
wants to merge 41 commits into from
Changes from 20 commits
148fc9d
Add nullable dtypes to read_csv
lithomas1 Mar 29, 2021
053ecd7
Merge branch 'master' into use-nullable-csv
lithomas1 Mar 29, 2021
a70f3a4
Updates
lithomas1 Mar 30, 2021
1726c29
Merge branch 'use-nullable-csv' of github-other.com:lithomas1/pandas …
lithomas1 Mar 30, 2021
2504be6
More thorough testing
lithomas1 Mar 30, 2021
89f4032
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Mar 31, 2021
63733dc
Optimizations & Found a bug!
lithomas1 Mar 31, 2021
ea63fb2
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Apr 1, 2021
738c340
Merge branch 'master' into use-nullable-csv
lithomas1 Apr 2, 2021
8baf120
WIP
lithomas1 Apr 4, 2021
c822fc6
Merge branch 'use-nullable-csv' of github-other.com:lithomas1/pandas …
lithomas1 Apr 6, 2021
c3ee883
float support working
lithomas1 Apr 13, 2021
d135dbb
Merge branch 'master' of https://github.com/pandas-dev/pandas into us…
lithomas1 Apr 13, 2021
6d49eaf
Fixes
lithomas1 Apr 13, 2021
680ffb1
Address code review
lithomas1 Apr 18, 2021
1e6da47
Merge branch 'master' into use-nullable-csv
lithomas1 Apr 18, 2021
bba1ba9
Merge branch 'master' into use-nullable-csv
lithomas1 May 5, 2021
d075bed
Finish updating code to master
lithomas1 May 5, 2021
dc03d0f
Preliminary StringArray support
lithomas1 May 6, 2021
99afeb4
Fix tests
lithomas1 May 6, 2021
3e1784d
API: allow nan-likes in StringArray constructor
lithomas1 May 10, 2021
96ff1da
Revert weird changes & Fix stuff
lithomas1 May 11, 2021
418e1d2
Remove failing test
lithomas1 May 11, 2021
25a6c4d
Changes from code review
lithomas1 May 19, 2021
47d68f7
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 19, 2021
8257dbd
typo
lithomas1 May 20, 2021
922436a
Update lib.pyi
lithomas1 May 21, 2021
2f28086
Update lib.pyx
lithomas1 May 29, 2021
3ee2198
Update lib.pyx
lithomas1 May 29, 2021
9426a52
Merge branch 'master' of https://github.com/pandas-dev/pandas into st…
lithomas1 May 30, 2021
3ee55f2
Updates
lithomas1 May 30, 2021
fe4981a
Update lib.pyx
lithomas1 May 30, 2021
a66948a
Update lib.pyx
lithomas1 May 30, 2021
e852719
Update lib.pyx
lithomas1 May 31, 2021
91b73bb
disallow invalid nans in stringarray constructor
lithomas1 Jun 4, 2021
42ec3e4
Merge branch 'master' into stringarray-nan
lithomas1 Jun 4, 2021
41f49d2
add to _from_sequence and fixes
lithomas1 Jun 4, 2021
156d29f
Merge branch 'master' into use-nullable-csv
lithomas1 Jun 4, 2021
033580f
Update to make work
lithomas1 Jun 4, 2021
f437d77
Merge branch 'stringarray-nan' into use-nullable-csv
lithomas1 Jun 4, 2021
e4ed02e
Merge branch 'master' into use-nullable-csv
lithomas1 Jun 5, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -220,6 +220,7 @@ Other enhancements
- :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
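The ``use_nullable_dtypes`` entry above refers to pandas' nullable extension dtypes. A minimal sketch of what those dtypes do, using only stable public API (not specific to this PR): integer and boolean data keep their types in the presence of missing values, with ``pd.NA`` as the indicator instead of upcasting to ``float64``/``object``.

```python
import pandas as pd

# Nullable extension dtypes keep integer/boolean types in the presence of
# missing values, using pd.NA instead of upcasting to float64/object.
ints = pd.array([1, None, 3], dtype="Int64")
bools = pd.array([True, None, False], dtype="boolean")

print(ints.dtype)        # Int64
print(ints[1] is pd.NA)  # True
```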
20 changes: 17 additions & 3 deletions pandas/_libs/lib.pyx
@@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
bint coerce=True,
bint copy=True,
bint skipna=True,
):
"""
Returns a new numpy array with object dtype and only strings and na values.
Checks that all elements in the array are strings or null values and, if so,
returns a new numpy array with object dtype containing only strings and na
values. Otherwise, raises a ValueError.

Parameters
----------
@@ -693,6 +696,9 @@
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
coerce : bool, default True
Whether to coerce non-null, non-string elements to strings.
If False, a ValueError is raised when such an element is encountered.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
@@ -724,7 +730,10 @@
continue

if not checknull(val):
if coerce:
result[i] = str(val)
else:
raise ValueError("Non-string element encountered in array.")
else:
if convert_na_value:
val = na_value
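The new ``coerce`` flag can be illustrated with a pure-Python sketch of the Cython routine above (the function name and null check here are simplified and illustrative, not the actual implementation): strings pass through, nulls are replaced with ``na_value``, and anything else is either stringified or rejected.

```python
import numpy as np

def ensure_string_array_sketch(arr, na_value=np.nan, coerce=True):
    # Pure-Python sketch of the Cython ensure_string_array above:
    # keep strings, replace nulls with na_value, and either coerce or
    # reject any other element depending on `coerce`.
    result = np.array(arr, dtype=object)
    for i, val in enumerate(result):
        is_null = val is None or (isinstance(val, float) and np.isnan(val))
        if is_null:
            result[i] = na_value
        elif not isinstance(val, str):
            if coerce:
                result[i] = str(val)
            else:
                raise ValueError("Non-string element encountered in array.")
    return result
```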
@@ -2059,7 +2068,7 @@ def maybe_convert_numeric(
upcasting for ints with nulls to float64.
Returns
-------
np.ndarray or tuple of converted values and its mask
Array of converted object values to numerical ones.

Optional[np.ndarray]
@@ -2224,6 +2233,11 @@ def maybe_convert_numeric(
# This occurs since we disabled float nulls showing as null in anticipation
# of seeing ints that were never seen. So then, we return float
if allow_null_in_int and seen.null_ and not seen.int_:
seen.float_ = True

if seen.complex_:
return (complexes, None)
elif seen.float_:
63 changes: 54 additions & 9 deletions pandas/_libs/parsers.pyx
@@ -92,6 +92,7 @@ from pandas._libs.khash cimport (
kh_strbox_t,
khiter_t,
)
from pandas._libs.missing cimport C_NA

from pandas.errors import (
EmptyDataError,
@@ -109,6 +110,13 @@ from pandas.core.dtypes.common import (
)
from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
StringArray,
)

cdef:
float64_t INF = <float64_t>np.inf
float64_t NEGINF = -INF
@@ -307,7 +315,7 @@ cdef class TextReader:
object handle
object orig_header
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
bint mangle_dupe_cols, allow_leading_cols
bint mangle_dupe_cols, allow_leading_cols, use_nullable_dtypes
uint64_t parser_start # this is modified after __init__
list clocks
const char *encoding_errors
@@ -366,6 +374,7 @@ cdef class TextReader:
bint mangle_dupe_cols=True,
float_precision=None,
bint skip_blank_lines=True,
bint use_nullable_dtypes=False,
encoding_errors=b"strict"):

# set encoding for native Python and C library
@@ -429,6 +438,7 @@
# consistent with csv module semantics, cast all to float
dtype_order = dtype_order[1:]
self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
self.use_nullable_dtypes = use_nullable_dtypes

if comment is not None:
if len(comment) > 1:
@@ -1020,7 +1030,7 @@ cdef class TextReader:
# don't try to upcast EAs
try_upcast = upcast_na and na_count > 0
if try_upcast and not is_extension_array_dtype(col_dtype):
col_res = _maybe_upcast(col_res)
col_res = _maybe_upcast(col_res, self.use_nullable_dtypes)

if col_res is None:
raise ParserError(f'Unable to parse column {i}')
@@ -1313,18 +1323,53 @@ STR_NA_VALUES = {
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(arr, use_nullable_dtypes=False):
"""
Tries to upcast null values, or uses nullable dtypes if use_nullable_dtypes is True.

Parameters
----------
arr : ndarray
Array to upcast.
use_nullable_dtypes : bool, default False
Whether to use nullable dtypes instead of upcasting.
If True, then:
- int with NaN -> IntegerArray
- bool with NaN -> BooleanArray
- float with NaN -> FloatingArray
- object (strings) with NaN -> StringArray
"""
na_value = na_values[arr.dtype]
if issubclass(arr.dtype.type, np.integer):
mask = arr == na_value
if use_nullable_dtypes:
# only convert to IntegerArray if not all values are NA
if not mask.all():
arr = IntegerArray(arr, mask)
else:
arr = arr.astype(float)
arr = FloatingArray(arr, mask)
else:
arr = arr.astype(float)
np.putmask(arr, mask, np.nan)
elif arr.dtype == np.bool_:
mask = arr.view(np.uint8) == na_value
if use_nullable_dtypes:
arr = BooleanArray(arr, mask)
else:
arr = arr.astype(object)
np.putmask(arr, mask, np.nan)
elif use_nullable_dtypes and arr.dtype == np.float64:
mask = np.isnan(arr)
arr = FloatingArray(arr, mask)
elif use_nullable_dtypes and arr.dtype == np.object_:
# Maybe convert to StringArray; catch ValueError for non-strings
try:
arr = StringArray(arr)
Review comment (Member):
The constructors for StringArray and ArrowStringArray are incompatible. To allow the global default for StringDtype backend storage (pyarrow or python) to be used here in the future (after #39908 is merged), we should probably call StringDtype.construct_array_type._from_sequence.

except ValueError:
pass

return arr
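The integer branch of `_maybe_upcast` can be sketched with public pandas API (the helper name is hypothetical, and `pd.arrays.IntegerArray` stands in for the internal import): values equal to the sentinel are masked and either wrapped in an `IntegerArray` or upcast to float with `NaN`.

```python
import numpy as np
import pandas as pd

def maybe_upcast_ints(arr, na_value, use_nullable_dtypes=False):
    # Simplified stand-in for the integer branch of _maybe_upcast above,
    # using the public pd.arrays.IntegerArray constructor.
    mask = arr == na_value
    if use_nullable_dtypes:
        # keep integer dtype; sentinel positions become pd.NA
        return pd.arrays.IntegerArray(arr, mask)
    # legacy behavior: upcast to float64 and fill with NaN
    out = arr.astype(float)
    out[mask] = np.nan
    return out
```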

17 changes: 13 additions & 4 deletions pandas/core/arrays/string_.py
@@ -208,21 +208,30 @@ def __init__(self, values, copy=False):
values = extract_array(values)

super().__init__(values, copy=copy)
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype())
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
"""Validate that we only store NA or strings."""
if self._ndarray.dtype != "object":
raise ValueError(
"StringArray requires a sequence of strings or pandas.NA. Got "
f"'{self._ndarray.dtype}' dtype instead."
)
try:
lib.ensure_string_array(
self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False
)
NDArrayBacked.__init__(
self,
self._ndarray,
StringDtype(),
)
except ValueError:
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
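The stricter validation above means `StringArray` accepts only strings and NA values and rejects everything else instead of coercing. A short illustration via the public constructor:

```python
import numpy as np
import pandas as pd

# StringArray only accepts object-dtype arrays of strings and NA values;
# non-string elements are rejected rather than silently coerced.
ok = pd.arrays.StringArray(np.array(["a", pd.NA, "b"], dtype=object))

try:
    pd.arrays.StringArray(np.array(["a", 1], dtype=object))
    raised = False
except ValueError:
    raised = True
```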

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
60 changes: 50 additions & 10 deletions pandas/io/parsers/base_parser.py
@@ -44,6 +44,7 @@
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
is_float_dtype,
is_integer,
is_integer_dtype,
is_list_like,
@@ -56,7 +57,17 @@
from pandas.core.dtypes.missing import isna

from pandas.core import algorithms
from pandas.core.arrays import Categorical
from pandas.core.api import (
NA,
array as pd_array,
)
from pandas.core.arrays import (
BooleanArray,
Categorical,
FloatingArray,
IntegerArray,
StringArray,
)
from pandas.core.indexes.api import (
Index,
MultiIndex,
@@ -111,6 +122,7 @@
"mangle_dupe_cols": True,
"infer_datetime_format": False,
"skip_blank_lines": True,
"use_nullable_dtypes": False,
"encoding_errors": "strict",
}

@@ -205,6 +217,7 @@ def __init__(self, kwds):

self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False)
self.handles: Optional[IOHandles] = None

def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
Expand Down Expand Up @@ -573,10 +586,7 @@ def _convert_to_ndarrays(
)

# type specified in dtype param or cast_type is an EA
if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
if not is_ea and na_count > 0:
try:
if is_bool_dtype(cast_type):
@@ -671,12 +681,12 @@ def _infer_types(self, values, na_values, try_num_bool=True):
----------
values : ndarray
na_values : set
try_num_bool : bool, default True
try to cast values to numeric (first preference) or boolean

Returns
-------
converted : ndarray or ExtensionArray
na_count : int
"""
na_count = 0
@@ -689,14 +699,29 @@
na_count = mask.sum() # type: ignore[assignment]
if na_count > 0:
if is_integer_dtype(values):
if self.use_nullable_dtypes:
values = pd_array(values, dtype="Int64")
values[mask] = NA  # <- this is pd.NA
return values, na_count
else:
values = values.astype(np.float64)
np.putmask(values, mask, np.nan)
return values, na_count

if try_num_bool and is_object_dtype(values.dtype):
# exclude e.g DatetimeIndex here
try:
result, mask = lib.maybe_convert_numeric(
values,
na_values,
False,
convert_to_masked_nullable=self.use_nullable_dtypes,
)
if mask is not None:
if is_integer_dtype(result):
result = IntegerArray(result, mask)
elif is_float_dtype(result):
result = FloatingArray(result, mask)
except (ValueError, TypeError):
# e.g. encountering datetime string gets ValueError
# TypeError can be raised in floatify
@@ -710,11 +735,26 @@
na_count = parsers.sanitize_objects(values, na_values, False)

if result.dtype == np.object_ and try_num_bool:
result, mask = libops.maybe_convert_bool(
np.asarray(values),
true_values=self.true_values,
false_values=self.false_values,
convert_to_masked_nullable=self.use_nullable_dtypes,
)
if mask is not None:
result = BooleanArray(result, mask)
# Maybe convert to StringArray; there must be at least one NA value
# to trigger this (the option is use_nullable_dtypes, after all).
# An all-NA column becomes Float64, not StringArray.
if (
result.dtype == np.object_
and self.use_nullable_dtypes
and 0 < na_count < len(result)
):
try:
result = StringArray(result)
except ValueError:
pass

return result, na_count
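The boolean path above can be sketched with a simplified stand-in for `libops.maybe_convert_bool` with `convert_to_masked_nullable=True` (the helper name is illustrative): recognized tokens map to `True`/`False`, everything else is masked as NA, and the result is wrapped in a `BooleanArray`.

```python
import numpy as np
import pandas as pd

def convert_bool_sketch(values, true_values=("True",), false_values=("False",)):
    # Illustrative stand-in for maybe_convert_bool with masked-nullable
    # output: unrecognized tokens become NA instead of failing conversion.
    result = np.zeros(len(values), dtype=bool)
    mask = np.zeros(len(values), dtype=bool)
    for i, val in enumerate(values):
        if val in true_values:
            result[i] = True
        elif val in false_values:
            result[i] = False
        else:
            mask[i] = True
    return pd.arrays.BooleanArray(result, mask)
```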

10 changes: 9 additions & 1 deletion pandas/io/parsers/readers.py
@@ -352,6 +352,13 @@

.. versionchanged:: 1.2

use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as the missing value indicator for the
resulting DataFrame. Currently supports reading data into the nullable
integer, floating, boolean, and string array types.

.. versionadded:: 1.3
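Until this flag is available, a similar effect can already be achieved in released pandas with an explicit ``dtype`` mapping (a sketch; the column names and CSV contents are illustrative):

```python
import io
import pandas as pd

# An explicit dtype mapping reads a column directly into a nullable dtype,
# keeping integers as integers even when values are missing.
csv = "a,b\n1,x\n,y\n3,z"
df = pd.read_csv(io.StringIO(csv), dtype={"a": "Int64"})
```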

{storage_options}

.. versionadded:: 1.2
@@ -538,6 +545,7 @@ def read_csv(
low_memory=_c_parser_defaults["low_memory"],
memory_map=False,
float_precision=None,
use_nullable_dtypes=False,
storage_options: StorageOptions = None,
):
kwds = locals()
@@ -619,6 +627,7 @@ def read_table(
low_memory=_c_parser_defaults["low_memory"],
memory_map=False,
float_precision=None,
use_nullable_dtypes=False,
):
kwds = locals()
del kwds["filepath_or_buffer"]
@@ -827,7 +836,6 @@ def _clean_options(self, options, engine):

sep = options["delimiter"]
delim_whitespace = options["delim_whitespace"]

if sep is None and not delim_whitespace:
if engine == "c":
fallback_reason = (