pandas-dev · lithomas1 · Mar 29, 2021 · Mar 29, 2021 · Mar 30, 2021 · Mar 30, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -169,6 +169,7 @@ Other enhancements
 - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
 - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
 - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
+- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2006,7 +2006,8 @@ def maybe_convert_numeric(
     set na_values,
     bint convert_empty=True,
     bint coerce_numeric=False,
-) -> ndarray:
+    bint convert_to_nullable_integer=False,
+) -> "ArrayLike":
     """
     Convert object array to a numeric array if possible.
 
@@ -2030,10 +2031,13 @@ def maybe_convert_numeric(
         numeric array has no suitable numerical dtype to return (i.e. uint64,
         int32, uint8). If set to False, the original object array will be
         returned. Otherwise, a ValueError will be raised.
+    convert_to_nullable_integer : bool, default False
+        If an array-like object containing only integer values (and NaN) is
+        encountered, whether to convert and return an IntegerArray.
 
     Returns
     -------
-    np.ndarray
+    np.ndarray or ExtensionArray
         Array of converted object values to numerical ones.
     """
     if len(values) == 0:
@@ -2061,21 +2065,34 @@ def maybe_convert_numeric(
         ndarray[int64_t] ints = np.empty(n, dtype='i8')
         ndarray[uint64_t] uints = np.empty(n, dtype='u8')
         ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+        ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
         float64_t fval
 
     for i in range(n):
         val = values[i]
+        # We only want to disable NaNs showing as float if
+        # a) convert_to_nullable_integer = True
+        # b) no floats have been seen ( assuming an int shows up later )
+        # However, if no ints present (all null array), we need to return floats
+        allow_nullable_dtypes = convert_to_nullable_integer and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
-            seen.saw_null()
+            if allow_nullable_dtypes:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
+                mask[i] = 1
                 seen.null_ = True
-
+                if not allow_nullable_dtypes:
+                    seen.float_ = True
+            else:
+                seen.float_ = True
             floats[i] = complexes[i] = fval
-            seen.float_ = True
         elif util.is_integer_object(val):
             floats[i] = complexes[i] = val
 
@@ -2098,7 +2115,11 @@ def maybe_convert_numeric(
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
         elif val is None or val is C_NA:
-            seen.saw_null()
+            if allow_nullable_dtypes:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif hasattr(val, '__len__') and len(val) == 0:
             if convert_empty or seen.coerce_numeric:
@@ -2122,14 +2143,18 @@ def maybe_convert_numeric(
                 else:
                     if fval != fval:
                         seen.null_ = True
+                        mask[i] = 1
 
                     floats[i] = fval
 
                 if maybe_int:
                     as_int = int(val)
 
                     if as_int in na_values:
-                        seen.saw_null()
+                        mask[i] = 1
+                        seen.null_ = True
+                        if not convert_to_nullable_integer:
+                            seen.float_ = True
                     else:
                         seen.saw_int(as_int)
 
@@ -2159,11 +2184,22 @@ def maybe_convert_numeric(
     if seen.check_uint64_conflict():
         return values
 
+    # Nulls were not flagged as float above in anticipation of seeing ints.
+    # If no ints were ever seen (e.g. an all-null array), return floats instead.
+    if convert_to_nullable_integer and seen.null_ and not seen.int_:
+        seen.float_ = True
+
     if seen.complex_:
         return complexes
     elif seen.float_:
         return floats
     elif seen.int_:
+        if seen.null_ and convert_to_nullable_integer:
+            from pandas.core.arrays import IntegerArray
+            if seen.uint_:
+                return IntegerArray(uints, mask.view(np.bool_))
+            else:
+                return IntegerArray(ints, mask.view(np.bool_))
         if seen.uint_:
             return uints
         else:

diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
@@ -39,5 +39,6 @@ def vec_binop(
 def maybe_convert_bool(
     arr: np.ndarray,  # np.ndarray[object]
     true_values=...,
-    false_values=...
+    false_values=...,
+    convert_to_nullable_boolean: bool = False,  # mirrors the ops.pyx default
 ) -> np.ndarray: ...
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
@@ -24,10 +24,7 @@ import_array()
 
 
 from pandas._libs.missing cimport checknull
-from pandas._libs.util cimport (
-    UINT8_MAX,
-    is_nan,
-)
+from pandas._libs.util cimport is_nan
 
 
 @cython.wraparound(False)
@@ -258,17 +255,20 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
 
 
 def maybe_convert_bool(ndarray[object] arr,
-                       true_values=None, false_values=None) -> ndarray:
+                       true_values=None,
+                       false_values=None,
+                       convert_to_nullable_boolean=False) -> "ArrayLike":
     cdef:
         Py_ssize_t i, n
         ndarray[uint8_t] result
+        ndarray[uint8_t] mask
         object val
         set true_vals, false_vals
-        int na_count = 0
+        bint has_na = False
 
     n = len(arr)
     result = np.empty(n, dtype=np.uint8)
-
+    mask = np.zeros(n, dtype=np.uint8)
     # the defaults
     true_vals = {'True', 'TRUE', 'true'}
     false_vals = {'False', 'FALSE', 'false'}
@@ -292,15 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
         elif val in false_vals:
             result[i] = 0
         elif isinstance(val, float):
-            result[i] = UINT8_MAX
-            na_count += 1
+            mask[i] = 1
+            result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
+            has_na = True
         else:
             return arr
 
-    if na_count > 0:
-        mask = result == UINT8_MAX
-        arr = result.view(np.bool_).astype(object)
-        np.putmask(arr, mask, np.nan)
-        return arr
+    if has_na:
+        if convert_to_nullable_boolean:
+            from pandas.core.arrays import BooleanArray
+            return BooleanArray(result.view(np.bool_), mask.view(np.bool_))
+        else:
+            arr = result.view(np.bool_).astype(object)
+            np.putmask(arr, mask, np.nan)
+            return arr
     else:
         return result.view(np.bool_)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -112,6 +112,11 @@ from pandas.core.dtypes.common import (
 )
 from pandas.core.dtypes.concat import union_categoricals
 
+from pandas.core.arrays import (
+    BooleanArray,
+    IntegerArray,
+)
+
 cdef:
     float64_t INF = <float64_t>np.inf
     float64_t NEGINF = -INF
@@ -319,6 +324,7 @@ cdef class TextReader:
         int64_t leading_cols, table_width, skipfooter, buffer_lines
         bint allow_leading_cols, mangle_dupe_cols, low_memory
         bint delim_whitespace
+        bint use_nullable_dtypes
         object delimiter, converters
         object na_values
         object header, orig_header, names, header_start, header_end
@@ -367,6 +373,7 @@ cdef class TextReader:
                   bint mangle_dupe_cols=True,
                   float_precision=None,
                   bint skip_blank_lines=True,
+                  bint use_nullable_dtypes=False,
                   encoding_errors=b"strict"):
 
         # set encoding for native Python and C library
@@ -430,6 +437,7 @@ cdef class TextReader:
             # consistent with csv module semantics, cast all to float
             dtype_order = dtype_order[1:]
         self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
+        self.use_nullable_dtypes = use_nullable_dtypes
 
         if comment is not None:
             if len(comment) > 1:
@@ -1009,7 +1017,8 @@ cdef class TextReader:
             # don't try to upcast EAs
             try_upcast = upcast_na and na_count > 0
             if try_upcast and not is_extension_array_dtype(col_dtype):
-                col_res = _maybe_upcast(col_res)
+                col_res = _maybe_upcast(col_res,
+                                        use_nullable_dtypes=self.use_nullable_dtypes)
 
             if col_res is None:
                 raise ParserError(f'Unable to parse column {i}')
@@ -1299,18 +1308,45 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
-def _maybe_upcast(arr):
+def _maybe_upcast(arr, use_nullable_dtypes=False):
     """
+    Upcast integer and boolean arrays so that null values can be represented.
+    A boolean arr is upcast to object dtype, or to BooleanArray, and an
+    integer arr is upcast to float dtype, or to IntegerArray.
+
+    Note: If all values are null, the nullable path is skipped even if
+    use_nullable_dtypes is True (integer -> float, boolean -> object).
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array to upcast.
+    use_nullable_dtypes : bool, default False
+        Whether to use the nullable integer/boolean (IntegerArray/BooleanArray)
+        datatypes instead of upcasting.
 
     """
     if issubclass(arr.dtype.type, np.integer):
         na_value = na_values[arr.dtype]
-        arr = arr.astype(float)
-        np.putmask(arr, arr == na_value, np.nan)
+        mask = arr == na_value
+        if mask.all():
+            # All values are NA: fall back to the float upcast below
+            use_nullable_dtypes = False
+        if use_nullable_dtypes:
+            arr = IntegerArray(arr, mask)
+        else:
+            arr = arr.astype(float)
+            np.putmask(arr, mask, np.nan)
     elif arr.dtype == np.bool_:
         mask = arr.view(np.uint8) == na_values[np.uint8]
-        arr = arr.astype(object)
-        np.putmask(arr, mask, np.nan)
+        if mask.all():
+            # All values are NA: fall back to the object upcast below
+            use_nullable_dtypes = False
+        if use_nullable_dtypes:
+            arr = BooleanArray(arr, mask)
+        else:
+            arr = arr.astype(object)
+            np.putmask(arr, mask, np.nan)
 
     return arr
 

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -54,6 +54,10 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import algorithms
+from pandas.core.api import (
+    NA,
+    array as pd_array,
+)
 from pandas.core.arrays import Categorical
 from pandas.core.indexes.api import (
     Index,
@@ -109,6 +113,7 @@
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
+    "use_nullable_dtypes": False,
     "encoding_errors": "strict",
 }
 
@@ -200,6 +205,7 @@ def __init__(self, kwds):
 
         self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
 
+        self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False)
         self.handles: Optional[IOHandles] = None
 
     def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -554,10 +560,7 @@ def _convert_to_ndarrays(
                 )
 
                 # type specified in dtype param or cast_type is an EA
-                if cast_type and (
-                    not is_dtype_equal(cvals, cast_type)
-                    or is_extension_array_dtype(cast_type)
-                ):
+                if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
                     if not is_ea and na_count > 0:
                         try:
                             if is_bool_dtype(cast_type):
@@ -651,12 +654,12 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         ----------
         values : ndarray
         na_values : set
-        try_num_bool : bool, default try
+        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean
 
         Returns
         -------
-        converted : ndarray
+        converted : ndarray or ExtensionArray
         na_count : int
         """
         na_count = 0
@@ -669,14 +672,24 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             na_count = mask.sum()  # type: ignore[assignment]
             if na_count > 0:
                 if is_integer_dtype(values):
-                    values = values.astype(np.float64)
+                    if self.use_nullable_dtypes:
+                        values = pd_array(values, dtype="Int64")
+                        values[mask] = NA  # <- This is pd.NA
+                        return values, na_count
+                    else:
+                        values = values.astype(np.float64)
                 np.putmask(values, mask, np.nan)
             return values, na_count
 
         if try_num_bool and is_object_dtype(values.dtype):
             # exclude e.g DatetimeIndex here
             try:
-                result = lib.maybe_convert_numeric(values, na_values, False)
+                result = lib.maybe_convert_numeric(
+                    values,
+                    na_values,
+                    convert_empty=False,
+                    convert_to_nullable_integer=self.use_nullable_dtypes,
+                )
             except (ValueError, TypeError):
                 # e.g. encountering datetime string gets ValueError
                 #  TypeError can be raised in floatify
@@ -694,6 +707,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
                 np.asarray(values),
                 true_values=self.true_values,
                 false_values=self.false_values,
+                convert_to_nullable_boolean=self.use_nullable_dtypes,
             )
 
         return result, na_count