pandas-dev · mroeschke · Sep 19, 2022 · Sep 5, 2022 · Sep 5, 2022 · Sep 5, 2022
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -15,6 +15,13 @@ import warnings
 
 from pandas.util._exceptions import find_stack_level
 
+from pandas import StringDtype
+from pandas.core.arrays import (
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+)
+
 cimport cython
 from cpython.bytes cimport (
     PyBytes_AsString,
@@ -1378,18 +1385,53 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
-def _maybe_upcast(arr):
-    """
+def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
+    """
+    Convert to a nullable array, or upcast so missing values can be stored.
 
+    For integer and boolean arrays, entries equal to the sentinel registered
+    for the array's dtype in ``na_values`` are treated as missing. For
+    floating arrays, NaN entries are treated as missing.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Numpy array that is potentially being upcast.
+    use_nullable_dtypes : bool, default False
+        If True, wrap the data in the matching masked or string extension
+        array. If False, only integer and boolean arrays are changed: they
+        are cast to float and object respectively and the missing entries
+        are overwritten with NaN.
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        With ``use_nullable_dtypes=True``: ``IntegerArray`` for integer
+        input, ``BooleanArray`` for boolean input, ``FloatingArray`` for
+        float32/float64 input and the array type of ``StringDtype`` for
+        object input. Otherwise a float ndarray for integer input and an
+        object ndarray for boolean input. Any other input is returned
+        unchanged.
     """
+    # The sentinel is looked up only inside the branches that compare against
+    # it, so a dtype without an ``na_values`` entry is passed through instead
+    # of raising ``KeyError``.
+
     if issubclass(arr.dtype.type, np.integer):
-        na_value = na_values[arr.dtype]
-        arr = arr.astype(float)
-        np.putmask(arr, arr == na_value, np.nan)
+        mask = arr == na_values[arr.dtype]
+
+        if use_nullable_dtypes:
+            arr = IntegerArray(arr, mask)
+        else:
+            arr = arr.astype(float)
+            np.putmask(arr, mask, np.nan)
+
     elif arr.dtype == np.bool_:
-        mask = arr.view(np.uint8) == na_values[np.uint8]
-        arr = arr.astype(object)
-        np.putmask(arr, mask, np.nan)
+        # Compare on the uint8 view: the sentinel is a byte value, not a bool.
+        mask = arr.view(np.uint8) == na_values[arr.dtype]
+
+        if use_nullable_dtypes:
+            arr = BooleanArray(arr, mask)
+        else:
+            arr = arr.astype(object)
+            np.putmask(arr, mask, np.nan)
+
+    elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
+        # The builtin ``float`` is a base class of ``np.float64`` only, so
+        # float32 has to be matched explicitly.
+        if use_nullable_dtypes:
+            mask = np.isnan(arr)
+            arr = FloatingArray(arr, mask)
+
+    elif arr.dtype == np.object_:
+        if use_nullable_dtypes:
+            arr = StringDtype().construct_array_type()._from_sequence(arr)
 
     return arr
 

diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
@@ -0,0 +1,99 @@
+import numpy as np
+import pytest
+
+from pandas._libs.parsers import (  # type: ignore[attr-defined]
+    _maybe_upcast,
+    na_values,
+)
+
+from pandas import NA
+import pandas._testing as tm
+from pandas.core.arrays import (
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+    StringArray,
+)
+
+
+def test_maybe_upcast(any_real_numpy_dtype):
+    """
+    A trailing NA sentinel comes back masked in the nullable result.
+    """
+    # GH#36712
+    if any_real_numpy_dtype == "float32":
+        # na values not defined for float32
+        pytest.skip()
+
+    np_dtype = np.dtype(any_real_numpy_dtype)
+    values = np.array([1, 2, na_values[np_dtype]], dtype=np_dtype)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    # Integer dtypes are compared against IntegerArray, every other dtype
+    # supplied by the fixture against FloatingArray.
+    is_integer = issubclass(np_dtype.type, np.integer)
+    array_cls = IntegerArray if is_integer else FloatingArray
+    expected = array_cls(values, mask=np.array([False, False, True]))
+
+    tm.assert_extension_array_equal(upcast, expected)
+
+
+def test_maybe_upcast_no_na(any_real_numpy_dtype):
+    """
+    Input without any NA sentinel yields a nullable array with an empty mask.
+    """
+    # GH#36712
+    if any_real_numpy_dtype == "float32":
+        pytest.skip()
+
+    values = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    nothing_masked = np.zeros(3, dtype=bool)
+    if issubclass(values.dtype.type, np.integer):
+        array_cls = IntegerArray
+    else:
+        array_cls = FloatingArray
+    expected = array_cls(values, mask=nothing_masked)
+
+    tm.assert_extension_array_equal(upcast, expected)
+
+
+def test_maybe_upcaste_bool():
+    """
+    A boolean array holding the NA sentinel becomes a masked BooleanArray.
+    """
+    # GH#36712
+    # The sentinel is written into a uint8 buffer, which is then
+    # reinterpreted as bool without copying.
+    raw = np.array([True, False, na_values[np.bool_]], dtype="uint8")
+    values = raw.view(np.bool_)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    expected = BooleanArray(values, mask=np.array([False, False, True]))
+    tm.assert_extension_array_equal(upcast, expected)
+
+
+def test_maybe_upcaste_bool_no_nan():
+    """
+    A boolean array without the NA sentinel becomes an unmasked BooleanArray.
+    """
+    # GH#36712
+    raw = np.array([True, False, False], dtype="uint8")
+    values = raw.view(np.bool_)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    nothing_masked = np.zeros(3, dtype=bool)
+    expected = BooleanArray(values, mask=nothing_masked)
+    tm.assert_extension_array_equal(upcast, expected)
+
+
+def test_maybe_upcaste_all_nan():
+    """
+    An int64 array made only of NA sentinels comes back fully masked.
+    """
+    # GH#36712
+    sentinel = na_values[np.int64]
+    values = np.full(2, sentinel, dtype=np.int64)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    everything_masked = np.ones(2, dtype=bool)
+    expected = IntegerArray(values, mask=everything_masked)
+    tm.assert_extension_array_equal(upcast, expected)
+
+
+@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
+def test_maybe_upcast_object(val):
+    """
+    Object input becomes a StringArray; the object NA sentinel becomes NA.
+    """
+    # GH#36712
+    values = np.array(["a", "b", val], dtype=np.object_)
+    upcast = _maybe_upcast(values, use_nullable_dtypes=True)
+
+    # Only the literal "c" is expected to survive as a string.
+    if val == "c":
+        last = "c"
+    else:
+        last = NA
+    expected_values = np.array(["a", "b", last], dtype=np.object_)
+    expected = StringArray(expected_values)
+    tm.assert_extension_array_equal(upcast, expected)