diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index e8b7160af9b2c..07bf7f69ec907 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -15,6 +15,13 @@ import warnings
 
 from pandas.util._exceptions import find_stack_level
 
+from pandas import StringDtype
+from pandas.core.arrays import (
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+)
+
 cimport cython
 from cpython.bytes cimport (
     PyBytes_AsString,
@@ -1378,18 +1385,55 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
-def _maybe_upcast(arr):
-    """
+def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
+    """Cast to a nullable array, or upcast so that missing values fit.
 
+    With use_nullable_dtypes=False, integer arrays are cast to float and
+    boolean arrays to object, and entries equal to the dtype's NA sentinel
+    are set to NaN; arrays of any other dtype are returned unchanged.
+    With use_nullable_dtypes=True, integer, boolean, float32/float64 and
+    object arrays are instead wrapped in the matching nullable array.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Numpy array that is potentially being upcast.
+    use_nullable_dtypes : bool, default False
+        If True, cast to the associated nullable dtype.
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        The cast array, or ``arr`` itself when no conversion applies.
     """
     if issubclass(arr.dtype.type, np.integer):
-        na_value = na_values[arr.dtype]
-        arr = arr.astype(float)
-        np.putmask(arr, arr == na_value, np.nan)
+        # Look the sentinel up only in the branches that use it: a dtype with
+        # no na_values entry must fall through unchanged, not raise KeyError.
+        mask = arr == na_values[arr.dtype]
+
+        if use_nullable_dtypes:
+            arr = IntegerArray(arr, mask)
+        else:
+            arr = arr.astype(float)
+            np.putmask(arr, mask, np.nan)
+
     elif arr.dtype == np.bool_:
-        mask = arr.view(np.uint8) == na_values[np.uint8]
-        arr = arr.astype(object)
-        np.putmask(arr, mask, np.nan)
+        mask = arr.view(np.uint8) == na_values[arr.dtype]
+
+        if use_nullable_dtypes:
+            arr = BooleanArray(arr, mask)
+        else:
+            arr = arr.astype(object)
+            np.putmask(arr, mask, np.nan)
+
+    elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
+        if use_nullable_dtypes:
+            mask = np.isnan(arr)
+            arr = FloatingArray(arr, mask)
+
+    elif arr.dtype == np.object_:
+        if use_nullable_dtypes:
+            arr = StringDtype().construct_array_type()._from_sequence(arr)
 
     return arr
 
@@ -1985,6 +2029,7 @@ def _compute_na_values():
     uint16info = np.iinfo(np.uint16)
     uint8info = np.iinfo(np.uint8)
     na_values = {
+        np.float32: np.nan,
         np.float64: np.nan,
         np.int64: int64info.min,
         np.int32: int32info.min,
diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py
new file mode 100644
index 0000000000000..428050ac01b58
--- /dev/null
+++ b/pandas/tests/io/parser/test_upcast.py
@@ -0,0 +1,117 @@
+import numpy as np
+import pytest
+
+from pandas._libs.parsers import (  # type: ignore[attr-defined]
+    _maybe_upcast,
+    na_values,
+)
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import NA
+import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+    StringArray,
+)
+
+
+def test_maybe_upcast(any_real_numpy_dtype):
+    # GH#36712
+
+    dtype = np.dtype(any_real_numpy_dtype)
+    na_value = na_values[dtype]
+    arr = np.array([1, 2, na_value], dtype=dtype)
+    result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+    expected_mask = np.array([False, False, True])
+    if issubclass(dtype.type, np.integer):
+        expected = IntegerArray(arr, mask=expected_mask)
+    else:
+        expected = FloatingArray(arr, mask=expected_mask)
+
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_maybe_upcast_no_na(any_real_numpy_dtype):
+    # GH#36712
+    if any_real_numpy_dtype == "float32":
+        pytest.skip()
+
+    arr = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
+    result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+    expected_mask = np.array([False, False, False])
+    if issubclass(np.dtype(any_real_numpy_dtype).type, np.integer):
+        expected = IntegerArray(arr, mask=expected_mask)
+    else:
+        expected = FloatingArray(arr, mask=expected_mask)
+
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_maybe_upcaste_bool():
+    # GH#36712
+    dtype = np.bool_
+    na_value = na_values[dtype]
+    arr = np.array([True, False, na_value], dtype="uint8").view(dtype)
+    result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+    expected_mask = np.array([False, False, True])
+    expected = BooleanArray(arr, mask=expected_mask)
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_maybe_upcaste_bool_no_nan():
+    # GH#36712
+    dtype = np.bool_
+    arr = np.array([True, False, False], dtype="uint8").view(dtype)
+    result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+    expected_mask = np.array([False, False, False])
+    expected = BooleanArray(arr, mask=expected_mask)
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_maybe_upcaste_all_nan():
+    # GH#36712
+    dtype = np.int64
+    na_value = na_values[dtype]
+    arr = np.array([na_value, na_value], dtype=dtype)
+    result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+    expected_mask = np.array([True, True])
+    expected = IntegerArray(arr, mask=expected_mask)
+    tm.assert_extension_array_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+@pytest.mark.parametrize("storage", ["pyarrow", "python"])
+@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
+def test_maybe_upcast_object(val, storage):
+    # GH#36712
+    import pyarrow as pa
+
+    with pd.option_context("mode.string_storage", storage):
+        arr = np.array(["a", "b", val], dtype=np.object_)
+        result = _maybe_upcast(arr, use_nullable_dtypes=True)
+
+        if storage == "python":
+            exp_val = "c" if val == "c" else NA
+            expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
+        else:
+            exp_val = "c" if val == "c" else None
+            expected = ArrowStringArray(pa.array(["a", "b", exp_val]))
+        tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("use_nullable_dtypes", [True, False])
+def test_maybe_upcast_unhandled_dtype(use_nullable_dtypes):
+    # float16 matches no branch of _maybe_upcast and must be returned as-is
+    # instead of raising on the NA sentinel lookup
+    arr = np.array([1.0, np.nan], dtype="float16")
+    result = _maybe_upcast(arr, use_nullable_dtypes=use_nullable_dtypes)
+    assert result is arr