Skip to content

ENH: Add use_nullable_dtypes in csv internals #48403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
58 changes: 50 additions & 8 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ import warnings

from pandas.util._exceptions import find_stack_level

from pandas import StringDtype
from pandas.core.arrays import (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a viable way to do this outside of the cython code?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A possible option might be to change _maybe_upcast to optionally return a values array + mask array, and then do the actual construction in python?

Although, since _maybe_upcast is only called in this file (and thus from Cython), that won't help, unless we propagate such a (values, mask) tuple (instead of an ArrayLike) through the different calls and return it from the TextReader.read method.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's not worth contorting ourselves to avoid non-Cython imports. I was only asking in case there's a convenient alternative.

Copy link
Member Author

@phofl phofl Sep 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be done, yes, but it would make the logic more complex, which I think is something we should avoid here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yah, never mind then. thanks for taking a look

BooleanArray,
FloatingArray,
IntegerArray,
)

cimport cython
from cpython.bytes cimport (
PyBytes_AsString,
Expand Down Expand Up @@ -1378,18 +1385,53 @@ STR_NA_VALUES = {
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
    """Sets nullable dtypes or upcasts so that NA values can be represented.

    If ``use_nullable_dtypes`` is false, integer arrays are cast to float and
    boolean arrays to object, and every entry equal to the dtype's NA
    sentinel (looked up in ``na_values``) is replaced by ``np.nan``; float
    and object arrays are returned as they are. If the flag is true, the
    array is wrapped in the associated nullable array instead, with the
    sentinel (or NaN, for floats) positions used as the mask.

    Arrays of any other dtype are returned unchanged.

    Parameters
    ----------
    arr : ndarray
        Numpy array that is potentially being upcast.
    use_nullable_dtypes : bool, default False
        If true, we cast to the associated nullable dtypes.

    Returns
    -------
    ndarray or ExtensionArray
        The casted array.
    """
    if issubclass(arr.dtype.type, np.integer):
        # The NA sentinel is looked up only in the branches that need it, so
        # dtypes without an entry in ``na_values`` (e.g. float32) do not
        # raise a KeyError on the float/object paths below.
        mask = arr == na_values[arr.dtype]

        if use_nullable_dtypes:
            arr = IntegerArray(arr, mask)
        else:
            arr = arr.astype(float)
            np.putmask(arr, mask, np.nan)

    elif arr.dtype == np.bool_:
        # Compare the raw bytes: the sentinel is not a valid boolean value.
        mask = arr.view(np.uint8) == na_values[arr.dtype]

        if use_nullable_dtypes:
            arr = BooleanArray(arr, mask)
        else:
            arr = arr.astype(object)
            np.putmask(arr, mask, np.nan)

    elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
        # np.float64 subclasses the builtin float, np.float32 does not, so it
        # has to be checked explicitly.
        if use_nullable_dtypes:
            mask = np.isnan(arr)
            arr = FloatingArray(arr, mask)

    elif arr.dtype == np.object_:
        if use_nullable_dtypes:
            arr = StringDtype().construct_array_type()._from_sequence(arr)

    return arr

Expand Down
99 changes: 99 additions & 0 deletions pandas/tests/io/parser/test_upcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import numpy as np
import pytest

from pandas._libs.parsers import ( # type: ignore[attr-defined]
_maybe_upcast,
na_values,
)

from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
StringArray,
)


def test_maybe_upcast(any_real_numpy_dtype):
    """A trailing NA sentinel ends up masked in the nullable result."""
    # GH#36712
    if any_real_numpy_dtype == "float32":
        # na values not defined for float32
        pytest.skip()

    np_dtype = np.dtype(any_real_numpy_dtype)
    sentinel = na_values[np_dtype]
    values = np.array([1, 2, sentinel], dtype=np_dtype)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    # Only the last position holds the sentinel, so only it is masked.
    mask = np.array([False, False, True])
    is_integer = issubclass(np_dtype.type, np.integer)
    array_cls = IntegerArray if is_integer else FloatingArray
    expected = array_cls(values, mask=mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcast_no_na(any_real_numpy_dtype):
    """Without any NA sentinel the nullable result has an all-False mask."""
    # GH#36712
    if any_real_numpy_dtype == "float32":
        pytest.skip()

    values = np.array([1, 2, 3], dtype=any_real_numpy_dtype)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    mask = np.array([False, False, False])
    scalar_type = np.dtype(any_real_numpy_dtype).type
    if issubclass(scalar_type, np.integer):
        array_cls = IntegerArray
    else:
        array_cls = FloatingArray
    expected = array_cls(values, mask=mask)

    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_bool():
    """A bool array carrying the NA sentinel byte becomes a masked BooleanArray."""
    # GH#36712
    bool_dtype = np.bool_
    sentinel = na_values[bool_dtype]
    # Build the data as uint8 so the sentinel byte survives, then view as bool.
    raw = np.array([True, False, sentinel], dtype="uint8")
    values = raw.view(bool_dtype)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    expected = BooleanArray(values, mask=np.array([False, False, True]))
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_bool_no_nan():
    """A bool array without the NA sentinel yields an unmasked BooleanArray."""
    # GH#36712
    bool_dtype = np.bool_
    raw = np.array([True, False, False], dtype="uint8")
    values = raw.view(bool_dtype)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    expected = BooleanArray(values, mask=np.array([False, False, False]))
    tm.assert_extension_array_equal(result, expected)


def test_maybe_upcaste_all_nan():
    """An int64 array made only of NA sentinels is fully masked."""
    # GH#36712
    int_dtype = np.int64
    sentinel = na_values[int_dtype]
    values = np.array([sentinel, sentinel], dtype=int_dtype)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    expected = IntegerArray(values, mask=np.array([True, True]))
    tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val):
    """Object arrays become a StringArray; the object NA value maps to NA."""
    # GH#36712
    values = np.array(["a", "b", val], dtype=np.object_)
    result = _maybe_upcast(values, use_nullable_dtypes=True)

    if val == "c":
        last = "c"
    else:
        last = NA
    expected_values = np.array(["a", "b", last], dtype=np.object_)
    expected = StringArray(expected_values)
    tm.assert_extension_array_equal(result, expected)