Skip to content

ENH: Add use_nullable_dtypes to to_numeric #50505

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
* :func:`read_sql_query`
* :func:`read_sql_table`
* :func:`read_orc`
* :func:`to_numeric`

Additionally, a new global configuration, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
to select the nullable dtypes implementation.
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2272,6 +2272,7 @@ def maybe_convert_numeric(
if convert_empty or seen.coerce_numeric:
seen.saw_null()
floats[i] = complexes[i] = NaN
mask[i] = 1
else:
raise ValueError("Empty string encountered")
elif util.is_complex_object(val):
Expand Down Expand Up @@ -2328,6 +2329,7 @@ def maybe_convert_numeric(

seen.saw_null()
floats[i] = NaN
mask[i] = 1

if seen.check_uint64_conflict():
return (values, None)
Expand Down
49 changes: 40 additions & 9 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
from pandas.core.dtypes.cast import maybe_downcast_numeric
from pandas.core.dtypes.common import (
ensure_object,
is_bool_dtype,
is_datetime_or_timedelta_dtype,
is_decimal,
is_integer_dtype,
is_number,
is_numeric_dtype,
is_object_dtype,
is_scalar,
needs_i8_conversion,
)
Expand All @@ -27,13 +29,14 @@
)

import pandas as pd
from pandas.core.arrays.numeric import NumericArray
from pandas.core.arrays import BaseMaskedArray


def to_numeric(
arg,
errors: DateTimeErrorChoices = "raise",
downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
use_nullable_dtypes: bool = False,
):
"""
Convert argument to a numeric type.
Expand All @@ -47,7 +50,7 @@ def to_numeric(
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
passed in, it is very likely they will be converted to float so that
they can stored in an `ndarray`. These warnings apply similarly to
they can be stored in an `ndarray`. These warnings apply similarly to
`Series` since it internally leverages `ndarray`.

Parameters
Expand Down Expand Up @@ -78,6 +81,10 @@ def to_numeric(
the dtype it is to be cast to, so if none of the dtypes
checked satisfy that specification, no downcasting will be
performed on the data.
use_nullable_dtypes : bool, default False
Whether or not to use nullable dtypes as default when converting data. If
set to True, nullable dtypes are used for all dtypes that have a nullable
implementation, even if no nulls are present.

Returns
-------
Expand Down Expand Up @@ -178,11 +185,12 @@ def to_numeric(
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
# save mask to reconstruct the full array after casting
mask: npt.NDArray[np.bool_] | None = None
if isinstance(values, NumericArray):
if isinstance(values, BaseMaskedArray):
mask = values._mask
values = values._data[~mask]

values_dtype = getattr(values, "dtype", None)
new_mask: np.ndarray | None = None
if is_numeric_dtype(values_dtype):
pass
elif is_datetime_or_timedelta_dtype(values_dtype):
Expand All @@ -191,13 +199,23 @@ def to_numeric(
values = ensure_object(values)
coerce_numeric = errors not in ("ignore", "raise")
try:
values, _ = lib.maybe_convert_numeric(
values, set(), coerce_numeric=coerce_numeric
values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload]
values,
set(),
coerce_numeric=coerce_numeric,
convert_to_masked_nullable=use_nullable_dtypes,
)
except (ValueError, TypeError):
if errors == "raise":
raise

if new_mask is not None:
# Drop the masked (null) entries: the masked-array reconstruction below
# expects only the valid values, and this also enables downcasting
values = values[~new_mask]
elif use_nullable_dtypes and new_mask is None:
new_mask = np.zeros(values.shape, dtype=np.bool_)

# attempt downcast only if the data has been successfully converted
# to a numerical dtype and if a downcast method has been specified
if downcast is not None and is_numeric_dtype(values.dtype):
Expand Down Expand Up @@ -228,18 +246,31 @@ def to_numeric(
if values.dtype == dtype:
break

# GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
if mask is not None:
# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
# masked array
if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
if mask is None:
mask = new_mask
Comment on lines +252 to +253
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if mask is None:
mask = new_mask
if mask is None:
mask = new_mask
else:
mask = mask.copy()

and then remove the `.copy()` below when constructing the array (right now the mask is always copied, but the newly generated mask presumably doesn't need that)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

else:
mask = mask.copy()
assert isinstance(mask, np.ndarray)
data = np.zeros(mask.shape, dtype=values.dtype)
data[~mask] = values

from pandas.core.arrays import (
BooleanArray,
FloatingArray,
IntegerArray,
)

klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
values = klass(data, mask.copy())
klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
if is_integer_dtype(data.dtype):
klass = IntegerArray
elif is_bool_dtype(data.dtype):
klass = BooleanArray
else:
klass = FloatingArray
values = klass(data, mask)

if is_series:
return arg._constructor(values, index=arg.index, name=arg.name)
Expand Down
69 changes: 69 additions & 0 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,3 +807,72 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val):
expected = Series([val])
result = to_numeric(expected, downcast="float")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
)
def test_to_numeric_use_nullable_dtypes(val, dtype):
    # GH#50505
    # object-dtype input without any nulls is expected to come back with the
    # parametrized nullable dtype
    obj_ser = Series([val], dtype=object)
    tm.assert_series_equal(
        to_numeric(obj_ser, use_nullable_dtypes=True),
        Series([val], dtype=dtype),
    )


@pytest.mark.parametrize(
    "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
)
def test_to_numeric_use_nullable_dtypes_na(val, dtype):
    # GH#50505
    # a None in the object-dtype input is expected to surface as pd.NA in the
    # nullable result
    obj_ser = Series([val, None], dtype=object)
    tm.assert_series_equal(
        to_numeric(obj_ser, use_nullable_dtypes=True),
        Series([val, pd.NA], dtype=dtype),
    )


@pytest.mark.parametrize(
    "val, dtype, downcast",
    [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")],
)
def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast):
    # GH#50505
    # downcasting is combined with the nullable conversion: the expected
    # result carries the smaller nullable dtype and keeps the null as pd.NA
    obj_ser = Series([val, None], dtype=object)
    downcasted = to_numeric(obj_ser, use_nullable_dtypes=True, downcast=downcast)
    tm.assert_series_equal(downcasted, Series([val, pd.NA], dtype=dtype))


def test_to_numeric_use_nullable_dtypes_downcasting_uint():
    # GH#50505
    # already-nullable unsigned input: downcast="unsigned" is expected to
    # shrink UInt64 to UInt8 while keeping the missing value
    uint_ser = Series([1, pd.NA], dtype="UInt64")
    downcasted = to_numeric(uint_ser, use_nullable_dtypes=True, downcast="unsigned")
    tm.assert_series_equal(downcasted, Series([1, pd.NA], dtype="UInt8"))


@pytest.mark.parametrize("dtype", ["Int64", "UInt64", "Float64", "boolean"])
def test_to_numeric_use_nullable_dtypes_already_nullable(dtype):
    # GH#50505
    # input that already has a nullable dtype is expected to round-trip with
    # the same dtype and the same missing value
    nullable_ser = Series([1, pd.NA], dtype=dtype)
    tm.assert_series_equal(
        to_numeric(nullable_ser, use_nullable_dtypes=True),
        Series([1, pd.NA], dtype=dtype),
    )


@pytest.mark.parametrize(
    "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")]
)
def test_to_numeric_use_nullable_dtypes_error(use_nullable_dtypes, dtype):
    # GH#50505
    unparsable = Series(["a", "b", ""])
    untouched = unparsable.copy()

    # default error handling: the parsing failure is raised
    with pytest.raises(ValueError, match="Unable to parse string"):
        to_numeric(unparsable, use_nullable_dtypes=use_nullable_dtypes)

    # errors="ignore": the result is expected to equal the original input
    ignored = to_numeric(
        unparsable, use_nullable_dtypes=use_nullable_dtypes, errors="ignore"
    )
    tm.assert_series_equal(ignored, untouched)

    # errors="coerce": every entry is expected to become missing, in the
    # float dtype selected by the parametrization
    coerced = to_numeric(
        unparsable, use_nullable_dtypes=use_nullable_dtypes, errors="coerce"
    )
    all_missing = Series([np.nan, np.nan, np.nan], dtype=dtype)
    tm.assert_series_equal(coerced, all_missing)