Skip to content

Commit bad3774

Browse files
authored
ENH: Add use_nullable_dtypes to to_numeric (#50505)
1 parent 5115f09 commit bad3774

File tree

4 files changed

+112
-9
lines changed

4 files changed

+112
-9
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
4545
* :func:`read_sql_query`
4646
* :func:`read_sql_table`
4747
* :func:`read_orc`
48+
* :func:`to_numeric`
4849

4950
Additionally, a new global configuration, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
5051
to select the nullable dtypes implementation.

pandas/_libs/lib.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -2272,6 +2272,7 @@ def maybe_convert_numeric(
22722272
if convert_empty or seen.coerce_numeric:
22732273
seen.saw_null()
22742274
floats[i] = complexes[i] = NaN
2275+
mask[i] = 1
22752276
else:
22762277
raise ValueError("Empty string encountered")
22772278
elif util.is_complex_object(val):
@@ -2328,6 +2329,7 @@ def maybe_convert_numeric(
23282329

23292330
seen.saw_null()
23302331
floats[i] = NaN
2332+
mask[i] = 1
23312333

23322334
if seen.check_uint64_conflict():
23332335
return (values, None)

pandas/core/tools/numeric.py

+40-9
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@
1313
from pandas.core.dtypes.cast import maybe_downcast_numeric
1414
from pandas.core.dtypes.common import (
1515
ensure_object,
16+
is_bool_dtype,
1617
is_datetime_or_timedelta_dtype,
1718
is_decimal,
1819
is_integer_dtype,
1920
is_number,
2021
is_numeric_dtype,
22+
is_object_dtype,
2123
is_scalar,
2224
needs_i8_conversion,
2325
)
@@ -27,13 +29,14 @@
2729
)
2830

2931
import pandas as pd
30-
from pandas.core.arrays.numeric import NumericArray
32+
from pandas.core.arrays import BaseMaskedArray
3133

3234

3335
def to_numeric(
3436
arg,
3537
errors: DateTimeErrorChoices = "raise",
3638
downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
39+
use_nullable_dtypes: bool = False,
3740
):
3841
"""
3942
Convert argument to a numeric type.
@@ -47,7 +50,7 @@ def to_numeric(
4750
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
4851
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
4952
passed in, it is very likely they will be converted to float so that
50-
they can stored in an `ndarray`. These warnings apply similarly to
53+
they can be stored in an `ndarray`. These warnings apply similarly to
5154
`Series` since it internally leverages `ndarray`.
5255
5356
Parameters
@@ -78,6 +81,10 @@ def to_numeric(
7881
the dtype it is to be cast to, so if none of the dtypes
7982
checked satisfy that specification, no downcasting will be
8083
performed on the data.
84+
use_nullable_dtypes : bool = False
85+
Whether or not to use nullable dtypes as default when converting data. If
86+
set to True, nullable dtypes are used for all dtypes that have a nullable
87+
implementation, even if no nulls are present.
8188
8289
Returns
8390
-------
@@ -178,11 +185,12 @@ def to_numeric(
178185
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
179186
# save mask to reconstruct the full array after casting
180187
mask: npt.NDArray[np.bool_] | None = None
181-
if isinstance(values, NumericArray):
188+
if isinstance(values, BaseMaskedArray):
182189
mask = values._mask
183190
values = values._data[~mask]
184191

185192
values_dtype = getattr(values, "dtype", None)
193+
new_mask: np.ndarray | None = None
186194
if is_numeric_dtype(values_dtype):
187195
pass
188196
elif is_datetime_or_timedelta_dtype(values_dtype):
@@ -191,13 +199,23 @@ def to_numeric(
191199
values = ensure_object(values)
192200
coerce_numeric = errors not in ("ignore", "raise")
193201
try:
194-
values, _ = lib.maybe_convert_numeric(
195-
values, set(), coerce_numeric=coerce_numeric
202+
values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload]
203+
values,
204+
set(),
205+
coerce_numeric=coerce_numeric,
206+
convert_to_masked_nullable=use_nullable_dtypes,
196207
)
197208
except (ValueError, TypeError):
198209
if errors == "raise":
199210
raise
200211

212+
if new_mask is not None:
213+
# Drop the masked entries: only the valid values are expected later, and this enables
214+
# downcasting
215+
values = values[~new_mask]
216+
elif use_nullable_dtypes and new_mask is None:
217+
new_mask = np.zeros(values.shape, dtype=np.bool_)
218+
201219
# attempt downcast only if the data has been successfully converted
202220
# to a numerical dtype and if a downcast method has been specified
203221
if downcast is not None and is_numeric_dtype(values.dtype):
@@ -228,18 +246,31 @@ def to_numeric(
228246
if values.dtype == dtype:
229247
break
230248

231-
# GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
232-
if mask is not None:
249+
# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
250+
# masked array
251+
if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
252+
if mask is None:
253+
mask = new_mask
254+
else:
255+
mask = mask.copy()
256+
assert isinstance(mask, np.ndarray)
233257
data = np.zeros(mask.shape, dtype=values.dtype)
234258
data[~mask] = values
235259

236260
from pandas.core.arrays import (
261+
BooleanArray,
237262
FloatingArray,
238263
IntegerArray,
239264
)
240265

241-
klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
242-
values = klass(data, mask.copy())
266+
klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
267+
if is_integer_dtype(data.dtype):
268+
klass = IntegerArray
269+
elif is_bool_dtype(data.dtype):
270+
klass = BooleanArray
271+
else:
272+
klass = FloatingArray
273+
values = klass(data, mask)
243274

244275
if is_series:
245276
return arg._constructor(values, index=arg.index, name=arg.name)

pandas/tests/tools/test_to_numeric.py

+69
Original file line numberDiff line numberDiff line change
@@ -807,3 +807,72 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val):
807807
expected = Series([val])
808808
result = to_numeric(expected, downcast="float")
809809
tm.assert_series_equal(result, expected)
810+
811+
812+
@pytest.mark.parametrize(
    "val, dtype",
    [
        (1, "Int64"),
        (1.5, "Float64"),
        (True, "boolean"),
    ],
)
def test_to_numeric_use_nullable_dtypes(val, dtype):
    """Check that a null-free object Series comes back with the given nullable dtype."""
    # GH#50505
    object_ser = Series([val], dtype=object)
    tm.assert_series_equal(
        to_numeric(object_ser, use_nullable_dtypes=True),
        Series([val], dtype=dtype),
    )
821+
822+
823+
@pytest.mark.parametrize(
    "val, dtype",
    [
        (1, "Int64"),
        (1.5, "Float64"),
        (True, "boolean"),
    ],
)
def test_to_numeric_use_nullable_dtypes_na(val, dtype):
    """Check that a ``None`` entry in an object Series is returned as ``pd.NA``."""
    # GH#50505
    object_ser = Series([val, None], dtype=object)
    tm.assert_series_equal(
        to_numeric(object_ser, use_nullable_dtypes=True),
        Series([val, pd.NA], dtype=dtype),
    )
832+
833+
834+
@pytest.mark.parametrize(
    "val, dtype, downcast",
    [
        (1, "Int8", "integer"),
        (1.5, "Float32", "float"),
        (1, "Int8", "signed"),
    ],
)
def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast):
    """Check that ``downcast`` is applied together with ``use_nullable_dtypes``."""
    # GH#50505
    object_ser = Series([val, None], dtype=object)
    converted = to_numeric(
        object_ser,
        use_nullable_dtypes=True,
        downcast=downcast,
    )
    tm.assert_series_equal(converted, Series([val, pd.NA], dtype=dtype))
844+
845+
846+
def test_to_numeric_use_nullable_dtypes_downcasting_uint():
    """Check that a ``UInt64`` Series with a missing value downcasts to ``UInt8``."""
    # GH#50505
    wide_ser = Series([1, pd.NA], dtype="UInt64")
    narrowed = to_numeric(
        wide_ser,
        use_nullable_dtypes=True,
        downcast="unsigned",
    )
    tm.assert_series_equal(narrowed, Series([1, pd.NA], dtype="UInt8"))
852+
853+
854+
@pytest.mark.parametrize(
    "dtype",
    ["Int64", "UInt64", "Float64", "boolean"],
)
def test_to_numeric_use_nullable_dtypes_already_nullable(dtype):
    """Check that a Series already holding a nullable dtype keeps that dtype."""
    # GH#50505
    nullable_ser = Series([1, pd.NA], dtype=dtype)
    tm.assert_series_equal(
        to_numeric(nullable_ser, use_nullable_dtypes=True),
        Series([1, pd.NA], dtype=dtype),
    )
861+
862+
863+
@pytest.mark.parametrize(
    "use_nullable_dtypes, dtype",
    [
        (True, "Float64"),
        (False, "float64"),
    ],
)
def test_to_numeric_use_nullable_dtypes_error(use_nullable_dtypes, dtype):
    """Exercise each ``errors`` mode on strings that cannot be parsed as numbers."""
    # GH#50505
    unparsable = Series(["a", "b", ""])
    untouched = unparsable.copy()
    flag = {"use_nullable_dtypes": use_nullable_dtypes}

    # Default errors="raise": the parse failure propagates.
    with pytest.raises(ValueError, match="Unable to parse string"):
        to_numeric(unparsable, **flag)

    # errors="ignore": the result is expected to equal the input.
    ignored = to_numeric(unparsable, errors="ignore", **flag)
    tm.assert_series_equal(ignored, untouched)

    # errors="coerce": every entry is expected to be missing in the target dtype.
    coerced = to_numeric(unparsable, errors="coerce", **flag)
    all_missing = Series([np.nan, np.nan, np.nan], dtype=dtype)
    tm.assert_series_equal(coerced, all_missing)

0 commit comments

Comments
 (0)