diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5b725eb4d2a98..43571b3879622 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -44,6 +44,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_sql_query` * :func:`read_sql_table` * :func:`read_orc` +* :func:`to_numeric` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 89e02ac0fa86d..176307ef27cff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2272,6 +2272,7 @@ def maybe_convert_numeric( if convert_empty or seen.coerce_numeric: seen.saw_null() floats[i] = complexes[i] = NaN + mask[i] = 1 else: raise ValueError("Empty string encountered") elif util.is_complex_object(val): @@ -2328,6 +2329,7 @@ def maybe_convert_numeric( seen.saw_null() floats[i] = NaN + mask[i] = 1 if seen.check_uint64_conflict(): return (values, None) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index da47aa549dfa3..a8ae8c47b0d19 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -13,11 +13,13 @@ from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( ensure_object, + is_bool_dtype, is_datetime_or_timedelta_dtype, is_decimal, is_integer_dtype, is_number, is_numeric_dtype, + is_object_dtype, is_scalar, needs_i8_conversion, ) @@ -27,13 +29,14 @@ ) import pandas as pd -from pandas.core.arrays.numeric import NumericArray +from pandas.core.arrays import BaseMaskedArray def to_numeric( arg, errors: DateTimeErrorChoices = "raise", downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, + use_nullable_dtypes: bool = False, ): """ Convert argument to a numeric type. @@ -47,7 +50,7 @@ def to_numeric( numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are passed in, it is very likely they will be converted to float so that - they can stored in an `ndarray`. These warnings apply similarly to + they can be stored in an `ndarray`. These warnings apply similarly to `Series` since it internally leverages `ndarray`. Parameters @@ -78,6 +81,10 @@ def to_numeric( the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when converting data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. Returns ------- @@ -178,11 +185,12 @@ def to_numeric( # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None - if isinstance(values, NumericArray): + if isinstance(values, BaseMaskedArray): mask = values._mask values = values._data[~mask] values_dtype = getattr(values, "dtype", None) + new_mask: np.ndarray | None = None if is_numeric_dtype(values_dtype): pass elif is_datetime_or_timedelta_dtype(values_dtype): @@ -191,13 +199,23 @@ def to_numeric( values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, _ = lib.maybe_convert_numeric( - values, set(), coerce_numeric=coerce_numeric + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] + values, + set(), + coerce_numeric=coerce_numeric, + convert_to_masked_nullable=use_nullable_dtypes, ) except (ValueError, TypeError): if errors == "raise": raise + if new_mask is not None: + # Remove unnecessary values, is expected later anyway and enables + # downcasting + values = values[~new_mask] + elif use_nullable_dtypes and new_mask is None: + new_mask = np.zeros(values.shape, dtype=np.bool_) + # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): @@ -228,18 +246,31 @@ def to_numeric( if values.dtype == dtype: break - # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array - if mask is not None: + # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct + # masked array + if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype): + if mask is None: + mask = new_mask + else: + mask = mask.copy() + assert isinstance(mask, np.ndarray) data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values from pandas.core.arrays import ( + BooleanArray, FloatingArray, IntegerArray, ) - klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray - values = klass(data, mask.copy()) + klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray] + if is_integer_dtype(data.dtype): + klass = IntegerArray + elif is_bool_dtype(data.dtype): + klass = BooleanArray + else: + klass = FloatingArray + values = klass(data, mask) if is_series: return arg._constructor(values, index=arg.index, name=arg.name) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1347f6eb50b09..1c0a8301d65cc 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -807,3 +807,72 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val): expected = Series([val]) result = to_numeric(expected, downcast="float") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] +) +def test_to_numeric_use_nullable_dtypes(val, dtype): + # GH#50505 + ser = Series([val], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([val], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")] +) +def test_to_numeric_use_nullable_dtypes_na(val, dtype): + # GH#50505 + ser = Series([val, None], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([val, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "val, dtype, downcast", + [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")], +) +def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast): + # GH#50505 + ser = Series([val, None], dtype=object) + result = to_numeric(ser, use_nullable_dtypes=True, downcast=downcast) + expected = Series([val, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + +def test_to_numeric_use_nullable_dtypes_downcasting_uint(): + # GH#50505 + ser = Series([1, pd.NA], dtype="UInt64") + result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned") + expected = Series([1, pd.NA], dtype="UInt8") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "UInt64", "Float64", "boolean"]) +def test_to_numeric_use_nullable_dtypes_already_nullable(dtype): + # GH#50505 + ser = Series([1, pd.NA], dtype=dtype) + result = to_numeric(ser, use_nullable_dtypes=True) + expected = Series([1, pd.NA], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")] +) +def test_to_numeric_use_nullable_dtypes_error(use_nullable_dtypes, dtype): + # GH#50505 + ser = Series(["a", "b", ""]) + expected = ser.copy() + with pytest.raises(ValueError, match="Unable to parse string"): + to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes) + + result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="ignore") + tm.assert_series_equal(result, expected) + + result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="coerce") + expected = Series([np.nan, np.nan, np.nan], dtype=dtype) + tm.assert_series_equal(result, expected)