ENH: Add use_nullable_dtypes to to_numeric (#50505)

phofl · web-flow · commit bad377488059 · 2023-01-10T09:27:02.000+01:00
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql_query`
 * :func:`read_sql_table`
 * :func:`read_orc`
+* :func:`to_numeric`
 
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2272,6 +2272,7 @@ def maybe_convert_numeric(
             if convert_empty or seen.coerce_numeric:
                 seen.saw_null()
                 floats[i] = complexes[i] = NaN
+                mask[i] = 1
             else:
                 raise ValueError("Empty string encountered")
         elif util.is_complex_object(val):
@@ -2328,6 +2329,7 @@ def maybe_convert_numeric(
 
                 seen.saw_null()
                 floats[i] = NaN
+                mask[i] = 1
 
     if seen.check_uint64_conflict():
         return (values, None)
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -13,11 +13,13 @@
 from pandas.core.dtypes.cast import maybe_downcast_numeric
 from pandas.core.dtypes.common import (
     ensure_object,
+    is_bool_dtype,
     is_datetime_or_timedelta_dtype,
     is_decimal,
     is_integer_dtype,
     is_number,
     is_numeric_dtype,
+    is_object_dtype,
     is_scalar,
     needs_i8_conversion,
 )
@@ -27,13 +29,14 @@
 )
 
 import pandas as pd
-from pandas.core.arrays.numeric import NumericArray
+from pandas.core.arrays import BaseMaskedArray
 
 
 def to_numeric(
     arg,
     errors: DateTimeErrorChoices = "raise",
     downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
+    use_nullable_dtypes: bool = False,
 ):
     """
     Convert argument to a numeric type.
@@ -47,7 +50,7 @@ def to_numeric(
     numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
     or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
     passed in, it is very likely they will be converted to float so that
-    they can stored in an `ndarray`. These warnings apply similarly to
+    they can be stored in an `ndarray`. These warnings apply similarly to
     `Series` since it internally leverages `ndarray`.
 
     Parameters
@@ -78,6 +81,10 @@ def to_numeric(
         the dtype it is to be cast to, so if none of the dtypes
         checked satisfy that specification, no downcasting will be
         performed on the data.
+    use_nullable_dtypes : bool, default False
+        Whether or not to use nullable dtypes as default when converting data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
 
     Returns
     -------
@@ -178,11 +185,12 @@ def to_numeric(
     # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
     # save mask to reconstruct the full array after casting
     mask: npt.NDArray[np.bool_] | None = None
-    if isinstance(values, NumericArray):
+    if isinstance(values, BaseMaskedArray):
         mask = values._mask
         values = values._data[~mask]
 
     values_dtype = getattr(values, "dtype", None)
+    new_mask: np.ndarray | None = None
     if is_numeric_dtype(values_dtype):
         pass
     elif is_datetime_or_timedelta_dtype(values_dtype):
@@ -191,13 +199,23 @@ def to_numeric(
         values = ensure_object(values)
         coerce_numeric = errors not in ("ignore", "raise")
         try:
-            values, _ = lib.maybe_convert_numeric(
-                values, set(), coerce_numeric=coerce_numeric
+            values, new_mask = lib.maybe_convert_numeric(  # type: ignore[call-overload]
+                values,
+                set(),
+                coerce_numeric=coerce_numeric,
+                convert_to_masked_nullable=use_nullable_dtypes,
             )
         except (ValueError, TypeError):
             if errors == "raise":
                 raise
 
+    if new_mask is not None:
+        # Drop the masked (null) entries: the masked-array reconstruction below
+        # expects only the valid values, and this also enables downcasting
+        values = values[~new_mask]
+    elif use_nullable_dtypes and new_mask is None:
+        new_mask = np.zeros(values.shape, dtype=np.bool_)
+
     # attempt downcast only if the data has been successfully converted
     # to a numerical dtype and if a downcast method has been specified
     if downcast is not None and is_numeric_dtype(values.dtype):
@@ -228,18 +246,31 @@ def to_numeric(
                     if values.dtype == dtype:
                         break
 
-    # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
-    if mask is not None:
+    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
+    # masked array
+    if (mask is not None or new_mask is not None) and not is_object_dtype(values.dtype):
+        if mask is None:
+            mask = new_mask
+        else:
+            mask = mask.copy()
+        assert isinstance(mask, np.ndarray)
         data = np.zeros(mask.shape, dtype=values.dtype)
         data[~mask] = values
 
         from pandas.core.arrays import (
+            BooleanArray,
             FloatingArray,
             IntegerArray,
         )
 
-        klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
-        values = klass(data, mask.copy())
+        klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
+        if is_integer_dtype(data.dtype):
+            klass = IntegerArray
+        elif is_bool_dtype(data.dtype):
+            klass = BooleanArray
+        else:
+            klass = FloatingArray
+        values = klass(data, mask)
 
     if is_series:
         return arg._constructor(values, index=arg.index, name=arg.name)
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
@@ -807,3 +807,72 @@ def test_to_numeric_large_float_not_downcast_to_float_32(val):
     expected = Series([val])
     result = to_numeric(expected, downcast="float")
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
+)
+def test_to_numeric_use_nullable_dtypes(val, dtype):
+    # GH#50505
+    ser = Series([val], dtype=object)
+    result = to_numeric(ser, use_nullable_dtypes=True)
+    expected = Series([val], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
+)
+def test_to_numeric_use_nullable_dtypes_na(val, dtype):
+    # GH#50505
+    ser = Series([val, None], dtype=object)
+    result = to_numeric(ser, use_nullable_dtypes=True)
+    expected = Series([val, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype, downcast",
+    [(1, "Int8", "integer"), (1.5, "Float32", "float"), (1, "Int8", "signed")],
+)
+def test_to_numeric_use_nullable_dtypes_downcasting(val, dtype, downcast):
+    # GH#50505
+    ser = Series([val, None], dtype=object)
+    result = to_numeric(ser, use_nullable_dtypes=True, downcast=downcast)
+    expected = Series([val, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_numeric_use_nullable_dtypes_downcasting_uint():
+    # GH#50505
+    ser = Series([1, pd.NA], dtype="UInt64")
+    result = to_numeric(ser, use_nullable_dtypes=True, downcast="unsigned")
+    expected = Series([1, pd.NA], dtype="UInt8")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "UInt64", "Float64", "boolean"])
+def test_to_numeric_use_nullable_dtypes_already_nullable(dtype):
+    # GH#50505
+    ser = Series([1, pd.NA], dtype=dtype)
+    result = to_numeric(ser, use_nullable_dtypes=True)
+    expected = Series([1, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "use_nullable_dtypes, dtype", [(True, "Float64"), (False, "float64")]
+)
+def test_to_numeric_use_nullable_dtypes_error(use_nullable_dtypes, dtype):
+    # GH#50505
+    ser = Series(["a", "b", ""])
+    expected = ser.copy()
+    with pytest.raises(ValueError, match="Unable to parse string"):
+        to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes)
+
+    result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="ignore")
+    tm.assert_series_equal(result, expected)
+
+    result = to_numeric(ser, use_nullable_dtypes=use_nullable_dtypes, errors="coerce")
+    expected = Series([np.nan, np.nan, np.nan], dtype=dtype)
+    tm.assert_series_equal(result, expected)