# pandas/tests/tools/test_to_numeric.py

import decimal

import numpy as np
from numpy import iinfo
import pytest

import pandas as pd
from pandas import DataFrame, Index, Series, to_numeric
import pandas._testing as tm


@pytest.fixture(params=(None, "ignore", "raise", "coerce"))
def errors(request):
    """
    Each ``errors`` option exercised by this module.

    ``None`` is a sentinel: tests that use this fixture omit the ``errors``
    keyword entirely when they receive it.
    """
    return request.param


@pytest.fixture(params=(True, False))
def signed(request):
    """Boolean toggle; tests negate their value when it is true."""
    return request.param


@pytest.fixture(params=(lambda x: x, str), ids=("identity", "str"))
def transform(request):
    """A callable that either leaves a value unchanged or converts it to ``str``."""
    return request.param


@pytest.fixture(params=(47393996303418497800, 100000000000000000000))
def large_val(request):
    """Positive integers larger than the maximum ``uint64`` value."""
    return request.param


@pytest.fixture(params=(True, False))
def multiple_elts(request):
    """Boolean toggle for whether a test array gets an additional element."""
    return request.param


@pytest.fixture(
    params=(
        (lambda x: Index(x, name="idx"), tm.assert_index_equal),
        (lambda x: Series(x, name="ser"), tm.assert_series_equal),
        (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
    )
)
def transform_assert_equal(request):
    """
    A ``(constructor, assertion)`` pair.

    The constructor wraps raw data as an Index, a Series or an ndarray, and
    the assertion is the ``tm`` helper that compares objects of that kind.
    """
    return request.param


@pytest.mark.parametrize(
    "input_kwargs,result_kwargs",
    [
        ({}, {"dtype": np.int64}),
        ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
    ],
)
def test_empty(input_kwargs, result_kwargs):
    """An empty object Series converts to an empty Series of the expected dtype."""
    # see gh-16302
    empty_obj = Series([], dtype=object)
    converted = to_numeric(empty_obj, **input_kwargs)

    tm.assert_series_equal(converted, Series([], **result_kwargs))


@pytest.mark.parametrize("last_val", ["7", 7])
def test_series(last_val):
    """Numeric strings, optionally mixed with a real int, are parsed to numbers."""
    raw = Series(["1", "-3.14", last_val])
    converted = to_numeric(raw)

    tm.assert_series_equal(converted, Series([1, -3.14, 7]))


@pytest.mark.parametrize(
    "data",
    [
        [1, 3, 4, 5],
        [1.0, 3.0, 4.0, 5.0],
        # Bool is regarded as numeric.
        [True, False, True, True],
    ],
)
def test_series_numeric(data):
    """An already-numeric Series comes back equal to the input."""
    original = Series(data, index=list("ABCD"), name="EFG")

    tm.assert_series_equal(to_numeric(original), original)


@pytest.mark.parametrize(
    "data,msg",
    [
        ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'),
        (
            ["orange", 1, -3.14, "apple"],
            'Unable to parse string "orange" at position 0',
        ),
    ],
)
def test_error(data, msg):
    """With ``errors="raise"`` the error names the offending string and position."""
    unparseable = Series(data)

    with pytest.raises(ValueError, match=msg):
        to_numeric(unparseable, errors="raise")


@pytest.mark.parametrize(
    "errors,exp_data",
    [
        ("ignore", [1, -3.14, "apple"]),
        ("coerce", [1, -3.14, np.nan]),
    ],
)
def test_ignore_error(errors, exp_data):
    """Non-raising ``errors`` modes return the expected data instead of raising."""
    mixed = Series([1, -3.14, "apple"])
    converted = to_numeric(mixed, errors=errors)

    tm.assert_series_equal(converted, Series(exp_data))


@pytest.mark.parametrize(
    "errors,exp",
    [
        ("raise", 'Unable to parse string "apple" at position 2'),
        ("ignore", [True, False, "apple"]),
        # Coerces to float.
        ("coerce", [1.0, 0.0, np.nan]),
    ],
)
def test_bool_handling(errors, exp):
    """
    Check a Series of booleans plus one bad string under each ``errors`` mode.

    A string ``exp`` is the expected error message; otherwise ``exp`` is the
    expected result data.
    """
    mixed = Series([True, False, "apple"])

    if not isinstance(exp, str):
        converted = to_numeric(mixed, errors=errors)
        tm.assert_series_equal(converted, Series(exp))
        return

    with pytest.raises(ValueError, match=exp):
        to_numeric(mixed, errors=errors)


def test_list():
    """A plain list of numeric strings converts to a NumPy array."""
    raw = ["1", "-3.14", "7"]
    converted = to_numeric(raw)

    tm.assert_numpy_array_equal(converted, np.array([1, -3.14, 7]))


@pytest.mark.parametrize(
    "data,arr_kwargs",
    [
        ([1, 3, 4, 5], {"dtype": np.int64}),
        ([1.0, 3.0, 4.0, 5.0], {}),
        # Boolean is regarded as numeric.
        ([True, False, True, True], {}),
    ],
)
def test_list_numeric(data, arr_kwargs):
    """An already-numeric list converts to the equivalent NumPy array."""
    converted = to_numeric(data)

    tm.assert_numpy_array_equal(converted, np.array(data, **arr_kwargs))


@pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
def test_numeric(kwargs):
    """Numbers give the same result whether or not the Series is object dtype."""
    values = [1, -3.14, 7]
    converted = to_numeric(Series(values, **kwargs))

    tm.assert_series_equal(converted, Series(values))


@pytest.mark.parametrize(
    "columns",
    [
        # One column.
        "a",
        # Multiple columns.
        ["a", "b"],
    ],
)
def test_numeric_df_columns(columns):
    """Applying ``to_numeric`` to selected DataFrame columns converts them."""
    # see gh-14827
    mixed_col = [1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"]
    float_col = [1.0, 2.0, 3.0, 4.0]
    expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]})

    converted = DataFrame({"a": mixed_col, "b": float_col}).copy()
    converted[columns] = converted[columns].apply(to_numeric)

    tm.assert_frame_equal(converted, expected)


@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1],
            [[3.14, 1.0], 1.6, 0.1],
        ),
        ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
    ],
)
def test_numeric_embedded_arr_likes(data, exp_data):
    """Element-wise ``to_numeric`` handles cells holding lists and arrays."""
    frame = DataFrame({"a": data})
    frame["a"] = frame["a"].apply(to_numeric)

    tm.assert_frame_equal(frame, DataFrame({"a": exp_data}))


def test_all_nan():
    """Coercing a Series made only of unparseable strings yields all NaN."""
    unparseable = Series(["a", "b", "c"])
    coerced = to_numeric(unparseable, errors="coerce")

    tm.assert_series_equal(coerced, Series([np.nan, np.nan, np.nan]))


def test_type_check(errors):
    """Passing a DataFrame raises ``TypeError`` for every ``errors`` option."""
    # see gh-11776
    frame = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
    kwargs = {} if errors is None else {"errors": errors}

    with pytest.raises(TypeError, match="1-d array"):
        to_numeric(frame, **kwargs)


@pytest.mark.parametrize("val", [1, 1.1, 20001])
def test_scalar(val, signed, transform):
    """A scalar, or its string form, converts to a value equal to ``float(val)``."""
    if signed:
        val = -val

    converted = to_numeric(transform(val))
    assert converted == float(val)


def test_really_large_scalar(large_val, signed, transform, errors):
    """
    Check a single out-of-range integer, given as an int or as a string.

    A string input must raise when ``errors`` is unset or ``"raise"``, and is
    expected as a float under ``"coerce"``. In every other case the value is
    expected back unchanged.
    """
    # see gh-24910
    kwargs = {} if errors is None else {"errors": errors}
    val = transform(-large_val if signed else large_val)
    is_str = isinstance(val, str)

    if is_str and errors in (None, "raise"):
        msg = "Integer out of range. at position 0"
        with pytest.raises(ValueError, match=msg):
            to_numeric(val, **kwargs)
        return

    if errors == "coerce" and is_str:
        expected = float(val)
    else:
        expected = val

    tm.assert_almost_equal(to_numeric(val, **kwargs), expected)


def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
    """
    Check an out-of-range integer inside a list, optionally followed by a
    non-numeric string.

    When ``errors`` is unset or ``"raise"``, a string value or the extra
    element must raise. Otherwise the result is compared against an array
    built with the dtype this test expects for the given combination.
    """
    # see gh-24910
    kwargs = {} if errors is None else {"errors": errors}
    val = transform(-large_val if signed else large_val)

    extra_elt = "string"
    arr = [val, extra_elt] if multiple_elts else [val]

    is_str = isinstance(val, str)
    coercing = errors == "coerce"

    if errors in (None, "raise") and (is_str or multiple_elts):
        # The out-of-range string at position 0 is reported before the
        # unparseable extra element at position 1.
        msg = (
            "Integer out of range. at position 0"
            if is_str
            else 'Unable to parse string "string" at position 1'
        )
        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
        return

    result = to_numeric(arr, **kwargs)
    exp_val = float(val) if (coercing and is_str) else val

    if not multiple_elts:
        expected = [exp_val]
        exp_dtype = float if isinstance(exp_val, (int, float)) else object
    elif coercing:
        expected = [exp_val, np.nan]
        exp_dtype = float
    else:
        expected = [exp_val, extra_elt]
        exp_dtype = object

    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
    """
    Check an out-of-range integer string, optionally preceded by the same
    magnitude as a real int.
    """
    # see gh-24910
    #
    # Even if we discover that we have to hold float, does not mean
    # we should be lenient on subsequent elements that fail to be integer.
    kwargs = {} if errors is None else {"errors": errors}
    str_val = str(-large_val if signed else large_val)
    arr = [large_val, str_val] if multiple_elts else [str_val]

    if errors in (None, "raise"):
        # The string sits at position 1 when the int was put in front of it.
        str_pos = int(multiple_elts)
        msg = f"Integer out of range. at position {str_pos}"

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
        return

    result = to_numeric(arr, **kwargs)

    if errors == "coerce":
        expected, exp_dtype = [float(elt) for elt in arr], float
    else:
        expected, exp_dtype = arr, object

    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


@pytest.mark.parametrize(
    "errors,checker",
    [
        ("raise", 'Unable to parse string "fail" at position 0'),
        ("ignore", lambda x: x == "fail"),
        ("coerce", lambda x: np.isnan(x)),
    ],
)
def test_scalar_fail(errors, checker):
    """
    Check an unparseable scalar string under each ``errors`` mode.

    A string ``checker`` is the expected error message; otherwise it is a
    predicate that the returned value must satisfy.
    """
    scalar = "fail"

    if not isinstance(checker, str):
        assert checker(to_numeric(scalar, errors=errors))
        return

    with pytest.raises(ValueError, match=checker):
        to_numeric(scalar, errors=errors)


@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]])
def test_numeric_dtypes(data, transform_assert_equal):
    """Numeric input in an Index, Series or ndarray comes back equal to itself."""
    build, assert_equal = transform_assert_equal
    container = build(data)

    assert_equal(to_numeric(container), container)


@pytest.mark.parametrize(
    "data,exp",
    [
        (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")),
        (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])),
    ],
)
def test_str(data, exp, transform_assert_equal):
    """Numeric strings in an Index, Series or ndarray are parsed to numbers."""
    build, assert_equal = transform_assert_equal
    converted = to_numeric(build(data))

    assert_equal(converted, build(exp))


def test_datetime_like(tz_naive_fixture, transform_assert_equal):
    """Datetime data converts to the values given by the index's ``asi8``."""
    build, assert_equal = transform_assert_equal
    dti = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)

    converted = to_numeric(build(dti))
    assert_equal(converted, build(dti.asi8))


def test_timedelta(transform_assert_equal):
    """Timedelta data converts to the values given by the index's ``asi8``."""
    build, assert_equal = transform_assert_equal
    tdi = pd.timedelta_range("1 days", periods=3, freq="D")

    converted = to_numeric(build(tdi))
    assert_equal(converted, build(tdi.asi8))


def test_period(transform_assert_equal):
    """
    Period data held in an Index converts to the values given by ``asi8``.

    The Series and ndarray variants are skipped.
    """
    build, assert_equal = transform_assert_equal

    pi = pd.period_range("2011-01", periods=3, freq="M", name="")
    inp = build(pi)

    if not isinstance(inp, Index):
        # TODO: PeriodDtype, so support it in to_numeric.
        pytest.skip("Missing PeriodDtype support in to_numeric")

    converted = to_numeric(inp)
    assert_equal(converted, build(pi.asi8))


@pytest.mark.parametrize(
    "errors,expected",
    [
        ("raise", "Invalid object type at position 0"),
        ("ignore", Series([[10.0, 2], 1.0, "apple"])),
        ("coerce", Series([np.nan, 1.0, np.nan])),
    ],
)
def test_non_hashable(errors, expected):
    """
    Check a Series containing a list element under each ``errors`` mode.

    A string ``expected`` is the ``TypeError`` message to match; otherwise it
    is the expected result Series.
    """
    # see gh-13324
    with_list = Series([[10.0, 2], 1.0, "apple"])

    if not isinstance(expected, str):
        tm.assert_series_equal(to_numeric(with_list, errors=errors), expected)
        return

    with pytest.raises(TypeError, match=expected):
        to_numeric(with_list, errors=errors)


def test_downcast_invalid_cast():
    """An unrecognised ``downcast`` value raises ``ValueError``."""
    # see gh-13352
    values = ["1", 2, 3]
    bad_downcast = "unsigned-integer"

    with pytest.raises(ValueError, match="invalid downcasting method provided"):
        to_numeric(values, downcast=bad_downcast)


def test_errors_invalid_value():
    """An unrecognised ``errors`` value raises ``ValueError``."""
    # see gh-26466
    values = ["1", 2, 3]
    bad_errors = "invalid"

    with pytest.raises(ValueError, match="invalid error value specified"):
        to_numeric(values, errors=bad_errors)


@pytest.mark.parametrize(
    "data",
    [
        ["1", 2, 3],
        [1, 2, 3],
        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
    ],
)
@pytest.mark.parametrize(
    "kwargs,exp_dtype",
    [
        # Basic function tests.
        ({}, np.int64),
        ({"downcast": None}, np.int64),
        # Support below np.float32 is rare and far between.
        ({"downcast": "float"}, np.dtype(np.float32).char),
        # Basic dtype support.
        ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
    ],
)
def test_downcast_basic(data, kwargs, exp_dtype):
    """Each input form yields ``[1, 2, 3]`` in the dtype implied by ``downcast``."""
    # see gh-13352
    converted = to_numeric(data, **kwargs)

    tm.assert_numpy_array_equal(converted, np.array([1, 2, 3], dtype=exp_dtype))


@pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
@pytest.mark.parametrize(
    "data",
    [
        ["1", 2, 3],
        [1, 2, 3],
        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
    ],
)
def test_signed_downcast(data, signed_downcast):
    """Signed downcasting yields the first dtype in NumPy's "Integer" typecodes."""
    # see gh-13352
    first_int_code = np.typecodes["Integer"][0]
    expected = np.array([1, 2, 3], dtype=np.dtype(first_int_code))

    converted = to_numeric(data, downcast=signed_downcast)
    tm.assert_numpy_array_equal(converted, expected)


def test_ignore_downcast_invalid_data():
    """With ``errors="ignore"``, unconvertible data comes back as an object array."""
    # If we can't successfully cast the given
    # data to a numeric dtype, do not bother
    # with the downcast parameter.
    raw = ["foo", 2, 3]
    untouched = np.array(raw, dtype=object)

    converted = to_numeric(raw, errors="ignore", downcast="unsigned")
    tm.assert_numpy_array_equal(converted, untouched)


def test_ignore_downcast_neg_to_unsigned():
    """Unsigned downcasting leaves data with a negative value as ``int64``."""
    # Cannot cast to an unsigned integer
    # because we have a negative number.
    raw = ["-1", 2, 3]
    converted = to_numeric(raw, downcast="unsigned")

    tm.assert_numpy_array_equal(converted, np.array([-1, 2, 3], dtype=np.int64))


@pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
@pytest.mark.parametrize(
    "data,expected",
    [
        (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)),
        (
            [10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
            np.array(
                [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64
            ),
        ),
    ],
)
def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
    """Integer downcasts leave data with a fractional value as ``float64``."""
    # Cannot cast to an integer (signed or unsigned)
    # because we have a float number.
    converted = to_numeric(data, downcast=downcast)
    tm.assert_numpy_array_equal(converted, expected)


@pytest.mark.parametrize(
    "downcast,expected_dtype",
    [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)],
)
def test_downcast_not8bit(downcast, expected_dtype):
    """Values of 256 and above are downcast to a 16-bit integer dtype."""
    # the smallest integer dtype need not be np.(u)int8
    raw = ["256", 257, 258]
    converted = to_numeric(raw, downcast=downcast)

    expected = np.array([256, 257, 258], dtype=expected_dtype)
    tm.assert_numpy_array_equal(converted, expected)


@pytest.mark.parametrize(
    "dtype,downcast,min_max",
    [
        ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
        ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
        ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
        ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
        ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
        ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
        ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
        ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
        ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
        ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
        ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
        ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
        ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
        ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
        ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]),
        ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]),
        ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]),
    ],
)
def test_downcast_limits(dtype, downcast, min_max):
    """A ``[min, max]`` pair is downcast to the dtype named by ``dtype``."""
    # see gh-14404: test the limits of each downcast.
    bounds = Series(min_max)
    downcasted = to_numeric(bounds, downcast=downcast)

    assert downcasted.dtype == dtype


@pytest.mark.parametrize(
    "ser,expected",
    [
        (
            Series([0, 9223372036854775808]),
            Series([0, 9223372036854775808], dtype=np.uint64),
        )
    ],
)
def test_downcast_uint64(ser, expected):
    """Unsigned downcasting keeps a value above the ``int64`` maximum as ``uint64``."""
    # see gh-14422:
    # BUG: to_numeric doesn't work uint64 numbers
    downcasted = pd.to_numeric(ser, downcast="unsigned")
    tm.assert_series_equal(downcasted, expected)


@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            [200, 300, "", "NaN", 30000000000000000000],
            [200, 300, np.nan, np.nan, 30000000000000000000],
        ),
        (
            ["12345678901234567890", "1234567890", "ITEM"],
            [12345678901234567890, 1234567890, np.nan],
        ),
    ],
)
def test_coerce_uint64_conflict(data, exp_data):
    """Coercing data that mixes uint64-sized values with NaN gives a float Series."""
    # see gh-17007 and gh-17125
    #
    # Still returns float despite the uint64-nan conflict,
    # which would normally force the casting to object.
    coerced = to_numeric(Series(data), errors="coerce")

    tm.assert_series_equal(coerced, Series(exp_data, dtype=float))


@pytest.mark.parametrize(
    "errors,exp",
    [
        ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])),
        ("raise", "Unable to parse string"),
    ],
)
def test_non_coerce_uint64_conflict(errors, exp):
    """
    Check the uint64/unparseable-string mix under the non-coercing modes.

    A string ``exp`` is the expected ``ValueError`` message; otherwise ``exp``
    is the Series the result must equal.
    """
    # see gh-17007 and gh-17125
    #
    # For completeness.
    ser = Series(["12345678901234567890", "1234567890", "ITEM"])

    if isinstance(exp, str):
        with pytest.raises(ValueError, match=exp):
            to_numeric(ser, errors=errors)
    else:
        result = to_numeric(ser, errors=errors)
        # Compare against the parametrized expectation rather than the input,
        # so that the ``exp`` value supplied above is actually exercised.
        tm.assert_series_equal(result, exp)


@pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"])
@pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"])
def test_downcast_empty(dc1, dc2):
    """Empty input gives equal results, ignoring dtype, for any ``downcast`` pair."""
    # GH32493
    left = pd.to_numeric([], downcast=dc1)
    right = pd.to_numeric([], downcast=dc2)

    tm.assert_numpy_array_equal(left, right, check_dtype=False)


def test_failure_to_convert_uint64_string_to_NaN():
    """The literal string "uint64" coerces to NaN, both alone and in a Series."""
    # GH 32394
    scalar_result = to_numeric("uint64", errors="coerce")
    assert np.isnan(scalar_result)

    expected = Series([32, 64, np.nan])
    series_result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce")
    tm.assert_series_equal(series_result, expected)


@pytest.mark.parametrize(
    "strrep",
    [
        "243.164",
        "245.968",
        "249.585",
        "259.745",
        "265.742",
        "272.567",
        "279.196",
        "280.366",
        "275.034",
        "271.351",
        "272.889",
        "270.627",
        "280.828",
        "290.383",
        "308.153",
        "319.945",
        "336.0",
        "344.09",
        "351.385",
        "356.178",
        "359.82",
        "361.03",
        "367.701",
        "380.812",
        "387.98",
        "391.749",
        "391.171",
        "385.97",
        "385.345",
        "386.121",
        "390.996",
        "399.734",
        "413.073",
        "421.532",
        "430.221",
        "437.092",
        "439.746",
        "446.01",
        "451.191",
        "460.463",
        "469.779",
        "472.025",
        "479.49",
        "474.864",
        "467.54",
        "471.978",
    ],
)
def test_precision_float_conversion(strrep):
    """Parsing a decimal string gives exactly the value the ``float`` builtin gives."""
    # GH 31364
    parsed = to_numeric(strrep)
    builtin_value = float(strrep)

    assert parsed == builtin_value


@pytest.mark.parametrize(
    "values, expected",
    [
        (["1", "2", None], Series([1, 2, np.nan])),
        (["1", "2", "3"], Series([1, 2, 3])),
        (["1", "2", 3], Series([1, 2, 3])),
        (["1", "2", 3.5], Series([1, 2, 3.5])),
        (["1", None, 3.5], Series([1, np.nan, 3.5])),
        (["1", "2", "3.5"], Series([1, 2, 3.5])),
    ],
)
def test_to_numeric_from_nullable_string(values, expected):
    """A ``"string"``-dtype Series converts to the expected numeric Series."""
    # https://github.com/pandas-dev/pandas/issues/37262
    string_ser = Series(values, dtype="string")

    tm.assert_series_equal(to_numeric(string_ser), expected)


@pytest.mark.parametrize(
    "data, input_dtype, downcast, expected_dtype",
    (
        ([1, 1], "Int64", "integer", "Int8"),
        ([1.0, 1.0], "Float64", "integer", "Int8"),
        ([1.0, 1.1], "Float64", "integer", "Float64"),
        ([1, pd.NA], "Int64", "integer", "Int8"),
        ([450, 300], "Int64", "integer", "Int16"),
        ([1, 1], "Int64", "signed", "Int8"),
        ([1.0, 1.0], "Float32", "signed", "Int8"),
        ([1.0, 1.1], "Float64", "signed", "Float64"),
        ([1, pd.NA], "Int64", "signed", "Int8"),
        ([450, -300], "Int64", "signed", "Int16"),
        ([1, 1], "Int64", "unsigned", "UInt8"),
        ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
        ([1.0, 1.1], "Float64", "unsigned", "Float64"),
        ([1, pd.NA], "Int64", "unsigned", "UInt8"),
        ([450, -300], "Int64", "unsigned", "Int64"),
        ([-1, -1], "Int32", "unsigned", "Int32"),
        ([1, 1], "Float64", "float", "Float32"),
        ([1, 1.1], "Float64", "float", "Float32"),
        ([1, 1], "Float64", "integer", "Int8"),
    ),
)
def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
    """Downcasting a nullable extension array yields the expected nullable dtype."""
    source = pd.array(data, dtype=input_dtype)
    downcasted = pd.to_numeric(source, downcast=downcast)

    tm.assert_extension_array_equal(
        downcasted, pd.array(data, dtype=expected_dtype)
    )