Skip to content

Commit 07ceaaf

Browse files
gfyoung authored and Pingviinituutti committed
BUG: Better handle larger numbers in to_numeric (pandas-dev#24956)
* BUG: Better handle larger numbers in to_numeric * Warn about lossiness when passing really large numbers that exceed (u)int64 ranges. * Coerce negative numbers to float when requested instead of crashing and returning object. * Consistently parse numbers as integers / floats, even if we know that the resulting container has to be float. This is to ensure consistent error behavior when input numbers are too large. Closes gh-24910. * MAINT: Address comments
1 parent 9e89eb1 commit 07ceaaf

File tree

4 files changed

+158
-25
lines changed

4 files changed

+158
-25
lines changed

doc/source/whatsnew/v0.25.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,8 @@ Timezones
110110
Numeric
111111
^^^^^^^
112112

113+
- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
114+
- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
113115
-
114116
-
115117
-

pandas/_libs/lib.pyx

+15-10
Original file line numberDiff line numberDiff line change
@@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18281828
except (ValueError, OverflowError, TypeError):
18291829
pass
18301830

1831-
# otherwise, iterate and do full infererence
1831+
# Otherwise, iterate and do full inference.
18321832
cdef:
18331833
int status, maybe_int
18341834
Py_ssize_t i, n = values.size
@@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18651865
else:
18661866
seen.float_ = True
18671867

1868-
if val <= oINT64_MAX:
1868+
if oINT64_MIN <= val <= oINT64_MAX:
18691869
ints[i] = val
18701870

1871-
if seen.sint_ and seen.uint_:
1871+
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
18721872
seen.float_ = True
18731873

18741874
elif util.is_bool_object(val):
@@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
19101910
else:
19111911
seen.saw_int(as_int)
19121912

1913-
if not (seen.float_ or as_int in na_values):
1913+
if as_int not in na_values:
19141914
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
1915-
raise ValueError('Integer out of range.')
1915+
if seen.coerce_numeric:
1916+
seen.float_ = True
1917+
else:
1918+
raise ValueError("Integer out of range.")
1919+
else:
1920+
if as_int >= 0:
1921+
uints[i] = as_int
19161922

1917-
if as_int >= 0:
1918-
uints[i] = as_int
1919-
if as_int <= oINT64_MAX:
1920-
ints[i] = as_int
1923+
if as_int <= oINT64_MAX:
1924+
ints[i] = as_int
19211925

19221926
seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
19231927
else:
19241928
seen.float_ = True
19251929
except (TypeError, ValueError) as e:
19261930
if not seen.coerce_numeric:
1927-
raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
1931+
raise type(e)(str(e) + " at position {pos}".format(pos=i))
19281932
elif "uint64" in str(e): # Exception from check functions.
19291933
raise
1934+
19301935
seen.saw_null()
19311936
floats[i] = NaN
19321937

pandas/core/tools/numeric.py

+8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
1919
depending on the data supplied. Use the `downcast` parameter
2020
to obtain other dtypes.
2121
22+
Please note that precision loss may occur if really large numbers
23+
are passed in. Due to the internal limitations of `ndarray`, if
24+
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
25+
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
26+
passed in, it is very likely they will be converted to float so that
27+
they can be stored in an `ndarray`. These warnings apply similarly to
28+
`Series` since it internally leverages `ndarray`.
29+
2230
Parameters
2331
----------
2432
arg : scalar, list, tuple, 1-d array, or Series

pandas/tests/tools/test_numeric.py

+133-15
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,50 @@
44
from numpy import iinfo
55
import pytest
66

7+
import pandas.compat as compat
8+
79
import pandas as pd
810
from pandas import DataFrame, Index, Series, to_numeric
911
from pandas.util import testing as tm
1012

1113

14+
@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
15+
def errors(request):
16+
return request.param
17+
18+
19+
@pytest.fixture(params=[True, False])
20+
def signed(request):
21+
return request.param
22+
23+
24+
@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
25+
def transform(request):
26+
return request.param
27+
28+
29+
@pytest.fixture(params=[
30+
47393996303418497800,
31+
100000000000000000000
32+
])
33+
def large_val(request):
34+
return request.param
35+
36+
37+
@pytest.fixture(params=[True, False])
38+
def multiple_elts(request):
39+
return request.param
40+
41+
42+
@pytest.fixture(params=[
43+
(lambda x: Index(x, name="idx"), tm.assert_index_equal),
44+
(lambda x: Series(x, name="ser"), tm.assert_series_equal),
45+
(lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal)
46+
])
47+
def transform_assert_equal(request):
48+
return request.param
49+
50+
1251
@pytest.mark.parametrize("input_kwargs,result_kwargs", [
1352
(dict(), dict(dtype=np.int64)),
1453
(dict(errors="coerce", downcast="integer"), dict(dtype=np.int8))
@@ -172,7 +211,6 @@ def test_all_nan():
172211
tm.assert_series_equal(result, expected)
173212

174213

175-
@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
176214
def test_type_check(errors):
177215
# see gh-11776
178216
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
@@ -183,11 +221,100 @@ def test_type_check(errors):
183221
to_numeric(df, **kwargs)
184222

185223

186-
@pytest.mark.parametrize("val", [
187-
1, 1.1, "1", "1.1", -1.5, "-1.5"
188-
])
189-
def test_scalar(val):
190-
assert to_numeric(val) == float(val)
224+
@pytest.mark.parametrize("val", [1, 1.1, 20001])
225+
def test_scalar(val, signed, transform):
226+
val = -val if signed else val
227+
assert to_numeric(transform(val)) == float(val)
228+
229+
230+
def test_really_large_scalar(large_val, signed, transform, errors):
231+
# see gh-24910
232+
kwargs = dict(errors=errors) if errors is not None else dict()
233+
val = -large_val if signed else large_val
234+
235+
val = transform(val)
236+
val_is_string = isinstance(val, str)
237+
238+
if val_is_string and errors in (None, "raise"):
239+
msg = "Integer out of range. at position 0"
240+
with pytest.raises(ValueError, match=msg):
241+
to_numeric(val, **kwargs)
242+
else:
243+
expected = float(val) if (errors == "coerce" and
244+
val_is_string) else val
245+
assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
246+
247+
248+
def test_really_large_in_arr(large_val, signed, transform,
249+
multiple_elts, errors):
250+
# see gh-24910
251+
kwargs = dict(errors=errors) if errors is not None else dict()
252+
val = -large_val if signed else large_val
253+
val = transform(val)
254+
255+
extra_elt = "string"
256+
arr = [val] + multiple_elts * [extra_elt]
257+
258+
val_is_string = isinstance(val, str)
259+
coercing = errors == "coerce"
260+
261+
if errors in (None, "raise") and (val_is_string or multiple_elts):
262+
if val_is_string:
263+
msg = "Integer out of range. at position 0"
264+
else:
265+
msg = 'Unable to parse string "string" at position 1'
266+
267+
with pytest.raises(ValueError, match=msg):
268+
to_numeric(arr, **kwargs)
269+
else:
270+
result = to_numeric(arr, **kwargs)
271+
272+
exp_val = float(val) if (coercing and val_is_string) else val
273+
expected = [exp_val]
274+
275+
if multiple_elts:
276+
if coercing:
277+
expected.append(np.nan)
278+
exp_dtype = float
279+
else:
280+
expected.append(extra_elt)
281+
exp_dtype = object
282+
else:
283+
exp_dtype = float if isinstance(exp_val, (
284+
int, compat.long, float)) else object
285+
286+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
287+
288+
289+
def test_really_large_in_arr_consistent(large_val, signed,
290+
multiple_elts, errors):
291+
# see gh-24910
292+
#
293+
# Even if we discover that we have to hold float, that does not mean
294+
# we should be lenient on subsequent elements that fail to be integer.
295+
kwargs = dict(errors=errors) if errors is not None else dict()
296+
arr = [str(-large_val if signed else large_val)]
297+
298+
if multiple_elts:
299+
arr.insert(0, large_val)
300+
301+
if errors in (None, "raise"):
302+
index = int(multiple_elts)
303+
msg = "Integer out of range. at position {index}".format(index=index)
304+
305+
with pytest.raises(ValueError, match=msg):
306+
to_numeric(arr, **kwargs)
307+
else:
308+
result = to_numeric(arr, **kwargs)
309+
310+
if errors == "coerce":
311+
expected = [float(i) for i in arr]
312+
exp_dtype = float
313+
else:
314+
expected = arr
315+
exp_dtype = object
316+
317+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
191318

192319

193320
@pytest.mark.parametrize("errors,checker", [
@@ -205,15 +332,6 @@ def test_scalar_fail(errors, checker):
205332
assert checker(to_numeric(scalar, errors=errors))
206333

207334

208-
@pytest.fixture(params=[
209-
(lambda x: Index(x, name="idx"), tm.assert_index_equal),
210-
(lambda x: Series(x, name="ser"), tm.assert_series_equal),
211-
(lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal)
212-
])
213-
def transform_assert_equal(request):
214-
return request.param
215-
216-
217335
@pytest.mark.parametrize("data", [
218336
[1, 2, 3],
219337
[1., np.nan, 3, np.nan]

0 commit comments

Comments
 (0)