Skip to content

Commit 56b6921

Browse files
committed
BUG: Better handle larger numbers in to_numeric
* Warn about lossiness when passing really large numbers that exceed (u)int64 ranges.
* Coerce negative numbers to float when requested instead of crashing and returning object.
* Consistently parse numbers as integers / floats, even if we know that the resulting container has to be float. This is to ensure consistent error behavior when input numbers are too large.

Closes gh-24910.
1 parent 95f8dca commit 56b6921

File tree

4 files changed

+148
-15
lines changed

4 files changed

+148
-15
lines changed

doc/source/whatsnew/v0.25.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ Timezones
103103
Numeric
104104
^^^^^^^
105105

106+
- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
107+
- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
106108
-
107109
-
108110
-

pandas/_libs/lib.pyx

+15-10
Original file line numberDiff line numberDiff line change
@@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18281828
except (ValueError, OverflowError, TypeError):
18291829
pass
18301830

1831-
# otherwise, iterate and do full infererence
1831+
# Otherwise, iterate and do full inference.
18321832
cdef:
18331833
int status, maybe_int
18341834
Py_ssize_t i, n = values.size
@@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18651865
else:
18661866
seen.float_ = True
18671867

1868-
if val <= oINT64_MAX:
1868+
if oINT64_MIN <= val <= oINT64_MAX:
18691869
ints[i] = val
18701870

1871-
if seen.sint_ and seen.uint_:
1871+
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
18721872
seen.float_ = True
18731873

18741874
elif util.is_bool_object(val):
@@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
19101910
else:
19111911
seen.saw_int(as_int)
19121912

1913-
if not (seen.float_ or as_int in na_values):
1913+
if as_int not in na_values:
19141914
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
1915-
raise ValueError('Integer out of range.')
1915+
if seen.coerce_numeric:
1916+
seen.float_ = True
1917+
else:
1918+
raise ValueError("Integer out of range.")
1919+
else:
1920+
if as_int >= 0:
1921+
uints[i] = as_int
19161922

1917-
if as_int >= 0:
1918-
uints[i] = as_int
1919-
if as_int <= oINT64_MAX:
1920-
ints[i] = as_int
1923+
if as_int <= oINT64_MAX:
1924+
ints[i] = as_int
19211925

19221926
seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
19231927
else:
19241928
seen.float_ = True
19251929
except (TypeError, ValueError) as e:
19261930
if not seen.coerce_numeric:
1927-
raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
1931+
raise type(e)(str(e) + " at position {pos}".format(pos=i))
19281932
elif "uint64" in str(e): # Exception from check functions.
19291933
raise
1934+
19301935
seen.saw_null()
19311936
floats[i] = NaN
19321937

pandas/core/tools/numeric.py

+8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
1919
depending on the data supplied. Use the `downcast` parameter
2020
to obtain other dtypes.
2121
22+
Please note that precision loss may occur if really large numbers
23+
are passed in. Due to the internal limitations of `ndarray`, if
24+
numbers smaller than `-9223372036854775808` or larger than
25+
`18446744073709551615` are passed in, it is very likely they
26+
will be converted to float so that they can be stored in an `ndarray`.
27+
These warnings apply similarly to `Series` since it internally
28+
leverages `ndarray`.
29+
2230
Parameters
2331
----------
2432
arg : scalar, list, tuple, 1-d array, or Series

pandas/tests/tools/test_numeric.py

+123-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from numpy import iinfo
55
import pytest
66

7+
import pandas.compat as compat
8+
79
import pandas as pd
810
from pandas import DataFrame, Index, Series, to_numeric
911
from pandas.util import testing as tm
@@ -172,7 +174,11 @@ def test_all_nan():
172174
tm.assert_series_equal(result, expected)
173175

174176

175-
@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
177+
@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
178+
def errors(request):
179+
return request.param
180+
181+
176182
def test_type_check(errors):
177183
# see gh-11776
178184
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
@@ -183,11 +189,123 @@ def test_type_check(errors):
183189
to_numeric(df, **kwargs)
184190

185191

186-
@pytest.mark.parametrize("val", [
187-
1, 1.1, "1", "1.1", -1.5, "-1.5"
192+
@pytest.fixture(params=[True, False])
193+
def signed(request):
194+
return request.param
195+
196+
197+
@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
198+
def transform(request):
199+
return request.param
200+
201+
202+
@pytest.mark.parametrize("val", [1, 1.1, 20001])
203+
def test_scalar(val, signed, transform):
204+
val = -val if signed else val
205+
assert to_numeric(transform(val)) == float(val)
206+
207+
208+
@pytest.fixture(params=[
209+
47393996303418497800,
210+
100000000000000000000
188211
])
189-
def test_scalar(val):
190-
assert to_numeric(val) == float(val)
212+
def large_val(request):
213+
return request.param
214+
215+
216+
def test_really_large_scalar(large_val, signed, transform, errors):
217+
# see gh-24910
218+
kwargs = dict(errors=errors) if errors is not None else dict()
219+
val = -large_val if signed else large_val
220+
221+
val = transform(val)
222+
val_is_string = isinstance(val, str)
223+
224+
if val_is_string and errors in (None, "raise"):
225+
msg = "Integer out of range. at position 0"
226+
with pytest.raises(ValueError, match=msg):
227+
to_numeric(val, **kwargs)
228+
else:
229+
expected = float(val) if (errors == "coerce" and
230+
val_is_string) else val
231+
assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
232+
233+
234+
@pytest.fixture(params=[True, False])
235+
def multiple_elts(request):
236+
return request.param
237+
238+
239+
def test_really_large_in_arr(large_val, signed, transform,
240+
multiple_elts, errors):
241+
# see gh-24910
242+
kwargs = dict(errors=errors) if errors is not None else dict()
243+
val = -large_val if signed else large_val
244+
val = transform(val)
245+
246+
extra_elt = "string"
247+
arr = [val] + multiple_elts * [extra_elt]
248+
249+
val_is_string = isinstance(val, str)
250+
coercing = errors == "coerce"
251+
252+
if errors in (None, "raise") and (val_is_string or multiple_elts):
253+
if val_is_string:
254+
msg = "Integer out of range. at position 0"
255+
else:
256+
msg = 'Unable to parse string "string" at position 1'
257+
258+
with pytest.raises(ValueError, match=msg):
259+
to_numeric(arr, **kwargs)
260+
else:
261+
result = to_numeric(arr, **kwargs)
262+
263+
exp_val = float(val) if (coercing and val_is_string) else val
264+
expected = [exp_val]
265+
266+
if multiple_elts:
267+
if coercing:
268+
expected.append(np.nan)
269+
exp_dtype = float
270+
else:
271+
expected.append(extra_elt)
272+
exp_dtype = object
273+
else:
274+
exp_dtype = float if isinstance(exp_val, (
275+
int, compat.long, float)) else object
276+
277+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
278+
279+
280+
def test_really_large_in_arr_consistent(large_val, signed,
281+
multiple_elts, errors):
282+
# see gh-24910
283+
#
284+
# Even if we discover that we have to hold float, that does not mean
285+
# we should be lenient on subsequent elements that fail to be integer.
286+
kwargs = dict(errors=errors) if errors is not None else dict()
287+
arr = [str(-large_val if signed else large_val)]
288+
289+
if multiple_elts:
290+
arr.insert(0, large_val)
291+
292+
if errors in (None, "raise"):
293+
index = int(multiple_elts)
294+
msg = "Integer out of range. at position {index}".format(index=index)
295+
296+
with pytest.raises(ValueError, match=msg):
297+
to_numeric(arr, **kwargs)
298+
else:
299+
result = to_numeric(arr, **kwargs)
300+
301+
if errors == "coerce":
302+
expected = [float(i) for i in arr]
303+
exp_dtype = float
304+
else:
305+
expected = arr
306+
exp_dtype = object
307+
308+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
191309

192310

193311
@pytest.mark.parametrize("errors,checker", [

0 commit comments

Comments
 (0)