Skip to content

Commit 6b6cfb8

Browse files
gfyoung authored and jorisvandenbossche committed
BUG: Don't overflow in DataFrame init (#18624)
For integers larger than what uint64 can handle, we gracefully default to the object dtype instead of overflowing. For integers smaller than what int64 can handle, we gracefully default to the object dtype instead of overflowing. Closes gh-18584.
1 parent c3c04e2 commit 6b6cfb8

File tree

4 files changed

+32
-5
lines changed

4 files changed

+32
-5
lines changed

doc/source/whatsnew/v0.22.0.txt

+1 -2
Original file line number | Diff line number | Diff line change
@@ -186,7 +186,7 @@ Conversion
186186
^^^^^^^^^^
187187

188188
- Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`)
189-
-
189+
- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`)
190190
-
191191

192192
Indexing
@@ -262,4 +262,3 @@ Other
262262
- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
263263
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
264264
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
265-
-

pandas/_libs/src/inference.pyx

+12 -3
Original file line number | Diff line number | Diff line change
@@ -181,14 +181,22 @@ cdef class Seen(object):
181181
"""
182182
Set flags indicating that an integer value was encountered.
183183
184+
In addition to setting a flag that an integer was seen, we
185+
also set two flags depending on the type of integer seen:
186+
187+
1) sint_ : a negative (signed) number in the
188+
range of [-2**63, 0) was encountered
189+
2) uint_ : a positive number in the range of
190+
[2**63, 2**64) was encountered
191+
184192
Parameters
185193
----------
186194
val : Python int
187195
Value with which to set the flags.
188196
"""
189197
self.int_ = 1
190-
self.sint_ = self.sint_ or (val < 0)
191-
self.uint_ = self.uint_ or (val > oINT64_MAX)
198+
self.sint_ = self.sint_ or (oINT64_MIN <= val < 0)
199+
self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX)
192200

193201
@property
194202
def numeric_(self):
@@ -1263,7 +1271,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
12631271
if not seen.null_:
12641272
seen.saw_int(int(val))
12651273

1266-
if seen.uint_ and seen.sint_:
1274+
if ((seen.uint_ and seen.sint_) or
1275+
val > oUINT64_MAX or val < oINT64_MIN):
12671276
seen.object_ = 1
12681277
break
12691278

pandas/tests/dtypes/test_inference.py

+7
Original file line number | Diff line number | Diff line change
@@ -388,6 +388,13 @@ def test_convert_numeric_int64_uint64(self, case, coerce):
388388
result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
389389
tm.assert_almost_equal(result, expected)
390390

391+
@pytest.mark.parametrize("value", [-2**63 - 1, 2**64])
392+
def test_convert_int_overflow(self, value):
393+
# see gh-18584
394+
arr = np.array([value], dtype=object)
395+
result = lib.maybe_convert_objects(arr)
396+
tm.assert_numpy_array_equal(arr, result)
397+
391398
def test_maybe_convert_objects_uint64(self):
392399
# see gh-4471
393400
arr = np.array([2**63], dtype=object)

pandas/tests/frame/test_constructors.py

+12
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,18 @@ def test_constructor_overflow_int64(self):
195195
df_crawls = DataFrame(data)
196196
assert df_crawls['uid'].dtype == np.uint64
197197

198+
@pytest.mark.parametrize("values", [np.array([2**64], dtype=object),
199+
np.array([2**65]), [2**64 + 1],
200+
np.array([-2**63 - 4], dtype=object),
201+
np.array([-2**64 - 1]), [-2**65 - 2]])
202+
def test_constructor_int_overflow(self, values):
203+
# see gh-18584
204+
value = values[0]
205+
result = DataFrame(values)
206+
207+
assert result[0].dtype == object
208+
assert result[0][0] == value
209+
198210
def test_constructor_ordereddict(self):
199211
import random
200212
nitems = 100

0 commit comments

Comments
 (0)