diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 495d0beaf3faa..f2500bb29d0be 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -186,7 +186,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- +- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Indexing @@ -262,4 +262,3 @@ Other - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) -- diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index cb192fcced318..e15b4693432d9 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -181,14 +181,22 @@ cdef class Seen(object): """ Set flags indicating that an integer value was encountered. + In addition to setting a flag that an integer was seen, we + also set two flags depending on the type of integer seen: + + 1) sint_ : a negative (signed) number in the + range of [-2**63, 0) was encountered + 2) uint_ : a positive number in the range of + [2**63, 2**64) was encountered + Parameters ---------- val : Python int Value with which to set the flags. """ self.int_ = 1 - self.sint_ = self.sint_ or (val < 0) - self.uint_ = self.uint_ or (val > oINT64_MAX) + self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) + self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) @property def numeric_(self): @@ -1263,7 +1271,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen.null_: seen.saw_int(int(val)) - if seen.uint_ and seen.sint_: + if ((seen.uint_ and seen.sint_) or + val > oUINT64_MAX or val < oINT64_MIN): seen.object_ = 1 break diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ef12416ef4e1c..092bbb36169d4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -388,6 +388,13 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) + @pytest.mark.parametrize("value", [-2**63 - 1, 2**64]) + def test_convert_int_overflow(self, value): + # see gh-18584 + arr = np.array([value], dtype=object) + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(arr, result) + def test_maybe_convert_objects_uint64(self): # see gh-4471 arr = np.array([2**63], dtype=object) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 876e0ea7ea0b3..8fd196bfc4d2a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -195,6 +195,18 @@ def test_constructor_overflow_int64(self): df_crawls = DataFrame(data) assert df_crawls['uid'].dtype == np.uint64 + @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), + np.array([2**65]), [2**64 + 1], + np.array([-2**63 - 4], dtype=object), + np.array([-2**64 - 1]), [-2**65 - 2]]) + def test_constructor_int_overflow(self, values): + # see gh-18584 + value = values[0] + result = DataFrame(values) + + assert result[0].dtype == object + assert result[0][0] == value + def test_constructor_ordereddict(self): import random nitems = 100