From 1c11b7202d94337f8dee5f627e660be1f0b0fae0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 4 Dec 2017 02:06:42 -0800 Subject: [PATCH 1/2] BUG: Don't overflow in DataFrame init with uint For integers larger than what uint64 can handle, we gracefully default to the object dtype instead of overflowing. Closes gh-18584. --- doc/source/whatsnew/v0.22.0.txt | 3 +-- pandas/_libs/src/inference.pyx | 11 +++++++++-- pandas/tests/dtypes/test_inference.py | 6 ++++++ pandas/tests/frame/test_constructors.py | 10 ++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 495d0beaf3faa..3ca1417a108f5 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -186,7 +186,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- +- Bug in the :class:`DataFrame` constructor in which data containing very large positive numbers was causing ``OverflowError`` (:issue:`18584`) - Indexing @@ -262,4 +262,3 @@ Other - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) - Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) -- diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index cb192fcced318..75c892873573e 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -181,6 +181,13 @@ cdef class Seen(object): """ Set flags indicating that an integer value was encountered. + In addition to setting a flag that an integer was seen, we + also set two flags depending on the type of integer seen: + + 1) sint_ : a negative (signed) number was encountered + 2) uint_ : a positive number in the range of [2**63, 2**64) + was encountered + Parameters ---------- val : Python int @@ -188,7 +195,7 @@ cdef class Seen(object): """ self.int_ = 1 self.sint_ = self.sint_ or (val < 0) - self.uint_ = self.uint_ or (val > oINT64_MAX) + self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) @property def numeric_(self): @@ -1263,7 +1270,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen.null_: seen.saw_int(int(val)) - if seen.uint_ and seen.sint_: + if (seen.uint_ and seen.sint_) or val > oUINT64_MAX: seen.object_ = 1 break diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index ef12416ef4e1c..21bb099d09261 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -388,6 +388,12 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) + def test_convert_uint64_overflow(self): + # see gh-18584 + arr = np.array([2**64], dtype=object) + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(arr, result) + def test_maybe_convert_objects_uint64(self): # see gh-4471 arr = np.array([2**63], dtype=object) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 876e0ea7ea0b3..1409383829ac1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -195,6 +195,16 @@ def test_constructor_overflow_int64(self): df_crawls = DataFrame(data) assert df_crawls['uid'].dtype == np.uint64 + @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), + np.array([2**64]), [2**64]]) + def test_constructor_overflow_uint64(self, values): + # see gh-18584 + value = values[0] + result = DataFrame(values) + + assert result[0].dtype == object + assert result[0][0] == value + def test_constructor_ordereddict(self): import random nitems = 100 From 9d5abd3e80299a5ab39fe0bf7617ec6d00a3bd24 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 5 Dec 2017 01:52:37 -0800 Subject: [PATCH 2/2] Don't overflow in DataFrame init with int For integers smaller than what int64 can handle, we gracefully default to the object dtype instead of overflowing. --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/_libs/src/inference.pyx | 12 +++++++----- pandas/tests/dtypes/test_inference.py | 5 +++-- pandas/tests/frame/test_constructors.py | 6 ++++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 3ca1417a108f5..f2500bb29d0be 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -186,7 +186,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) -- Bug in the :class:`DataFrame` constructor in which data containing very large positive numbers was causing ``OverflowError`` (:issue:`18584`) +- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) - Indexing diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 75c892873573e..e15b4693432d9 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -184,9 +184,10 @@ cdef class Seen(object): In addition to setting a flag that an integer was seen, we also set two flags depending on the type of integer seen: - 1) sint_ : a negative (signed) number was encountered - 2) uint_ : a positive number in the range of [2**63, 2**64) - was encountered + 1) sint_ : a negative (signed) number in the + range of [-2**63, 0) was encountered + 2) uint_ : a positive number in the range of + [2**63, 2**64) was encountered Parameters ---------- @@ -194,7 +195,7 @@ cdef class Seen(object): Value with which to set the flags. """ self.int_ = 1 - self.sint_ = self.sint_ or (val < 0) + self.sint_ = self.sint_ or (oINT64_MIN <= val < 0) self.uint_ = self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) @property @@ -1270,7 +1271,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen.null_: seen.saw_int(int(val)) - if (seen.uint_ and seen.sint_) or val > oUINT64_MAX: + if ((seen.uint_ and seen.sint_) or + val > oUINT64_MAX or val < oINT64_MIN): seen.object_ = 1 break diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 21bb099d09261..092bbb36169d4 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -388,9 +388,10 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - def test_convert_uint64_overflow(self): + @pytest.mark.parametrize("value", [-2**63 - 1, 2**64]) + def test_convert_int_overflow(self, value): # see gh-18584 - arr = np.array([2**64], dtype=object) + arr = np.array([value], dtype=object) result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(arr, result) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1409383829ac1..8fd196bfc4d2a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -196,8 +196,10 @@ def test_constructor_overflow_int64(self): assert df_crawls['uid'].dtype == np.uint64 @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), - np.array([2**64]), [2**64]]) - def test_constructor_overflow_uint64(self, values): + np.array([2**65]), [2**64 + 1], + np.array([-2**63 - 4], dtype=object), + np.array([-2**64 - 1]), [-2**65 - 2]]) + def test_constructor_int_overflow(self, values): # see gh-18584 value = values[0] result = DataFrame(values)