BUG: Convert uint64 in maybe_convert_numeric

gfyoung · gfyoung · commit e15620f71da0 · 2016-12-28T13:58:17.000-08:00
Add handling for uint64 elements in an array with the following behavior specifications: 1) If uint64 and NaN are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. 2) If uint64 and negative numbers are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. Closes gh-14982. Partial fix for gh-14983.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -319,5 +319,5 @@ Bug Fixes
 
 
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
-- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
+- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -944,26 +944,39 @@ def test_int64_overflow(self):
 00013007854817840017963235
 00013007854817840018860166"""
 
+        # 13007854817840016671868 > UINT64_MAX, so this
+        # will overflow and return object as the dtype.
         result = self.read_csv(StringIO(data))
         self.assertTrue(result['ID'].dtype == object)
 
-        self.assertRaises(OverflowError, self.read_csv,
-                          StringIO(data), converters={'ID': np.int64})
+        # 13007854817840016671868 > UINT64_MAX, so attempts
+        # to cast to either int64 or uint64 will result in
+        # an OverflowError being raised.
+        for conv in (np.int64, np.uint64):
+            self.assertRaises(OverflowError, self.read_csv,
+                              StringIO(data), converters={'ID': conv})
 
-        # Just inside int64 range: parse as integer
+        # These numbers fall right inside the int64 range,
+        # so they should be parsed as integers.
         i_max = np.iinfo(np.int64).max
         i_min = np.iinfo(np.int64).min
+
         for x in [i_max, i_min]:
             result = self.read_csv(StringIO(str(x)), header=None)
             expected = DataFrame([x])
             tm.assert_frame_equal(result, expected)
 
-        # Just outside int64 range: parse as string
+        # These numbers fall just outside the int64 range,
+        # so they should be parsed as string.
         too_big = i_max + 1
         too_small = i_min - 1
+
         for x in [too_big, too_small]:
             result = self.read_csv(StringIO(str(x)), header=None)
-            expected = DataFrame([str(x)])
+            if self.engine == 'python' and x == too_big:
+                expected = DataFrame([x])
+            else:
+                expected = DataFrame([str(x)])
             tm.assert_frame_equal(result, expected)
 
     def test_empty_with_nrows_chunksize(self):
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
 
 # core.common import for fast inference checks
 
-npy_int64_max = np.iinfo(np.int64).max
-
-
 cpdef bint is_float(object obj):
     return util.is_float_object(obj)
 
@@ -629,48 +626,155 @@ cdef extern from "parse_helper.h":
 
 cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
 cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
+cdef uint64_t iUINT64_MAX = <uint64_t> UINT64_MAX
 
 
-def maybe_convert_numeric(object[:] values, set na_values,
+def maybe_convert_numeric(ndarray[object] values, set na_values,
                           bint convert_empty=True, bint coerce_numeric=False):
     """
-    Type inference function-- convert strings to numeric (potentially) and
-    convert to proper dtype array
+    Convert object array to a numeric array if possible.
+
+    Parameters
+    ----------
+    values : ndarray
+        Array of object elements to convert.
+    na_values : set
+        Set of values that should be interpreted as NaN.
+    convert_empty : bool, default True
+        If an empty array-like object is encountered, whether to interpret
+        that element as NaN or not. If set to False, a ValueError will be
+        raised if such an element is encountered and 'coerce_numeric' is False.
+    coerce_numeric : bool, default False
+        If initial attempts to convert to numeric have failed, whether to
+        force conversion to numeric via alternative methods or by setting the
+        element to NaN. Otherwise, an Exception will be raised when such an
+        element is encountered.
+
+        This boolean also has an impact on how conversion behaves when a
+        numeric array has no suitable numerical dtype to return (i.e. uint64,
+        int32, uint8). If set to False, the original object array will be
+        returned. Otherwise, a ValueError will be raised.
+
+    Returns
+    -------
+    numeric_array : array of converted object values to numerical ones
     """
     cdef:
         int status, maybe_int
         Py_ssize_t i, n = values.size
         ndarray[float64_t] floats = np.empty(n, dtype='f8')
         ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
         ndarray[int64_t] ints = np.empty(n, dtype='i8')
+        ndarray[uint64_t] uints = np.empty(n, dtype='u8')
         ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+        bint seen_null = False
+        bint seen_uint = False
+        bint seen_sint = False
         bint seen_float = False
         bint seen_complex = False
         bint seen_int = False
         bint seen_bool = False
         object val
         float64_t fval
 
+    def check_uint64_nan():
+        """
+        Check whether we have encountered uint64 when handling a NaN element.
+
+        If uint64 has been encountered, we cannot safely cast to float64 due
+        to truncation problems (this would occur if we return a numeric array
+        containing a NaN element).
+
+        Returns
+        -------
+        return_values : bool
+            Whether or not we should return the original input array to avoid
+            data truncation.
+        """
+        if seen_null and seen_uint:
+            if not coerce_numeric:
+                return True
+            else:
+                raise ValueError("uint64 array detected, and such an "
+                                 "array cannot contain NaN.")
+
+        return False
+
+    def check_uint64_int64_conflict():
+        """
+        Check whether we have encountered both int64 and uint64 elements.
+
+        If both have been encountered, we cannot safely cast to an integer
+        dtype since none is large enough to hold both types of elements.
+
+        Returns
+        -------
+        return_values : bool
+            Whether or not we should return the original input array to avoid
+            data truncation.
+        """
+        if seen_sint and seen_uint:
+            if not coerce_numeric:
+                return True
+            else:
+                raise ValueError("uint64 and negative values detected. "
+                                 "Cannot safely return a numeric array "
+                                 "without truncating data.")
+
+        return False
+
     for i in range(n):
         val = values[i]
 
         if val.__hash__ is not None and val in na_values:
+            seen_null = True
+            if check_uint64_nan():
+                return values
+
             floats[i] = complexes[i] = nan
             seen_float = True
         elif util.is_float_object(val):
+            if val != val:
+                seen_null = True
+                if check_uint64_nan():
+                    return values
+
             floats[i] = complexes[i] = val
             seen_float = True
         elif util.is_integer_object(val):
-            floats[i] = ints[i] = val
+            floats[i] = complexes[i] = val
+            as_int = int(val)
             seen_int = True
+
+            seen_uint = seen_uint or (as_int > iINT64_MAX)
+            seen_sint = seen_sint or (as_int < 0)
+
+            if check_uint64_nan() or check_uint64_int64_conflict():
+                return values
+
+            if seen_uint:
+                uints[i] = as_int
+            elif seen_sint:
+                ints[i] = as_int
+            else:
+                uints[i] = as_int
+                ints[i] = as_int
         elif util.is_bool_object(val):
-            floats[i] = ints[i] = bools[i] = val
+            floats[i] = uints[i] = ints[i] = bools[i] = val
             seen_bool = True
         elif val is None:
+            seen_null = True
+            if check_uint64_nan():
+                return values
+
             floats[i] = complexes[i] = nan
             seen_float = True
         elif hasattr(val, '__len__') and len(val) == 0:
             if convert_empty or coerce_numeric:
+                seen_null = True
+                if check_uint64_nan():
+                    return values
+
                 floats[i] = complexes[i] = nan
                 seen_float = True
             else:
@@ -686,24 +790,55 @@ def maybe_convert_numeric(object[:] values, set na_values,
                 status = floatify(val, &fval, &maybe_int)
 
                 if fval in na_values:
+                    seen_null = True
+                    if check_uint64_nan():
+                        return values
+
                     floats[i] = complexes[i] = nan
                     seen_float = True
                 else:
+                    if fval != fval:
+                        seen_null = True
+                        if check_uint64_nan():
+                            return values
+
                     floats[i] = fval
 
-                if not seen_float:
-                    if maybe_int:
-                        as_int = int(val)
+                if maybe_int:
+                    as_int = int(val)
 
-                        if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
+                    if as_int in na_values:
+                        seen_float = True
+                        seen_null = True
+                    else:
+                        seen_uint = seen_uint or (as_int > iINT64_MAX)
+                        seen_sint = seen_sint or (as_int < 0)
+                        seen_int = True
+
+                    if check_uint64_nan() or check_uint64_int64_conflict():
+                        return values
+
+                    if not (seen_float or as_int in na_values):
+                        if as_int < iINT64_MIN or as_int > iUINT64_MAX:
+                            raise ValueError('Integer out of range.')
+
+                        if seen_uint:
+                            uints[i] = as_int
+                        elif seen_sint:
                             ints[i] = as_int
                         else:
-                            raise ValueError('integer out of range')
-                    else:
-                        seen_float = True
+                            uints[i] = as_int
+                            ints[i] = as_int
+                else:
+                    seen_float = True
             except (TypeError, ValueError) as e:
                 if not coerce_numeric:
                     raise type(e)(str(e) + ' at position {}'.format(i))
+                elif "uint64" in str(e):  # Exception from check functions.
+                    raise
+                seen_null = True
+                if check_uint64_nan():
+                    return values
 
                 floats[i] = nan
                 seen_float = True
@@ -713,9 +848,14 @@ def maybe_convert_numeric(object[:] values, set na_values,
     elif seen_float:
         return floats
     elif seen_int:
-        return ints
+        if seen_uint:
+            return uints
+        else:
+            return ints
     elif seen_bool:
         return bools.view(np.bool_)
+    elif seen_uint:
+        return uints
     return ints
 
 
@@ -810,7 +950,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
             floats[i] = <float64_t> val
             complexes[i] = <double complex> val
             if not seen_null:
-                seen_uint = seen_uint or (int(val) > npy_int64_max)
+                seen_uint = seen_uint or (int(val) > iINT64_MAX)
                 seen_sint = seen_sint or (val < 0)
 
                 if seen_uint and seen_sint:
diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py
@@ -255,6 +255,59 @@ def test_convert_non_hashable(self):
         result = lib.maybe_convert_numeric(arr, set(), False, True)
         tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
 
+    def test_convert_numeric_uint64(self):
+        arr = np.array([2**63], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+        arr = np.array([str(2**63)], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+        arr = np.array([np.uint64(2**63)], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
+
+    def test_convert_numeric_uint64_nan(self):
+        msg = 'uint64 array detected'
+        cases = [(np.array([2**63, np.nan], dtype=object), set()),
+                 (np.array([str(2**63), np.nan], dtype=object), set()),
+                 (np.array([np.nan, 2**63], dtype=object), set()),
+                 (np.array([np.nan, str(2**63)], dtype=object), set()),
+                 (np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
+                 (np.array([str(2**63), str(2**63 + 1)],
+                           dtype=object), set([2**63]))]
+
+        for coerce in (True, False):
+            for arr, na_values in cases:
+                if coerce:
+                    with tm.assertRaisesRegexp(ValueError, msg):
+                        lib.maybe_convert_numeric(arr, na_values,
+                                                  coerce_numeric=coerce)
+                else:
+                    tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
+                        arr, na_values), arr)
+
+    def test_convert_numeric_int64_uint64(self):
+        msg = 'uint64 and negative values detected'
+        cases = [np.array([2**63, -1], dtype=object),
+                 np.array([str(2**63), -1], dtype=object),
+                 np.array([str(2**63), str(-1)], dtype=object),
+                 np.array([-1, 2**63], dtype=object),
+                 np.array([-1, str(2**63)], dtype=object),
+                 np.array([str(-1), str(2**63)], dtype=object)]
+
+        for coerce in (True, False):
+            for case in cases:
+                if coerce:
+                    with tm.assertRaisesRegexp(ValueError, msg):
+                        print(case)
+                        lib.maybe_convert_numeric(case, set(),
+                                                  coerce_numeric=coerce)
+                else:
+                    tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
+                        case, set()), case)
+
     def test_maybe_convert_objects_uint64(self):
         # see gh-4471
         arr = np.array([2**63], dtype=object)