pandas-dev · jtratner · Sep 15, 2013 · Sep 15, 2013 · Sep 17, 2013
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -3325,11 +3325,6 @@ def form_blocks(arrays, names, axes):
             else:
                 datetime_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.integer):
-            if v.dtype == np.uint64:
-                # HACK #2355 definite overflow
-                if (v > 2 ** 63 - 1).any():
-                    object_items.append((i, k, v))
-                    continue
             int_items.append((i, k, v))
         elif v.dtype == np.bool_:
             bool_items.append((i, k, v))

diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -425,6 +425,31 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
     else:
         return ints
 
+def maybe_convert_uint64(ndarray[object] objects):
+    '''
+    Try to convert objects into an array of uint64.
+
+    Returns a new uint64 ndarray when every element is a non-negative
+    integer that fits in uint64; otherwise returns `objects` itself.
+    '''
+    cdef:
+        Py_ssize_t i, n
+        ndarray[uint64_t] uints
+        object val
+    n = len(objects)
+    uints = np.empty(n, dtype='uint64')
+    for i from 0 <= i < n:
+        val = objects[i]
+        if not util.is_integer_object(val) or val < 0:
+            return objects
+        try:
+            uints[i] = val
+        except OverflowError:
+            # too large even for uint64: fall back to the object array
+            return objects
+    return uints
+
+
 def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                           bint safe=0, bint convert_datetime=0):
     '''
@@ -460,61 +485,60 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     onan = np.nan
     fnan = np.nan
 
-    for i from 0 <= i < n:
-        val = objects[i]
-
-        if val is None:
-            seen_null = 1
-            floats[i] = complexes[i] = fnan
-        elif util.is_bool_object(val):
-            seen_bool = 1
-            bools[i] = val
-        elif util.is_float_object(val):
-            floats[i] = complexes[i] = val
-            seen_float = 1
-        elif util.is_datetime64_object(val):
-            if convert_datetime:
-                idatetimes[i] = convert_to_tsobject(val, None, None).value
-                seen_datetime = 1
-            else:
+    try:
+        for i from 0 <= i < n:
+            val = objects[i]
+
+            if val is None:
+                seen_null = 1
+                floats[i] = complexes[i] = fnan
+            elif util.is_bool_object(val):
+                seen_bool = 1
+                bools[i] = val
+            elif util.is_float_object(val):
+                floats[i] = complexes[i] = val
+                seen_float = 1
+            elif util.is_datetime64_object(val):
+                if convert_datetime:
+                    idatetimes[i] = convert_to_tsobject(val, None, None).value
+                    seen_datetime = 1
+                else:
+                    seen_object = 1
+                    # objects[i] = val.astype('O')
+                    break
+            elif util.is_timedelta64_object(val):
                 seen_object = 1
-                # objects[i] = val.astype('O')
                 break
-        elif util.is_timedelta64_object(val):
-            seen_object = 1
-            break
-        elif util.is_integer_object(val):
-            seen_int = 1
-            floats[i] = <float64_t> val
-            complexes[i] = <double complex> val
-            if not seen_null:
-                try:
+            elif util.is_integer_object(val):
+                seen_int = 1
+                floats[i] = <float64_t> val
+                complexes[i] = <double complex> val
+                if not seen_null:
                     ints[i] = val
-                except OverflowError:
+            elif util.is_complex_object(val):
+                complexes[i] = val
+                seen_complex = 1
+            elif PyDateTime_Check(val) or util.is_datetime64_object(val):
+                if convert_datetime:
+                    seen_datetime = 1
+                    idatetimes[i] = convert_to_tsobject(val, None, None).value
+                else:
+                    seen_object = 1
+                    break
+            elif try_float and not util.is_string_object(val):
+                # this will convert Decimal objects
+                try:
+                    floats[i] = float(val)
+                    complexes[i] = complex(val)
+                    seen_float = 1
+                except Exception:
                     seen_object = 1
                     break
-        elif util.is_complex_object(val):
-            complexes[i] = val
-            seen_complex = 1
-        elif PyDateTime_Check(val) or util.is_datetime64_object(val):
-            if convert_datetime:
-                seen_datetime = 1
-                idatetimes[i] = convert_to_tsobject(val, None, None).value
             else:
                 seen_object = 1
                 break
-        elif try_float and not util.is_string_object(val):
-            # this will convert Decimal objects
-            try:
-                floats[i] = float(val)
-                complexes[i] = complex(val)
-                seen_float = 1
-            except Exception:
-                seen_object = 1
-                break
-        else:
-            seen_object = 1
-            break
+    except OverflowError:
+        return maybe_convert_uint64(objects)
 
     seen_numeric = seen_complex or seen_float or seen_int
 

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -55,6 +55,7 @@
 import pandas.lib as lib
 
 from numpy.testing.decorators import slow
+from nose.tools import assert_equal
 
 def _skip_if_no_scipy():
     try:
@@ -79,13 +80,13 @@ def _check_mixed_float(df, dtype = None):
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
     if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
+        assert_equal(df.dtypes['A'], dtypes['A'])
     if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
+        assert_equal(df.dtypes['B'], dtypes['B'])
     if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
+        assert_equal(df.dtypes['C'], dtypes['C'])
     if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+        assert_equal(df.dtypes['D'], dtypes['D'])
 
 
 def _check_mixed_int(df, dtype = None):
@@ -95,13 +96,13 @@ def _check_mixed_int(df, dtype = None):
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
     if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
+        assert_equal(df.dtypes['A'], dtypes['A'])
     if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
+        assert_equal(df.dtypes['B'], dtypes['B'])
     if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
+        assert_equal(df.dtypes['C'], dtypes['C'])
     if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+        assert_equal(df.dtypes['D'], dtypes['D'])
 
 
 class CheckIndexing(object):
@@ -2225,17 +2226,17 @@ def test_constructor_overflow_int64(self):
                           dtype=np.uint64)
 
         result = DataFrame({'a': values})
-        self.assert_(result['a'].dtype == object)
+        self.assert_(result['a'].dtype == np.dtype('uint64'))
 
-        # #2355
+        # Now #2355 with #4845 fix.
         data_scores = [(6311132704823138710, 273), (2685045978526272070, 23),
                        (8921811264899370420, 45), (long(17019687244989530680), 270),
                        (long(9930107427299601010), 273)]
         dtype = [('uid', 'u8'), ('score', 'u8')]
         data = np.zeros((len(data_scores),), dtype=dtype)
         data[:] = data_scores
         df_crawls = DataFrame(data)
-        self.assert_(df_crawls['uid'].dtype == object)
+        self.assert_(df_crawls['uid'].dtype == np.dtype('uint64'))
 
     def test_is_mixed_type(self):
         self.assert_(not self.frame._is_mixed_type)
@@ -4437,7 +4438,7 @@ def test_arith_flex_frame(self):
                 # overflow in the uint
                 dtype = None
                 if op in ['sub']:
-                    dtype = dict(B = 'object', C = None)
+                    dtype = dict(B = 'uint64', C = None)
                 elif op in ['add','mul']:
                     dtype = dict(C = None)
                 assert_frame_equal(result, exp)
@@ -10346,7 +10347,7 @@ def test_constructor_with_convert(self):
 
         df = DataFrame({'A' : [2**63] })
         result = df['A']
-        expected = Series(np.asarray([2**63], np.object_))
+        expected = Series(np.asarray([2**63], np.uint64))
         assert_series_equal(result, expected)
 
         df = DataFrame({'A' : [datetime(2005, 1, 1), True] })

diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
@@ -0,0 +1,41 @@
+import unittest
+from datetime import datetime
+
+import pandas.lib as lib
+import numpy as np
+
+
+class TestLib(unittest.TestCase):
+    """Tests for ``lib.maybe_convert_objects`` with values above int64."""
+
+    def test_maybe_convert_objects_uint64(self):
+        # GH4471 - integers too large for int64 should come back as uint64
+        uint64 = np.dtype('uint64')
+        single = [2 ** 63 + 1]
+        mixed = [5, 2, 3, 4, 5, 1, 2, 3, 22, 1000, 2 ** 63 + 5,
+                 2 ** 63 + 1000]
+        for values in (single, mixed):
+            converted = lib.maybe_convert_objects(
+                np.array(values, dtype=object))
+            self.assertEqual(converted.dtype, uint64)
+            np.testing.assert_array_equal(
+                converted, np.array(values, dtype='uint64'))
+
+    def test_maybe_convert_objects_uint64_unconvertible(self):
+        # each case pairs a value above int64 with one uint64 cannot hold
+        big = 2 ** 63
+        unconvertible = [
+            [-5, big + 5, 3],                   # negative number
+            [big + 100, -3],                    # negative number
+            [datetime(2011, 5, 3), big + 2],    # datetime
+            [big + 5, 1+3j, 22],                # complex
+            [3.25, 1, 3, big + 4],              # float
+            [5, 2, big + 2, np.nan],            # nan
+            [np.nan, big + 2],                  # nan
+        ]
+        for values in unconvertible:
+            arr = np.array(values, dtype=object)
+            result = lib.maybe_convert_objects(arr.copy())
+            self.assertEqual(result.dtype, np.object_)
+            np.testing.assert_array_equal(result, arr)
+