From 86cd6576d65193ebd96b3e562a45fe15ee220426 Mon Sep 17 00:00:00 2001
From: Jeffrey Tratner <jeffrey.tratner@gmail.com>
Date: Sun, 15 Sep 2013 11:15:49 -0400
Subject: [PATCH 1/3] BUG: lib.maybe_convert_objects work with uint64

When a value is greater than the int64 max (and no value is negative, etc.)
---
 pandas/src/inference.pyx   | 24 +++++++++++++++++++++---
 pandas/tests/test_frame.py |  2 ++
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index e0bbc1a4e64c1..30c20328eab7a 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -28,6 +28,9 @@ try:
 except AttributeError:
     pass
 
+# I'm sure there's a better way to do this
+cdef int64_t MAX_INT = np.iinfo(np.int64).max
+
 def infer_dtype(object _values):
     cdef:
         Py_ssize_t i, n
@@ -437,6 +440,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         ndarray[int64_t] ints
         ndarray[uint8_t] bools
         ndarray[int64_t] idatetimes
+        ndarray[uint64_t] uints
         bint seen_float = 0
         bint seen_complex = 0
         bint seen_datetime = 0
@@ -445,6 +449,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         bint seen_object = 0
         bint seen_null = 0
         bint seen_numeric = 0
+        bint seen_uint = 0
+        bint seen_negative = 0
         object val, onan
         float64_t fval, fnan
 
@@ -456,6 +462,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     bools = np.empty(n, dtype=np.uint8)
     datetimes = np.empty(n, dtype='M8[ns]')
     idatetimes = datetimes.view(np.int64)
+    uints = np.empty(n, dtype='uint64')
 
     onan = np.nan
     fnan = np.nan
@@ -491,8 +498,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                 try:
                     ints[i] = val
                 except OverflowError:
-                    seen_object = 1
-                    break
+                    if val < 0 or seen_negative:
+                        seen_object = 1
+                        break
+                    else:
+                        seen_uint = 1
+                if val < 0:
+                    seen_negative = 1
+                else:
+                    uints[i] = <uint64_t> val
         elif util.is_complex_object(val):
             complexes[i] = val
             seen_complex = 1
@@ -519,8 +533,12 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     seen_numeric = seen_complex or seen_float or seen_int
 
     if not seen_object:
+        if seen_uint:
+            if not (seen_null or seen_bool or seen_complex or seen_float or
+                    seen_negative):
+                return uints
 
-        if not safe:
+        elif not safe:
             if seen_null:
                 if not seen_bool and not seen_datetime:
                     if seen_complex:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index d216cebc1abf3..aedcae64385ca 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -10347,6 +10347,8 @@ def test_constructor_with_convert(self):
         df = DataFrame({'A' : [2**63] })
         result = df['A']
         expected = Series(np.asarray([2**63], np.object_))
+        # this doesn't work because no block manager for uint
+        #expected = Series(np.asarray([2**63], np.uint64))
         assert_series_equal(result, expected)
 
         df = DataFrame({'A' : [datetime(2005, 1, 1), True] })

From 4d646444d51b1cb1e8b8e13b00880af05931c9af Mon Sep 17 00:00:00 2001
From: Jeffrey Tratner <jeffrey.tratner@gmail.com>
Date: Sun, 15 Sep 2013 11:34:02 -0400
Subject: [PATCH 2/3] TST: Add test cases for lib.maybe_convert_objects
 directly

Remove the extraneous uint64 overflow check from form_blocks.

Use assert_equal for more informative dtype checks in test_frame.

Update tests to reflect that uint64 is now actually used, etc.
---
 pandas/core/internals.py   |  5 -----
 pandas/src/inference.pyx   |  6 ++++--
 pandas/tests/test_frame.py | 29 +++++++++++++--------------
 pandas/tests/test_lib.py   | 41 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 22 deletions(-)
 create mode 100644 pandas/tests/test_lib.py

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 11ce27b078b18..365a6c43f7fb4 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3325,11 +3325,6 @@ def form_blocks(arrays, names, axes):
             else:
                 datetime_items.append((i, k, v))
         elif issubclass(v.dtype.type, np.integer):
-            if v.dtype == np.uint64:
-                # HACK #2355 definite overflow
-                if (v > 2 ** 63 - 1).any():
-                    object_items.append((i, k, v))
-                    continue
             int_items.append((i, k, v))
         elif v.dtype == np.bool_:
             bool_items.append((i, k, v))
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index 30c20328eab7a..d8fde3a82030e 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -534,8 +534,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
 
     if not seen_object:
         if seen_uint:
-            if not (seen_null or seen_bool or seen_complex or seen_float or
-                    seen_negative):
+            uint_incompatible = (seen_object or seen_null or seen_bool or
+                                 seen_complex or seen_float or seen_negative or
+                                 seen_datetime)
+            if not uint_incompatible:
                 return uints
 
         elif not safe:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index aedcae64385ca..94d383b386ecf 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -55,6 +55,7 @@
 import pandas.lib as lib
 
 from numpy.testing.decorators import slow
+from nose.tools import assert_equal
 
 def _skip_if_no_scipy():
     try:
@@ -79,13 +80,13 @@ def _check_mixed_float(df, dtype = None):
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
     if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
+        assert_equal(df.dtypes['A'], dtypes['A'])
     if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
+        assert_equal(df.dtypes['B'], dtypes['B'])
     if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
+        assert_equal(df.dtypes['C'], dtypes['C'])
     if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+        assert_equal(df.dtypes['D'], dtypes['D'])
 
 
 def _check_mixed_int(df, dtype = None):
@@ -95,13 +96,13 @@ def _check_mixed_int(df, dtype = None):
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
     if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
+        assert_equal(df.dtypes['A'], dtypes['A'])
     if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
+        assert_equal(df.dtypes['B'], dtypes['B'])
     if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
+        assert_equal(df.dtypes['C'], dtypes['C'])
     if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+        assert_equal(df.dtypes['D'], dtypes['D'])
 
 
 class CheckIndexing(object):
@@ -2225,9 +2226,9 @@ def test_constructor_overflow_int64(self):
                           dtype=np.uint64)
 
         result = DataFrame({'a': values})
-        self.assert_(result['a'].dtype == object)
+        self.assert_(result['a'].dtype == np.dtype('uint64'))
 
-        # #2355
+        # Now #2355 with #4845 fix.
         data_scores = [(6311132704823138710, 273), (2685045978526272070, 23),
                        (8921811264899370420, 45), (long(17019687244989530680), 270),
                        (long(9930107427299601010), 273)]
@@ -2235,7 +2236,7 @@ def test_constructor_overflow_int64(self):
         data = np.zeros((len(data_scores),), dtype=dtype)
         data[:] = data_scores
         df_crawls = DataFrame(data)
-        self.assert_(df_crawls['uid'].dtype == object)
+        self.assert_(df_crawls['uid'].dtype == np.dtype('uint64'))
 
     def test_is_mixed_type(self):
         self.assert_(not self.frame._is_mixed_type)
@@ -4437,7 +4438,7 @@ def test_arith_flex_frame(self):
                 # overflow in the uint
                 dtype = None
                 if op in ['sub']:
-                    dtype = dict(B = 'object', C = None)
+                    dtype = dict(B = 'uint64', C = None)
                 elif op in ['add','mul']:
                     dtype = dict(C = None)
                 assert_frame_equal(result, exp)
@@ -10346,9 +10347,7 @@ def test_constructor_with_convert(self):
 
         df = DataFrame({'A' : [2**63] })
         result = df['A']
-        expected = Series(np.asarray([2**63], np.object_))
-        # this doesn't work because no block manager for uint
-        #expected = Series(np.asarray([2**63], np.uint64))
+        expected = Series(np.asarray([2**63], np.uint64))
         assert_series_equal(result, expected)
 
         df = DataFrame({'A' : [datetime(2005, 1, 1), True] })
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
new file mode 100644
index 0000000000000..f7ea224096e72
--- /dev/null
+++ b/pandas/tests/test_lib.py
@@ -0,0 +1,41 @@
+import unittest
+from datetime import datetime
+
+import pandas.lib as lib
+import numpy as np
+
+
+class TestLib(unittest.TestCase):
+    def test_maybe_convert_objects_uint64(self):
+        # GH4471 - array with objects too big for int64
+        arr = np.array([2 ** 63 + 1], dtype=object)
+        result = lib.maybe_convert_objects(arr)
+        expected = np.array([2 ** 63 + 1], dtype='uint64')
+        self.assertEqual(result.dtype, np.dtype('uint64'))
+        np.testing.assert_array_equal(result, expected)
+
+        arr2 = np.array([5, 2, 3, 4, 5, 1, 2, 3, 22, 1000, 2**63 + 5,
+                         2 ** 63 + 1000], dtype=object)
+        result = lib.maybe_convert_objects(arr2)
+        expected = arr2.copy().astype('uint64')
+        self.assertEqual(result.dtype, np.dtype('uint64'))
+        np.testing.assert_array_equal(result, expected)
+
+    def test_maybe_convert_objects_uint64_unconvertible(self):
+        # can't convert because negative number
+        neg = np.array([-5, 2 ** 63 + 5, 3], dtype=object)
+        neg2 = np.array([2 ** 63 + 100, -3], dtype=object)
+        # can't convert because of datetime
+        dt = np.array([datetime(2011, 5, 3), 2 ** 63 + 2], dtype=object)
+        # can't convert because of complex
+        cmplx = np.array([2 ** 63 + 5, 1+3j, 22], dtype=object)
+        # can't convert b/c of float
+        flt = np.array([3.25, 1, 3, 2 ** 63 +4], dtype=object)
+        # can't convert b/c of nan
+        null = np.array([5, 2, 2 ** 63 + 2, np.nan], dtype=object)
+        null2 = np.array([np.nan, 2 ** 63 + 2], dtype=object)
+        for arr in (neg, neg2, dt, cmplx, flt, null, null2):
+            result = lib.maybe_convert_objects(arr.copy())
+            self.assertEqual(result.dtype, np.object_)
+            np.testing.assert_array_equal(result, arr)
+

From 672fccd84de8d29859668b7b3f859b84aac372aa Mon Sep 17 00:00:00 2001
From: Jeffrey Tratner <jeffrey.tratner@gmail.com>
Date: Tue, 17 Sep 2013 19:18:23 -0400
Subject: [PATCH 3/3] Try using a separate uint64 function

---
 pandas/src/inference.pyx | 144 ++++++++++++++++++++-------------------
 1 file changed, 74 insertions(+), 70 deletions(-)

diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index d8fde3a82030e..093b5cc6325f1 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -28,9 +28,6 @@ try:
 except AttributeError:
     pass
 
-# I'm sure there's a better way to do this
-cdef int64_t MAX_INT = np.iinfo(np.int64).max
-
 def infer_dtype(object _values):
     cdef:
         Py_ssize_t i, n
@@ -428,6 +425,31 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
     else:
         return ints
 
+def maybe_convert_uint64(ndarray[object] objects):
+    '''
+    Try to convert objects into an array of uint64
+    '''
+    cdef:
+        Py_ssize_t i, n
+        ndarray[uint64_t] uints
+        bint cant_convert = 0
+        object val
+    n = len(objects)
+    uints = np.empty(n, dtype='uint64')
+    for i from 0 <= i < n:
+        val = objects[i]
+        if not util.is_integer_object(val) or val < 0:
+            cant_convert = 1
+            break
+        else:
+            uints[i] = val
+
+    if cant_convert:
+        return objects
+    else:
+        return uints
+
+
 def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
                           bint safe=0, bint convert_datetime=0):
     '''
@@ -440,7 +462,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         ndarray[int64_t] ints
         ndarray[uint8_t] bools
         ndarray[int64_t] idatetimes
-        ndarray[uint64_t] uints
         bint seen_float = 0
         bint seen_complex = 0
         bint seen_datetime = 0
@@ -449,8 +470,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
         bint seen_object = 0
         bint seen_null = 0
         bint seen_numeric = 0
-        bint seen_uint = 0
-        bint seen_negative = 0
         object val, onan
         float64_t fval, fnan
 
@@ -462,85 +481,70 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     bools = np.empty(n, dtype=np.uint8)
     datetimes = np.empty(n, dtype='M8[ns]')
     idatetimes = datetimes.view(np.int64)
-    uints = np.empty(n, dtype='uint64')
 
     onan = np.nan
     fnan = np.nan
 
-    for i from 0 <= i < n:
-        val = objects[i]
-
-        if val is None:
-            seen_null = 1
-            floats[i] = complexes[i] = fnan
-        elif util.is_bool_object(val):
-            seen_bool = 1
-            bools[i] = val
-        elif util.is_float_object(val):
-            floats[i] = complexes[i] = val
-            seen_float = 1
-        elif util.is_datetime64_object(val):
-            if convert_datetime:
-                idatetimes[i] = convert_to_tsobject(val, None, None).value
-                seen_datetime = 1
-            else:
+    try:
+        for i from 0 <= i < n:
+            val = objects[i]
+
+            if val is None:
+                seen_null = 1
+                floats[i] = complexes[i] = fnan
+            elif util.is_bool_object(val):
+                seen_bool = 1
+                bools[i] = val
+            elif util.is_float_object(val):
+                floats[i] = complexes[i] = val
+                seen_float = 1
+            elif util.is_datetime64_object(val):
+                if convert_datetime:
+                    idatetimes[i] = convert_to_tsobject(val, None, None).value
+                    seen_datetime = 1
+                else:
+                    seen_object = 1
+                    # objects[i] = val.astype('O')
+                    break
+            elif util.is_timedelta64_object(val):
                 seen_object = 1
-                # objects[i] = val.astype('O')
                 break
-        elif util.is_timedelta64_object(val):
-            seen_object = 1
-            break
-        elif util.is_integer_object(val):
-            seen_int = 1
-            floats[i] = <float64_t> val
-            complexes[i] = <double complex> val
-            if not seen_null:
-                try:
+            elif util.is_integer_object(val):
+                seen_int = 1
+                floats[i] = <float64_t> val
+                complexes[i] = <double complex> val
+                if not seen_null:
                     ints[i] = val
-                except OverflowError:
-                    if val < 0 or seen_negative:
-                        seen_object = 1
-                        break
-                    else:
-                        seen_uint = 1
-                if val < 0:
-                    seen_negative = 1
+            elif util.is_complex_object(val):
+                complexes[i] = val
+                seen_complex = 1
+            elif PyDateTime_Check(val) or util.is_datetime64_object(val):
+                if convert_datetime:
+                    seen_datetime = 1
+                    idatetimes[i] = convert_to_tsobject(val, None, None).value
                 else:
-                    uints[i] = <uint64_t> val
-        elif util.is_complex_object(val):
-            complexes[i] = val
-            seen_complex = 1
-        elif PyDateTime_Check(val) or util.is_datetime64_object(val):
-            if convert_datetime:
-                seen_datetime = 1
-                idatetimes[i] = convert_to_tsobject(val, None, None).value
+                    seen_object = 1
+                    break
+            elif try_float and not util.is_string_object(val):
+                # this will convert Decimal objects
+                try:
+                    floats[i] = float(val)
+                    complexes[i] = complex(val)
+                    seen_float = 1
+                except Exception:
+                    seen_object = 1
+                    break
             else:
                 seen_object = 1
                 break
-        elif try_float and not util.is_string_object(val):
-            # this will convert Decimal objects
-            try:
-                floats[i] = float(val)
-                complexes[i] = complex(val)
-                seen_float = 1
-            except Exception:
-                seen_object = 1
-                break
-        else:
-            seen_object = 1
-            break
+    except OverflowError:
+        return maybe_convert_uint64(objects)
 
     seen_numeric = seen_complex or seen_float or seen_int
 
     if not seen_object:
-        if seen_uint:
-            uint_incompatible = (seen_object or seen_null or seen_bool or
-                                 seen_complex or seen_float or seen_negative or
-                                 seen_datetime)
-            if not uint_incompatible:
-                return uints
-
-        elif not safe:
+
+        if not safe:
             if seen_null:
                 if not seen_bool and not seen_datetime:
                     if seen_complex: