diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11ce27b078b18..365a6c43f7fb4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3325,11 +3325,6 @@ def form_blocks(arrays, names, axes): else: datetime_items.append((i, k, v)) elif issubclass(v.dtype.type, np.integer): - if v.dtype == np.uint64: - # HACK #2355 definite overflow - if (v > 2 ** 63 - 1).any(): - object_items.append((i, k, v)) - continue int_items.append((i, k, v)) elif v.dtype == np.bool_: bool_items.append((i, k, v)) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index e0bbc1a4e64c1..093b5cc6325f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -425,6 +425,31 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, else: return ints +def maybe_convert_uint64(ndarray[object] objects): + ''' + Try to convert objects into an array of uint64 + ''' + cdef: + Py_ssize_t i, n + ndarray[uint64_t] uints + bint cant_convert = 0 + object val + n = len(objects) + uints = np.empty(n, dtype='uint64') + for i from 0 <= i < n: + val = objects[i] + if not util.is_integer_object(val) or val < 0: + cant_convert = 1 + break + else: + uints[i] = val + + if cant_convert: + return objects + else: + return uints + + def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint safe=0, bint convert_datetime=0): ''' @@ -460,61 +485,60 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, onan = np.nan fnan = np.nan - for i from 0 <= i < n: - val = objects[i] - - if val is None: - seen_null = 1 - floats[i] = complexes[i] = fnan - elif util.is_bool_object(val): - seen_bool = 1 - bools[i] = val - elif util.is_float_object(val): - floats[i] = complexes[i] = val - seen_float = 1 - elif util.is_datetime64_object(val): - if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None, None).value - seen_datetime = 1 - else: + try: + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + floats[i] = complexes[i] = fnan + elif util.is_bool_object(val): + seen_bool = 1 + bools[i] = val + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = 1 + elif util.is_datetime64_object(val): + if convert_datetime: + idatetimes[i] = convert_to_tsobject(val, None, None).value + seen_datetime = 1 + else: + seen_object = 1 + # objects[i] = val.astype('O') + break + elif util.is_timedelta64_object(val): seen_object = 1 - # objects[i] = val.astype('O') break - elif util.is_timedelta64_object(val): - seen_object = 1 - break - elif util.is_integer_object(val): - seen_int = 1 - floats[i] = val - complexes[i] = val - if not seen_null: - try: + elif util.is_integer_object(val): + seen_int = 1 + floats[i] = val + complexes[i] = val + if not seen_null: ints[i] = val - except OverflowError: + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 + elif PyDateTime_Check(val) or util.is_datetime64_object(val): + if convert_datetime: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None).value + else: + seen_object = 1 + break + elif try_float and not util.is_string_object(val): + # this will convert Decimal objects + try: + floats[i] = float(val) + complexes[i] = complex(val) + seen_float = 1 + except Exception: seen_object = 1 break - elif util.is_complex_object(val): - complexes[i] = val - seen_complex = 1 - elif PyDateTime_Check(val) or util.is_datetime64_object(val): - if convert_datetime: - seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None).value else: seen_object = 1 break - elif try_float and not util.is_string_object(val): - # this will convert Decimal objects - try: - floats[i] = float(val) - complexes[i] = complex(val) - seen_float = 1 - except Exception: - seen_object = 1 - break - else: - seen_object = 1 - break + except OverflowError: + return maybe_convert_uint64(objects) seen_numeric = seen_complex or seen_float or seen_int diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d216cebc1abf3..94d383b386ecf 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -55,6 +55,7 @@ import pandas.lib as lib from numpy.testing.decorators import slow +from nose.tools import assert_equal def _skip_if_no_scipy(): try: @@ -79,13 +80,13 @@ def _check_mixed_float(df, dtype = None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) + assert_equal(df.dtypes['A'], dtypes['A']) if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) + assert_equal(df.dtypes['B'], dtypes['B']) if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) + assert_equal(df.dtypes['C'], dtypes['C']) if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + assert_equal(df.dtypes['D'], dtypes['D']) def _check_mixed_int(df, dtype = None): @@ -95,13 +96,13 @@ def _check_mixed_int(df, dtype = None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) + assert_equal(df.dtypes['A'], dtypes['A']) if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) + assert_equal(df.dtypes['B'], dtypes['B']) if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) + assert_equal(df.dtypes['C'], dtypes['C']) if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + assert_equal(df.dtypes['D'], dtypes['D']) class CheckIndexing(object): @@ -2225,9 +2226,9 @@ def test_constructor_overflow_int64(self): dtype=np.uint64) result = DataFrame({'a': values}) - self.assert_(result['a'].dtype == object) + self.assert_(result['a'].dtype == np.dtype('uint64')) - # #2355 + # Now #2355 with #4845 fix. data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), (8921811264899370420, 45), (long(17019687244989530680), 270), (long(9930107427299601010), 273)] @@ -2235,7 +2236,7 @@ def test_constructor_overflow_int64(self): data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) - self.assert_(df_crawls['uid'].dtype == object) + self.assert_(df_crawls['uid'].dtype == np.dtype('uint64')) def test_is_mixed_type(self): self.assert_(not self.frame._is_mixed_type) @@ -4437,7 +4438,7 @@ def test_arith_flex_frame(self): # overflow in the uint dtype = None if op in ['sub']: - dtype = dict(B = 'object', C = None) + dtype = dict(B = 'uint64', C = None) elif op in ['add','mul']: dtype = dict(C = None) assert_frame_equal(result, exp) @@ -10346,7 +10347,7 @@ def test_constructor_with_convert(self): df = DataFrame({'A' : [2**63] }) result = df['A'] - expected = Series(np.asarray([2**63], np.object_)) + expected = Series(np.asarray([2**63], np.uint64)) assert_series_equal(result, expected) df = DataFrame({'A' : [datetime(2005, 1, 1), True] }) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py new file mode 100644 index 0000000000000..f7ea224096e72 --- /dev/null +++ b/pandas/tests/test_lib.py @@ -0,0 +1,41 @@ +import unittest +from datetime import datetime + +import pandas.lib as lib +import numpy as np + + +class TestLib(unittest.TestCase): + def test_maybe_convert_objects_uint64(self): + # GH4471 - array with objects too big for int64 + arr = np.array([2 ** 63 + 1], dtype=object) + result = lib.maybe_convert_objects(arr) + expected = np.array([2 ** 63 + 1], dtype='uint64') + self.assertEqual(result.dtype, np.dtype('uint64')) + np.testing.assert_array_equal(result, expected) + + arr2 = np.array([5, 2, 3, 4, 5, 1, 2, 3, 22, 1000, 2**63 + 5, + 2 ** 63 + 1000], dtype=object) + result = lib.maybe_convert_objects(arr2) + expected = arr2.copy().astype('uint64') + self.assertEqual(result.dtype, np.dtype('uint64')) + np.testing.assert_array_equal(result, expected) + + def test_maybe_convert_objects_uint64_unconvertible(self): + # can't convert because negative number + neg = np.array([-5, 2 ** 63 + 5, 3], dtype=object) + neg2 = np.array([2 ** 63 + 100, -3], dtype=object) + # can't convert because of datetime + dt = np.array([datetime(2011, 5, 3), 2 ** 63 + 2], dtype=object) + # can't convert because of complex + cmplx = np.array([2 ** 63 + 5, 1+3j, 22], dtype=object) + # can't convert b/c of float + flt = np.array([3.25, 1, 3, 2 ** 63 +4], dtype=object) + # can't convert b/c of nan + null = np.array([5, 2, 2 ** 63 + 2, np.nan], dtype=object) + null2 = np.array([np.nan, 2 ** 63 + 2], dtype=object) + for arr in (neg, neg2, dt, cmplx, flt, null, null2): + result = lib.maybe_convert_objects(arr.copy()) + self.assertEqual(result.dtype, np.object_) + np.testing.assert_array_equal(result, arr) +