From 86cd6576d65193ebd96b3e562a45fe15ee220426 Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Sun, 15 Sep 2013 11:15:49 -0400 Subject: [PATCH 1/3] BUG: lib.maybe_convert_objects work with uint64 When it's greater than int64 max (and not negative, etc.) --- pandas/src/inference.pyx | 24 +++++++++++++++++++++--- pandas/tests/test_frame.py | 2 ++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index e0bbc1a4e64c1..30c20328eab7a 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -28,6 +28,9 @@ try: except AttributeError: pass +# I'm sure there's a better way to do this +cdef int64_t MAX_INT = np.iinfo(np.int64).max + def infer_dtype(object _values): cdef: Py_ssize_t i, n @@ -437,6 +440,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ndarray[int64_t] ints ndarray[uint8_t] bools ndarray[int64_t] idatetimes + ndarray[uint64_t] uints bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 @@ -445,6 +449,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_object = 0 bint seen_null = 0 bint seen_numeric = 0 + bint seen_uint = 0 + bint seen_negative = 0 object val, onan float64_t fval, fnan @@ -456,6 +462,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bools = np.empty(n, dtype=np.uint8) datetimes = np.empty(n, dtype='M8[ns]') idatetimes = datetimes.view(np.int64) + uints = np.empty(n, dtype='uint64') onan = np.nan fnan = np.nan @@ -491,8 +498,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, try: ints[i] = val except OverflowError: - seen_object = 1 - break + if val < 0 or seen_negative: + seen_object = 1 + break + else: + seen_uint = 1 + if val < 0: + seen_negative = 1 + else: + uints[i] = val elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -519,8 +533,12 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_numeric = 
seen_complex or seen_float or seen_int if not seen_object: + if seen_uint: + if not (seen_null or seen_bool or seen_complex or seen_float or + seen_negative): + return uints - if not safe: + elif not safe: if seen_null: if not seen_bool and not seen_datetime: if seen_complex: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d216cebc1abf3..aedcae64385ca 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10347,6 +10347,8 @@ def test_constructor_with_convert(self): df = DataFrame({'A' : [2**63] }) result = df['A'] expected = Series(np.asarray([2**63], np.object_)) + # this doesn't work because no block manager for uint + #expected = Series(np.asarray([2**63], np.uint64)) assert_series_equal(result, expected) df = DataFrame({'A' : [datetime(2005, 1, 1), True] }) From 4d646444d51b1cb1e8b8e13b00880af05931c9af Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Sun, 15 Sep 2013 11:34:02 -0400 Subject: [PATCH 2/3] TST: Add test cases for lib.maybe_convert_objects directly remove extraneous typecheck against uint64 better dtype checks in test_frame update tests to reflect actual use of uint64, etc --- pandas/core/internals.py | 5 ----- pandas/src/inference.pyx | 6 ++++-- pandas/tests/test_frame.py | 29 +++++++++++++-------------- pandas/tests/test_lib.py | 41 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 pandas/tests/test_lib.py diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11ce27b078b18..365a6c43f7fb4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3325,11 +3325,6 @@ def form_blocks(arrays, names, axes): else: datetime_items.append((i, k, v)) elif issubclass(v.dtype.type, np.integer): - if v.dtype == np.uint64: - # HACK #2355 definite overflow - if (v > 2 ** 63 - 1).any(): - object_items.append((i, k, v)) - continue int_items.append((i, k, v)) elif v.dtype == np.bool_: bool_items.append((i, k, v)) diff --git 
a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 30c20328eab7a..d8fde3a82030e 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -534,8 +534,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen_object: if seen_uint: - if not (seen_null or seen_bool or seen_complex or seen_float or - seen_negative): + uint_incompatible = (seen_object or seen_null or seen_bool or + seen_complex or seen_float or seen_negative or + seen_datetime) + if not uint_incompatible: return uints elif not safe: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aedcae64385ca..94d383b386ecf 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -55,6 +55,7 @@ import pandas.lib as lib from numpy.testing.decorators import slow +from nose.tools import assert_equal def _skip_if_no_scipy(): try: @@ -79,13 +80,13 @@ def _check_mixed_float(df, dtype = None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) + assert_equal(df.dtypes['A'], dtypes['A']) if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) + assert_equal(df.dtypes['B'], dtypes['B']) if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) + assert_equal(df.dtypes['C'], dtypes['C']) if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + assert_equal(df.dtypes['D'], dtypes['D']) def _check_mixed_int(df, dtype = None): @@ -95,13 +96,13 @@ def _check_mixed_int(df, dtype = None): elif isinstance(dtype, dict): dtypes.update(dtype) if dtypes.get('A'): - assert(df.dtypes['A'] == dtypes['A']) + assert_equal(df.dtypes['A'], dtypes['A']) if dtypes.get('B'): - assert(df.dtypes['B'] == dtypes['B']) + assert_equal(df.dtypes['B'], dtypes['B']) if dtypes.get('C'): - assert(df.dtypes['C'] == dtypes['C']) + assert_equal(df.dtypes['C'], dtypes['C']) if dtypes.get('D'): - assert(df.dtypes['D'] == dtypes['D']) + assert_equal(df.dtypes['D'], dtypes['D']) class 
CheckIndexing(object): @@ -2225,9 +2226,9 @@ def test_constructor_overflow_int64(self): dtype=np.uint64) result = DataFrame({'a': values}) - self.assert_(result['a'].dtype == object) + self.assert_(result['a'].dtype == np.dtype('uint64')) - # #2355 + # Now #2355 with #4845 fix. data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), (8921811264899370420, 45), (long(17019687244989530680), 270), (long(9930107427299601010), 273)] @@ -2235,7 +2236,7 @@ def test_constructor_overflow_int64(self): data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) - self.assert_(df_crawls['uid'].dtype == object) + self.assert_(df_crawls['uid'].dtype == np.dtype('uint64')) def test_is_mixed_type(self): self.assert_(not self.frame._is_mixed_type) @@ -4437,7 +4438,7 @@ def test_arith_flex_frame(self): # overflow in the uint dtype = None if op in ['sub']: - dtype = dict(B = 'object', C = None) + dtype = dict(B = 'uint64', C = None) elif op in ['add','mul']: dtype = dict(C = None) assert_frame_equal(result, exp) @@ -10346,9 +10347,7 @@ def test_constructor_with_convert(self): df = DataFrame({'A' : [2**63] }) result = df['A'] - expected = Series(np.asarray([2**63], np.object_)) - # this doesn't work because no block manager for uint - #expected = Series(np.asarray([2**63], np.uint64)) + expected = Series(np.asarray([2**63], np.uint64)) assert_series_equal(result, expected) df = DataFrame({'A' : [datetime(2005, 1, 1), True] }) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py new file mode 100644 index 0000000000000..f7ea224096e72 --- /dev/null +++ b/pandas/tests/test_lib.py @@ -0,0 +1,41 @@ +import unittest +from datetime import datetime + +import pandas.lib as lib +import numpy as np + + +class TestLib(unittest.TestCase): + def test_maybe_convert_objects_uint64(self): + # GH4471 - array with objects too big for int64 + arr = np.array([2 ** 63 + 1], dtype=object) + result = lib.maybe_convert_objects(arr) + expected = 
np.array([2 ** 63 + 1], dtype='uint64') + self.assertEqual(result.dtype, np.dtype('uint64')) + np.testing.assert_array_equal(result, expected) + + arr2 = np.array([5, 2, 3, 4, 5, 1, 2, 3, 22, 1000, 2**63 + 5, + 2 ** 63 + 1000], dtype=object) + result = lib.maybe_convert_objects(arr2) + expected = arr2.copy().astype('uint64') + self.assertEqual(result.dtype, np.dtype('uint64')) + np.testing.assert_array_equal(result, expected) + + def test_maybe_convert_objects_uint64_unconvertible(self): + # can't convert because negative number + neg = np.array([-5, 2 ** 63 + 5, 3], dtype=object) + neg2 = np.array([2 ** 63 + 100, -3], dtype=object) + # can't convert because of datetime + dt = np.array([datetime(2011, 5, 3), 2 ** 63 + 2], dtype=object) + # can't convert because of complex + cmplx = np.array([2 ** 63 + 5, 1+3j, 22], dtype=object) + # can't convert b/c of float + flt = np.array([3.25, 1, 3, 2 ** 63 +4], dtype=object) + # can't convert b/c of nan + null = np.array([5, 2, 2 ** 63 + 2, np.nan], dtype=object) + null2 = np.array([np.nan, 2 ** 63 + 2], dtype=object) + for arr in (neg, neg2, dt, cmplx, flt, null, null2): + result = lib.maybe_convert_objects(arr.copy()) + self.assertEqual(result.dtype, np.object_) + np.testing.assert_array_equal(result, arr) + From 672fccd84de8d29859668b7b3f859b84aac372aa Mon Sep 17 00:00:00 2001 From: Jeffrey Tratner Date: Tue, 17 Sep 2013 19:18:23 -0400 Subject: [PATCH 3/3] Try using a separate uint64 function --- pandas/src/inference.pyx | 144 ++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 70 deletions(-) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index d8fde3a82030e..093b5cc6325f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -28,9 +28,6 @@ try: except AttributeError: pass -# I'm sure there's a better way to do this -cdef int64_t MAX_INT = np.iinfo(np.int64).max - def infer_dtype(object _values): cdef: Py_ssize_t i, n @@ -428,6 +425,31 @@ def 
maybe_convert_numeric(ndarray[object] values, set na_values, else: return ints +def maybe_convert_uint64(ndarray[object] objects): + ''' + Try to convert objects into an array of uint64 + ''' + cdef: + Py_ssize_t i, n + ndarray[uint64_t] uints + bint cant_convert = 0 + object val + n = len(objects) + uints = np.empty(n, dtype='uint64') + for i from 0 <= i < n: + val = objects[i] + if not util.is_integer_object(val) or val < 0: + cant_convert = 1 + break + else: + uints[i] = val + + if cant_convert: + return objects + else: + return uints + + def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint safe=0, bint convert_datetime=0): ''' @@ -440,7 +462,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ndarray[int64_t] ints ndarray[uint8_t] bools ndarray[int64_t] idatetimes - ndarray[uint64_t] uints bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 @@ -449,8 +470,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_object = 0 bint seen_null = 0 bint seen_numeric = 0 - bint seen_uint = 0 - bint seen_negative = 0 object val, onan float64_t fval, fnan @@ -462,85 +481,70 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bools = np.empty(n, dtype=np.uint8) datetimes = np.empty(n, dtype='M8[ns]') idatetimes = datetimes.view(np.int64) - uints = np.empty(n, dtype='uint64') onan = np.nan fnan = np.nan - for i from 0 <= i < n: - val = objects[i] - - if val is None: - seen_null = 1 - floats[i] = complexes[i] = fnan - elif util.is_bool_object(val): - seen_bool = 1 - bools[i] = val - elif util.is_float_object(val): - floats[i] = complexes[i] = val - seen_float = 1 - elif util.is_datetime64_object(val): - if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None, None).value - seen_datetime = 1 - else: + try: + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + floats[i] = complexes[i] = fnan + elif util.is_bool_object(val): + seen_bool 
= 1 + bools[i] = val + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = 1 + elif util.is_datetime64_object(val): + if convert_datetime: + idatetimes[i] = convert_to_tsobject(val, None, None).value + seen_datetime = 1 + else: + seen_object = 1 + # objects[i] = val.astype('O') + break + elif util.is_timedelta64_object(val): seen_object = 1 - # objects[i] = val.astype('O') break - elif util.is_timedelta64_object(val): - seen_object = 1 - break - elif util.is_integer_object(val): - seen_int = 1 - floats[i] = val - complexes[i] = val - if not seen_null: - try: + elif util.is_integer_object(val): + seen_int = 1 + floats[i] = val + complexes[i] = val + if not seen_null: ints[i] = val - except OverflowError: - if val < 0 or seen_negative: - seen_object = 1 - break - else: - seen_uint = 1 - if val < 0: - seen_negative = 1 + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 + elif PyDateTime_Check(val) or util.is_datetime64_object(val): + if convert_datetime: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val, None, None).value else: - uints[i] = val - elif util.is_complex_object(val): - complexes[i] = val - seen_complex = 1 - elif PyDateTime_Check(val) or util.is_datetime64_object(val): - if convert_datetime: - seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None).value + seen_object = 1 + break + elif try_float and not util.is_string_object(val): + # this will convert Decimal objects + try: + floats[i] = float(val) + complexes[i] = complex(val) + seen_float = 1 + except Exception: + seen_object = 1 + break else: seen_object = 1 break - elif try_float and not util.is_string_object(val): - # this will convert Decimal objects - try: - floats[i] = float(val) - complexes[i] = complex(val) - seen_float = 1 - except Exception: - seen_object = 1 - break - else: - seen_object = 1 - break + except OverflowError: + return maybe_convert_uint64(objects) seen_numeric = seen_complex or seen_float or 
seen_int if not seen_object: - if seen_uint: - uint_incompatible = (seen_object or seen_null or seen_bool or - seen_complex or seen_float or seen_negative or - seen_datetime) - if not uint_incompatible: - return uints - - elif not safe: + + if not safe: if seen_null: if not seen_bool and not seen_datetime: if seen_complex: