Skip to content

BUG: Make lib.maybe_convert_objects work with uint64 #4845

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -3325,11 +3325,6 @@ def form_blocks(arrays, names, axes):
else:
datetime_items.append((i, k, v))
elif issubclass(v.dtype.type, np.integer):
if v.dtype == np.uint64:
# HACK #2355 definite overflow
if (v > 2 ** 63 - 1).any():
object_items.append((i, k, v))
continue
int_items.append((i, k, v))
elif v.dtype == np.bool_:
bool_items.append((i, k, v))
Expand Down
118 changes: 71 additions & 47 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,31 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
else:
return ints

def maybe_convert_uint64(ndarray[object] objects):
    '''
    Try to convert objects into an array of uint64

    Parameters
    ----------
    objects : ndarray[object]
        Candidate values, checked one element at a time.

    Returns
    -------
    ndarray
        A new uint64 array holding every value when all elements are
        non-negative integer objects that fit in 64 unsigned bits.
        Otherwise the original ``objects`` array is returned unchanged.
    '''
    cdef:
        Py_ssize_t i, n
        ndarray[uint64_t] uints
        object val

    n = len(objects)
    uints = np.empty(n, dtype='uint64')
    for i from 0 <= i < n:
        val = objects[i]
        # Anything that is not an integer object, or is negative, cannot be
        # represented as uint64: give the caller back its input untouched.
        if not util.is_integer_object(val) or val < 0:
            return objects
        try:
            uints[i] = val
        except OverflowError:
            # val is larger than 2 ** 64 - 1; fall back to the object array
            # instead of letting the overflow escape to the caller.
            return objects

    return uints


def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
bint safe=0, bint convert_datetime=0):
'''
Expand Down Expand Up @@ -460,61 +485,60 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
onan = np.nan
fnan = np.nan

for i from 0 <= i < n:
val = objects[i]

if val is None:
seen_null = 1
floats[i] = complexes[i] = fnan
elif util.is_bool_object(val):
seen_bool = 1
bools[i] = val
elif util.is_float_object(val):
floats[i] = complexes[i] = val
seen_float = 1
elif util.is_datetime64_object(val):
if convert_datetime:
idatetimes[i] = convert_to_tsobject(val, None, None).value
seen_datetime = 1
else:
try:
for i from 0 <= i < n:
val = objects[i]

if val is None:
seen_null = 1
floats[i] = complexes[i] = fnan
elif util.is_bool_object(val):
seen_bool = 1
bools[i] = val
elif util.is_float_object(val):
floats[i] = complexes[i] = val
seen_float = 1
elif util.is_datetime64_object(val):
if convert_datetime:
idatetimes[i] = convert_to_tsobject(val, None, None).value
seen_datetime = 1
else:
seen_object = 1
# objects[i] = val.astype('O')
break
elif util.is_timedelta64_object(val):
seen_object = 1
# objects[i] = val.astype('O')
break
elif util.is_timedelta64_object(val):
seen_object = 1
break
elif util.is_integer_object(val):
seen_int = 1
floats[i] = <float64_t> val
complexes[i] = <double complex> val
if not seen_null:
try:
elif util.is_integer_object(val):
seen_int = 1
floats[i] = <float64_t> val
complexes[i] = <double complex> val
if not seen_null:
ints[i] = val
except OverflowError:
elif util.is_complex_object(val):
complexes[i] = val
seen_complex = 1
elif PyDateTime_Check(val) or util.is_datetime64_object(val):
if convert_datetime:
seen_datetime = 1
idatetimes[i] = convert_to_tsobject(val, None, None).value
else:
seen_object = 1
break
elif try_float and not util.is_string_object(val):
# this will convert Decimal objects
try:
floats[i] = float(val)
complexes[i] = complex(val)
seen_float = 1
except Exception:
seen_object = 1
break
elif util.is_complex_object(val):
complexes[i] = val
seen_complex = 1
elif PyDateTime_Check(val) or util.is_datetime64_object(val):
if convert_datetime:
seen_datetime = 1
idatetimes[i] = convert_to_tsobject(val, None, None).value
else:
seen_object = 1
break
elif try_float and not util.is_string_object(val):
# this will convert Decimal objects
try:
floats[i] = float(val)
complexes[i] = complex(val)
seen_float = 1
except Exception:
seen_object = 1
break
else:
seen_object = 1
break
except OverflowError:
return maybe_convert_uint64(objects)

seen_numeric = seen_complex or seen_float or seen_int

Expand Down
27 changes: 14 additions & 13 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
import pandas.lib as lib

from numpy.testing.decorators import slow
from nose.tools import assert_equal

def _skip_if_no_scipy():
try:
Expand All @@ -79,13 +80,13 @@ def _check_mixed_float(df, dtype = None):
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
assert_equal(df.dtypes['A'], dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
assert_equal(df.dtypes['B'], dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
assert_equal(df.dtypes['C'], dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
assert_equal(df.dtypes['D'], dtypes['D'])


def _check_mixed_int(df, dtype = None):
Expand All @@ -95,13 +96,13 @@ def _check_mixed_int(df, dtype = None):
elif isinstance(dtype, dict):
dtypes.update(dtype)
if dtypes.get('A'):
assert(df.dtypes['A'] == dtypes['A'])
assert_equal(df.dtypes['A'], dtypes['A'])
if dtypes.get('B'):
assert(df.dtypes['B'] == dtypes['B'])
assert_equal(df.dtypes['B'], dtypes['B'])
if dtypes.get('C'):
assert(df.dtypes['C'] == dtypes['C'])
assert_equal(df.dtypes['C'], dtypes['C'])
if dtypes.get('D'):
assert(df.dtypes['D'] == dtypes['D'])
assert_equal(df.dtypes['D'], dtypes['D'])


class CheckIndexing(object):
Expand Down Expand Up @@ -2225,17 +2226,17 @@ def test_constructor_overflow_int64(self):
dtype=np.uint64)

result = DataFrame({'a': values})
self.assert_(result['a'].dtype == object)
self.assert_(result['a'].dtype == np.dtype('uint64'))

# #2355
# Now #2355 with #4845 fix.
data_scores = [(6311132704823138710, 273), (2685045978526272070, 23),
(8921811264899370420, 45), (long(17019687244989530680), 270),
(long(9930107427299601010), 273)]
dtype = [('uid', 'u8'), ('score', 'u8')]
data = np.zeros((len(data_scores),), dtype=dtype)
data[:] = data_scores
df_crawls = DataFrame(data)
self.assert_(df_crawls['uid'].dtype == object)
self.assert_(df_crawls['uid'].dtype == np.dtype('uint64'))

def test_is_mixed_type(self):
self.assert_(not self.frame._is_mixed_type)
Expand Down Expand Up @@ -4437,7 +4438,7 @@ def test_arith_flex_frame(self):
# overflow in the uint
dtype = None
if op in ['sub']:
dtype = dict(B = 'object', C = None)
dtype = dict(B = 'uint64', C = None)
elif op in ['add','mul']:
dtype = dict(C = None)
assert_frame_equal(result, exp)
Expand Down Expand Up @@ -10346,7 +10347,7 @@ def test_constructor_with_convert(self):

df = DataFrame({'A' : [2**63] })
result = df['A']
expected = Series(np.asarray([2**63], np.object_))
expected = Series(np.asarray([2**63], np.uint64))
assert_series_equal(result, expected)

df = DataFrame({'A' : [datetime(2005, 1, 1), True] })
Expand Down
41 changes: 41 additions & 0 deletions pandas/tests/test_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import unittest
from datetime import datetime

import pandas.lib as lib
import numpy as np


class TestLib(unittest.TestCase):
    # Exercises lib.maybe_convert_objects on integers too large for int64.

    def test_maybe_convert_objects_uint64(self):
        # GH4471 - array with objects too big for int64
        uint64_dtype = np.dtype('uint64')

        # A single value just above the int64 maximum.
        single = np.array([2 ** 63 + 1], dtype=object)
        converted = lib.maybe_convert_objects(single)
        wanted = np.array([2 ** 63 + 1], dtype='uint64')
        self.assertEqual(converted.dtype, uint64_dtype)
        np.testing.assert_array_equal(converted, wanted)

        # Small integers mixed with values above the int64 maximum.
        values = [5, 2, 3, 4, 5, 1, 2, 3, 22, 1000,
                  2 ** 63 + 5, 2 ** 63 + 1000]
        several = np.array(values, dtype=object)
        converted = lib.maybe_convert_objects(several)
        wanted = several.copy().astype('uint64')
        self.assertEqual(converted.dtype, uint64_dtype)
        np.testing.assert_array_equal(converted, wanted)

    def test_maybe_convert_objects_uint64_unconvertible(self):
        # Every case holds a value above the int64 maximum next to something
        # that rules out uint64, so the result must remain an object array
        # equal to the input.
        big = 2 ** 63
        unconvertible = [
            # can't convert because negative number
            [-5, big + 5, 3],
            [big + 100, -3],
            # can't convert because of datetime
            [datetime(2011, 5, 3), big + 2],
            # can't convert because of complex
            [big + 5, 1 + 3j, 22],
            # can't convert because of float
            [3.25, 1, 3, big + 4],
            # can't convert because of nan
            [5, 2, big + 2, np.nan],
            [np.nan, big + 2],
        ]
        for items in unconvertible:
            original = np.array(items, dtype=object)
            # Pass a copy so the comparison below uses an untouched array.
            outcome = lib.maybe_convert_objects(original.copy())
            self.assertEqual(outcome.dtype, np.object_)
            np.testing.assert_array_equal(outcome, original)