Skip to content

Commit a83b51b

Browse files
committed
BUG: Convert uint64 in maybe_convert_numeric
Add handling for uint64 elements in an array with the following behavior specifications: 1) If uint64 and NaN are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. 2) If uint64 and negative numbers are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. Closes gh-14982. Partial fix for gh-14983.
1 parent 3e3434b commit a83b51b

File tree

4 files changed

+231
-23
lines changed

4 files changed

+231
-23
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -319,5 +319,5 @@ Bug Fixes
319319

320320

321321
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
322-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
322+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
323323
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)

pandas/io/tests/parser/common.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -944,26 +944,39 @@ def test_int64_overflow(self):
944944
00013007854817840017963235
945945
00013007854817840018860166"""
946946

947+
# 13007854817840016671868 > UINT64_MAX, so this
948+
# will overflow and return object as the dtype.
947949
result = self.read_csv(StringIO(data))
948950
self.assertTrue(result['ID'].dtype == object)
949951

950-
self.assertRaises(OverflowError, self.read_csv,
951-
StringIO(data), converters={'ID': np.int64})
952+
# 13007854817840016671868 > UINT64_MAX, so attempts
953+
# to cast to either int64 or uint64 will result in
954+
# an OverflowError being raised.
955+
for conv in (np.int64, np.uint64):
956+
self.assertRaises(OverflowError, self.read_csv,
957+
StringIO(data), converters={'ID': conv})
952958

953-
# Just inside int64 range: parse as integer
959+
# These numbers fall right inside the int64 range,
960+
# so they should be parsed as integer.
954961
i_max = np.iinfo(np.int64).max
955962
i_min = np.iinfo(np.int64).min
963+
956964
for x in [i_max, i_min]:
957965
result = self.read_csv(StringIO(str(x)), header=None)
958966
expected = DataFrame([x])
959967
tm.assert_frame_equal(result, expected)
960968

961-
# Just outside int64 range: parse as string
969+
# These numbers fall just outside the int64 range,
970+
# so they should be parsed as string.
962971
too_big = i_max + 1
963972
too_small = i_min - 1
973+
964974
for x in [too_big, too_small]:
965975
result = self.read_csv(StringIO(str(x)), header=None)
966-
expected = DataFrame([str(x)])
976+
if self.engine == 'python' and x == too_big:
977+
expected = DataFrame([x])
978+
else:
979+
expected = DataFrame([str(x)])
967980
tm.assert_frame_equal(result, expected)
968981

969982
def test_empty_with_nrows_chunksize(self):

pandas/src/inference.pyx

+159-17
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
1313

1414
# core.common import for fast inference checks
1515

16-
npy_int64_max = np.iinfo(np.int64).max
17-
18-
1916
cpdef bint is_float(object obj):
2017
return util.is_float_object(obj)
2118

@@ -629,48 +626,157 @@ cdef extern from "parse_helper.h":
629626

630627
cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
631628
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
629+
cdef uint64_t iUINT64_MAX = <uint64_t> UINT64_MAX
632630

633631

634-
def maybe_convert_numeric(object[:] values, set na_values,
632+
def maybe_convert_numeric(ndarray[object] values, set na_values,
635633
bint convert_empty=True, bint coerce_numeric=False):
636634
"""
637-
Type inference function-- convert strings to numeric (potentially) and
638-
convert to proper dtype array
635+
Convert object array to a numeric array if possible.
636+
637+
Parameters
638+
----------
639+
values : ndarray
640+
Array of object elements to convert.
641+
na_values : set
642+
Set of values that should be interpreted as NaN.
643+
convert_empty : bool, default True
644+
If an empty array-like object is encountered, whether to interpret
645+
that element as NaN or not. If set to False, a ValueError will be
646+
raised if such an element is encountered and 'coerce_numeric' is False.
647+
coerce_numeric : bool, default False
648+
If initial attempts to convert to numeric have failed, whether to
649+
force conversion to numeric via alternative methods or by setting the
650+
element to NaN. Otherwise, an Exception will be raised when such an
651+
element is encountered.
652+
653+
This boolean also has an impact on how conversion behaves when a
654+
numeric array has no suitable numerical dtype to return (i.e. uint64,
655+
int32, uint8). If set to False, the original object array will be
656+
returned. Otherwise, a ValueError will be raised.
657+
658+
Returns
659+
-------
660+
numeric_array : array of converted object values to numerical ones
639661
"""
640662
cdef:
641663
int status, maybe_int
642664
Py_ssize_t i, n = values.size
643665
ndarray[float64_t] floats = np.empty(n, dtype='f8')
644666
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
645667
ndarray[int64_t] ints = np.empty(n, dtype='i8')
668+
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
646669
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
670+
bint seen_null = False
671+
bint seen_uint = False
672+
bint seen_sint = False
647673
bint seen_float = False
648674
bint seen_complex = False
649675
bint seen_int = False
650676
bint seen_bool = False
651677
object val
652678
float64_t fval
653679

680+
681+
def check_uint64_nan():
682+
"""
683+
Check whether we have encountered uint64 when handling a NaN element.
684+
685+
If uint64 has been encountered, we cannot safely cast to float64 due
686+
to truncation problems (this would occur if we return a numeric array
687+
containing a NaN element).
688+
689+
Returns
690+
-------
691+
return_values : bool
692+
Whether or not we should return the original input array to avoid
693+
data truncation.
694+
"""
695+
if seen_null and seen_uint:
696+
if not coerce_numeric:
697+
return True
698+
else:
699+
raise ValueError("uint64 array detected, and such an "
700+
"array cannot contain NaN.")
701+
702+
return False
703+
704+
705+
def check_uint64_int64_conflict():
706+
"""
707+
Check whether we have encountered both int64 and uint64 elements.
708+
709+
If both have been encountered, we cannot safely cast to an integer
710+
dtype since none is large enough to hold both types of elements.
711+
712+
Returns
713+
-------
714+
return_values : bool
715+
Whether or not we should return the original input array to avoid
716+
data truncation.
717+
"""
718+
if seen_sint and seen_uint:
719+
if not coerce_numeric:
720+
return True
721+
else:
722+
raise ValueError("uint64 and negative values detected. "
723+
"Cannot safely return a numeric array "
724+
"without truncating data.")
725+
726+
return False
727+
654728
for i in range(n):
655729
val = values[i]
656730

657731
if val.__hash__ is not None and val in na_values:
732+
seen_null = True
733+
if check_uint64_nan():
734+
return values
735+
658736
floats[i] = complexes[i] = nan
659737
seen_float = True
660738
elif util.is_float_object(val):
739+
if val != val:
740+
seen_null = True
741+
if check_uint64_nan():
742+
return values
743+
661744
floats[i] = complexes[i] = val
662745
seen_float = True
663746
elif util.is_integer_object(val):
664-
floats[i] = ints[i] = val
747+
floats[i] = complexes[i] = val
748+
as_int = int(val)
665749
seen_int = True
750+
751+
seen_uint = seen_uint or (as_int > iINT64_MAX)
752+
seen_sint = seen_sint or (as_int < 0)
753+
754+
if check_uint64_nan() or check_uint64_int64_conflict():
755+
return values
756+
757+
if seen_uint:
758+
uints[i] = as_int
759+
elif seen_sint:
760+
ints[i] = as_int
761+
else:
762+
uints[i] = as_int
763+
ints[i] = as_int
666764
elif util.is_bool_object(val):
667-
floats[i] = ints[i] = bools[i] = val
765+
floats[i] = uints[i] = ints[i] = bools[i] = val
668766
seen_bool = True
669767
elif val is None:
768+
seen_null = True
769+
if check_uint64_nan():
770+
return values
771+
670772
floats[i] = complexes[i] = nan
671773
seen_float = True
672774
elif hasattr(val, '__len__') and len(val) == 0:
673775
if convert_empty or coerce_numeric:
776+
seen_null = True
777+
if check_uint64_nan():
778+
return values
779+
674780
floats[i] = complexes[i] = nan
675781
seen_float = True
676782
else:
@@ -686,24 +792,55 @@ def maybe_convert_numeric(object[:] values, set na_values,
686792
status = floatify(val, &fval, &maybe_int)
687793

688794
if fval in na_values:
795+
seen_null = True
796+
if check_uint64_nan():
797+
return values
798+
689799
floats[i] = complexes[i] = nan
690800
seen_float = True
691801
else:
802+
if fval != fval:
803+
seen_null = True
804+
if check_uint64_nan():
805+
return values
806+
692807
floats[i] = fval
693808

694-
if not seen_float:
695-
if maybe_int:
696-
as_int = int(val)
809+
if maybe_int:
810+
as_int = int(val)
697811

698-
if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
812+
if as_int in na_values:
813+
seen_float = True
814+
seen_null = True
815+
else:
816+
seen_uint = seen_uint or (as_int > iINT64_MAX)
817+
seen_sint = seen_sint or (as_int < 0)
818+
seen_int = True
819+
820+
if check_uint64_nan() or check_uint64_int64_conflict():
821+
return values
822+
823+
if not (seen_float or as_int in na_values):
824+
if as_int < iINT64_MIN or as_int > iUINT64_MAX:
825+
raise ValueError('Integer out of range.')
826+
827+
if seen_uint:
828+
uints[i] = as_int
829+
elif seen_sint:
699830
ints[i] = as_int
700831
else:
701-
raise ValueError('integer out of range')
702-
else:
703-
seen_float = True
832+
uints[i] = as_int
833+
ints[i] = as_int
834+
else:
835+
seen_float = True
704836
except (TypeError, ValueError) as e:
705837
if not coerce_numeric:
706838
raise type(e)(str(e) + ' at position {}'.format(i))
839+
elif "uint64" in str(e): # Exception from check functions.
840+
raise
841+
seen_null = True
842+
if check_uint64_nan():
843+
return values
707844

708845
floats[i] = nan
709846
seen_float = True
@@ -713,9 +850,14 @@ def maybe_convert_numeric(object[:] values, set na_values,
713850
elif seen_float:
714851
return floats
715852
elif seen_int:
716-
return ints
853+
if seen_uint:
854+
return uints
855+
else:
856+
return ints
717857
elif seen_bool:
718858
return bools.view(np.bool_)
859+
elif seen_uint:
860+
return uints
719861
return ints
720862

721863

@@ -810,7 +952,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
810952
floats[i] = <float64_t> val
811953
complexes[i] = <double complex> val
812954
if not seen_null:
813-
seen_uint = seen_uint or (int(val) > npy_int64_max)
955+
seen_uint = seen_uint or (int(val) > iINT64_MAX)
814956
seen_sint = seen_sint or (val < 0)
815957

816958
if seen_uint and seen_sint:

pandas/tests/types/test_inference.py

+53
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,59 @@ def test_convert_non_hashable(self):
255255
result = lib.maybe_convert_numeric(arr, set(), False, True)
256256
tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
257257

258+
def test_convert_numeric_uint64(self):
259+
arr = np.array([2**63], dtype=object)
260+
exp = np.array([2**63], dtype=np.uint64)
261+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
262+
263+
arr = np.array([str(2**63)], dtype=object)
264+
exp = np.array([2**63], dtype=np.uint64)
265+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
266+
267+
arr = np.array([np.uint64(2**63)], dtype=object)
268+
exp = np.array([2**63], dtype=np.uint64)
269+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
270+
271+
def test_convert_numeric_uint64_nan(self):
272+
msg = 'uint64 array detected'
273+
cases = [(np.array([2**63, np.nan], dtype=object), set()),
274+
(np.array([str(2**63), np.nan], dtype=object), set()),
275+
(np.array([np.nan, 2**63], dtype=object), set()),
276+
(np.array([np.nan, str(2**63)], dtype=object), set()),
277+
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
278+
(np.array([str(2**63), str(2**63 + 1)],
279+
dtype=object), set([2**63]))]
280+
281+
for coerce in (True, False):
282+
for arr, na_values in cases:
283+
if coerce:
284+
with tm.assertRaisesRegexp(ValueError, msg):
285+
lib.maybe_convert_numeric(arr, na_values,
286+
coerce_numeric=coerce)
287+
else:
288+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
289+
arr, na_values), arr)
290+
291+
def test_convert_numeric_int64_uint64(self):
292+
msg = 'uint64 and negative values detected'
293+
cases = [np.array([2**63, -1], dtype=object),
294+
np.array([str(2**63), -1], dtype=object),
295+
np.array([str(2**63), str(-1)], dtype=object),
296+
np.array([-1, 2**63], dtype=object),
297+
np.array([-1, str(2**63)], dtype=object),
298+
np.array([str(-1), str(2**63)], dtype=object)]
299+
300+
for coerce in (True, False):
301+
for case in cases:
302+
if coerce:
303+
with tm.assertRaisesRegexp(ValueError, msg):
304+
print(case)
305+
lib.maybe_convert_numeric(case, set(),
306+
coerce_numeric=coerce)
307+
else:
308+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
309+
case, set()), case)
310+
258311
def test_maybe_convert_objects_uint64(self):
259312
# see gh-4471
260313
arr = np.array([2**63], dtype=object)

0 commit comments

Comments
 (0)