Skip to content

Commit e15620f

Browse files
committed
BUG: Convert uint64 in maybe_convert_numeric
Add handling for uint64 elements in an array with the following behavior specifications: 1) If uint64 and NaN are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. 2) If uint64 and negative numbers are both detected, the original input will be returned if coerce_numeric is False. Otherwise, an Exception is raised. Closes gh-14982. Partial fix for gh-14983.
1 parent 3e3434b commit e15620f

File tree

4 files changed

+229
-23
lines changed

4 files changed

+229
-23
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -319,5 +319,5 @@ Bug Fixes
319319

320320

321321
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
322-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
322+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
323323
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)

pandas/io/tests/parser/common.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -944,26 +944,39 @@ def test_int64_overflow(self):
944944
00013007854817840017963235
945945
00013007854817840018860166"""
946946

947+
# 13007854817840016671868 > UINT64_MAX, so this
948+
# will overflow and return object as the dtype.
947949
result = self.read_csv(StringIO(data))
948950
self.assertTrue(result['ID'].dtype == object)
949951

950-
self.assertRaises(OverflowError, self.read_csv,
951-
StringIO(data), converters={'ID': np.int64})
952+
# 13007854817840016671868 > UINT64_MAX, so attempts
953+
# to cast to either int64 or uint64 will result in
954+
# an OverflowError being raised.
955+
for conv in (np.int64, np.uint64):
956+
self.assertRaises(OverflowError, self.read_csv,
957+
StringIO(data), converters={'ID': conv})
952958

953-
# Just inside int64 range: parse as integer
959+
# These numbers fall right inside the int64 range,
960+
# so they should be parsed as integer.
954961
i_max = np.iinfo(np.int64).max
955962
i_min = np.iinfo(np.int64).min
963+
956964
for x in [i_max, i_min]:
957965
result = self.read_csv(StringIO(str(x)), header=None)
958966
expected = DataFrame([x])
959967
tm.assert_frame_equal(result, expected)
960968

961-
# Just outside int64 range: parse as string
969+
# These numbers fall just outside the int64 range,
970+
# so they should be parsed as string.
962971
too_big = i_max + 1
963972
too_small = i_min - 1
973+
964974
for x in [too_big, too_small]:
965975
result = self.read_csv(StringIO(str(x)), header=None)
966-
expected = DataFrame([str(x)])
976+
if self.engine == 'python' and x == too_big:
977+
expected = DataFrame([x])
978+
else:
979+
expected = DataFrame([str(x)])
967980
tm.assert_frame_equal(result, expected)
968981

969982
def test_empty_with_nrows_chunksize(self):

pandas/src/inference.pyx

+157-17
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
1313

1414
# core.common import for fast inference checks
1515

16-
npy_int64_max = np.iinfo(np.int64).max
17-
18-
1916
cpdef bint is_float(object obj):
2017
return util.is_float_object(obj)
2118

@@ -629,48 +626,155 @@ cdef extern from "parse_helper.h":
629626

630627
cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
631628
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
629+
cdef uint64_t iUINT64_MAX = <uint64_t> UINT64_MAX
632630

633631

634-
def maybe_convert_numeric(object[:] values, set na_values,
632+
def maybe_convert_numeric(ndarray[object] values, set na_values,
635633
bint convert_empty=True, bint coerce_numeric=False):
636634
"""
637-
Type inference function-- convert strings to numeric (potentially) and
638-
convert to proper dtype array
635+
Convert object array to a numeric array if possible.
636+
637+
Parameters
638+
----------
639+
values : ndarray
640+
Array of object elements to convert.
641+
na_values : set
642+
Set of values that should be interpreted as NaN.
643+
convert_empty : bool, default True
644+
If an empty array-like object is encountered, whether to interpret
645+
that element as NaN or not. If set to False, a ValueError will be
646+
raised if such an element is encountered and 'coerce_numeric' is False.
647+
coerce_numeric : bool, default False
648+
If initial attempts to convert to numeric have failed, whether to
649+
force conversion to numeric via alternative methods or by setting the
650+
element to NaN. Otherwise, an Exception will be raised when such an
651+
element is encountered.
652+
653+
This boolean also has an impact on how conversion behaves when a
654+
numeric array has no suitable numerical dtype to return (i.e. uint64,
655+
int32, uint8). If set to False, the original object array will be
656+
returned. Otherwise, a ValueError will be raised.
657+
658+
Returns
659+
-------
660+
numeric_array : array of object values converted to numerical ones
639661
"""
640662
cdef:
641663
int status, maybe_int
642664
Py_ssize_t i, n = values.size
643665
ndarray[float64_t] floats = np.empty(n, dtype='f8')
644666
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
645667
ndarray[int64_t] ints = np.empty(n, dtype='i8')
668+
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
646669
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
670+
bint seen_null = False
671+
bint seen_uint = False
672+
bint seen_sint = False
647673
bint seen_float = False
648674
bint seen_complex = False
649675
bint seen_int = False
650676
bint seen_bool = False
651677
object val
652678
float64_t fval
653679

680+
def check_uint64_nan():
681+
"""
682+
Check whether we have encountered uint64 when handling a NaN element.
683+
684+
If uint64 has been encountered, we cannot safely cast to float64 due
685+
to truncation problems (this would occur if we return a numeric array
686+
containing a NaN element).
687+
688+
Returns
689+
-------
690+
return_values : bool
691+
Whether or not we should return the original input array to avoid
692+
data truncation.
693+
"""
694+
if seen_null and seen_uint:
695+
if not coerce_numeric:
696+
return True
697+
else:
698+
raise ValueError("uint64 array detected, and such an "
699+
"array cannot contain NaN.")
700+
701+
return False
702+
703+
def check_uint64_int64_conflict():
704+
"""
705+
Check whether we have encountered both int64 and uint64 elements.
706+
707+
If both have been encountered, we cannot safely cast to an integer
708+
dtype since none is large enough to hold both types of elements.
709+
710+
Returns
711+
-------
712+
return_values : bool
713+
Whether or not we should return the original input array to avoid
714+
data truncation.
715+
"""
716+
if seen_sint and seen_uint:
717+
if not coerce_numeric:
718+
return True
719+
else:
720+
raise ValueError("uint64 and negative values detected. "
721+
"Cannot safely return a numeric array "
722+
"without truncating data.")
723+
724+
return False
725+
654726
for i in range(n):
655727
val = values[i]
656728

657729
if val.__hash__ is not None and val in na_values:
730+
seen_null = True
731+
if check_uint64_nan():
732+
return values
733+
658734
floats[i] = complexes[i] = nan
659735
seen_float = True
660736
elif util.is_float_object(val):
737+
if val != val:
738+
seen_null = True
739+
if check_uint64_nan():
740+
return values
741+
661742
floats[i] = complexes[i] = val
662743
seen_float = True
663744
elif util.is_integer_object(val):
664-
floats[i] = ints[i] = val
745+
floats[i] = complexes[i] = val
746+
as_int = int(val)
665747
seen_int = True
748+
749+
seen_uint = seen_uint or (as_int > iINT64_MAX)
750+
seen_sint = seen_sint or (as_int < 0)
751+
752+
if check_uint64_nan() or check_uint64_int64_conflict():
753+
return values
754+
755+
if seen_uint:
756+
uints[i] = as_int
757+
elif seen_sint:
758+
ints[i] = as_int
759+
else:
760+
uints[i] = as_int
761+
ints[i] = as_int
666762
elif util.is_bool_object(val):
667-
floats[i] = ints[i] = bools[i] = val
763+
floats[i] = uints[i] = ints[i] = bools[i] = val
668764
seen_bool = True
669765
elif val is None:
766+
seen_null = True
767+
if check_uint64_nan():
768+
return values
769+
670770
floats[i] = complexes[i] = nan
671771
seen_float = True
672772
elif hasattr(val, '__len__') and len(val) == 0:
673773
if convert_empty or coerce_numeric:
774+
seen_null = True
775+
if check_uint64_nan():
776+
return values
777+
674778
floats[i] = complexes[i] = nan
675779
seen_float = True
676780
else:
@@ -686,24 +790,55 @@ def maybe_convert_numeric(object[:] values, set na_values,
686790
status = floatify(val, &fval, &maybe_int)
687791

688792
if fval in na_values:
793+
seen_null = True
794+
if check_uint64_nan():
795+
return values
796+
689797
floats[i] = complexes[i] = nan
690798
seen_float = True
691799
else:
800+
if fval != fval:
801+
seen_null = True
802+
if check_uint64_nan():
803+
return values
804+
692805
floats[i] = fval
693806

694-
if not seen_float:
695-
if maybe_int:
696-
as_int = int(val)
807+
if maybe_int:
808+
as_int = int(val)
697809

698-
if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
810+
if as_int in na_values:
811+
seen_float = True
812+
seen_null = True
813+
else:
814+
seen_uint = seen_uint or (as_int > iINT64_MAX)
815+
seen_sint = seen_sint or (as_int < 0)
816+
seen_int = True
817+
818+
if check_uint64_nan() or check_uint64_int64_conflict():
819+
return values
820+
821+
if not (seen_float or as_int in na_values):
822+
if as_int < iINT64_MIN or as_int > iUINT64_MAX:
823+
raise ValueError('Integer out of range.')
824+
825+
if seen_uint:
826+
uints[i] = as_int
827+
elif seen_sint:
699828
ints[i] = as_int
700829
else:
701-
raise ValueError('integer out of range')
702-
else:
703-
seen_float = True
830+
uints[i] = as_int
831+
ints[i] = as_int
832+
else:
833+
seen_float = True
704834
except (TypeError, ValueError) as e:
705835
if not coerce_numeric:
706836
raise type(e)(str(e) + ' at position {}'.format(i))
837+
elif "uint64" in str(e): # Exception from check functions.
838+
raise
839+
seen_null = True
840+
if check_uint64_nan():
841+
return values
707842

708843
floats[i] = nan
709844
seen_float = True
@@ -713,9 +848,14 @@ def maybe_convert_numeric(object[:] values, set na_values,
713848
elif seen_float:
714849
return floats
715850
elif seen_int:
716-
return ints
851+
if seen_uint:
852+
return uints
853+
else:
854+
return ints
717855
elif seen_bool:
718856
return bools.view(np.bool_)
857+
elif seen_uint:
858+
return uints
719859
return ints
720860

721861

@@ -810,7 +950,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
810950
floats[i] = <float64_t> val
811951
complexes[i] = <double complex> val
812952
if not seen_null:
813-
seen_uint = seen_uint or (int(val) > npy_int64_max)
953+
seen_uint = seen_uint or (int(val) > iINT64_MAX)
814954
seen_sint = seen_sint or (val < 0)
815955

816956
if seen_uint and seen_sint:

pandas/tests/types/test_inference.py

+53
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,59 @@ def test_convert_non_hashable(self):
255255
result = lib.maybe_convert_numeric(arr, set(), False, True)
256256
tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
257257

258+
def test_convert_numeric_uint64(self):
259+
arr = np.array([2**63], dtype=object)
260+
exp = np.array([2**63], dtype=np.uint64)
261+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
262+
263+
arr = np.array([str(2**63)], dtype=object)
264+
exp = np.array([2**63], dtype=np.uint64)
265+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
266+
267+
arr = np.array([np.uint64(2**63)], dtype=object)
268+
exp = np.array([2**63], dtype=np.uint64)
269+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
270+
271+
def test_convert_numeric_uint64_nan(self):
272+
msg = 'uint64 array detected'
273+
cases = [(np.array([2**63, np.nan], dtype=object), set()),
274+
(np.array([str(2**63), np.nan], dtype=object), set()),
275+
(np.array([np.nan, 2**63], dtype=object), set()),
276+
(np.array([np.nan, str(2**63)], dtype=object), set()),
277+
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
278+
(np.array([str(2**63), str(2**63 + 1)],
279+
dtype=object), set([2**63]))]
280+
281+
for coerce in (True, False):
282+
for arr, na_values in cases:
283+
if coerce:
284+
with tm.assertRaisesRegexp(ValueError, msg):
285+
lib.maybe_convert_numeric(arr, na_values,
286+
coerce_numeric=coerce)
287+
else:
288+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
289+
arr, na_values), arr)
290+
291+
def test_convert_numeric_int64_uint64(self):
292+
msg = 'uint64 and negative values detected'
293+
cases = [np.array([2**63, -1], dtype=object),
294+
np.array([str(2**63), -1], dtype=object),
295+
np.array([str(2**63), str(-1)], dtype=object),
296+
np.array([-1, 2**63], dtype=object),
297+
np.array([-1, str(2**63)], dtype=object),
298+
np.array([str(-1), str(2**63)], dtype=object)]
299+
300+
for coerce in (True, False):
301+
for case in cases:
302+
if coerce:
303+
with tm.assertRaisesRegexp(ValueError, msg):
304+
print(case)
305+
lib.maybe_convert_numeric(case, set(),
306+
coerce_numeric=coerce)
307+
else:
308+
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
309+
case, set()), case)
310+
258311
def test_maybe_convert_objects_uint64(self):
259312
# see gh-4471
260313
arr = np.array([2**63], dtype=object)

0 commit comments

Comments
 (0)