Skip to content

Commit 3e21ad6

Browse files
committed
BUG: Prevent uint64 overflow in Series.unique
Introduces a UInt64HashTable class to hash uint64 elements and prevent overflow in functions like Series.unique. Closes gh-14721 (pandas-dev/pandas).
1 parent f1cfe5b commit 3e21ad6

File tree

10 files changed

+62
-9
lines changed

10 files changed

+62
-9
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -250,4 +250,5 @@ Bug Fixes
250250

251251

252252

253+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
253254
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)

pandas/core/nanops.py

+7-4
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,9 @@
1111

1212
import pandas.hashtable as _hash
1313
from pandas import compat, lib, algos, tslib
14-
from pandas.types.common import (_ensure_int64, _ensure_object,
15-
_ensure_float64, _get_dtype,
16-
is_float, is_scalar,
14+
from pandas.types.common import (_ensure_int64, _ensure_uint64,
15+
_ensure_object, _ensure_float64,
16+
_get_dtype, is_float, is_scalar,
1717
is_integer, is_complex, is_float_dtype,
1818
is_complex_dtype, is_integer_dtype,
1919
is_bool_dtype, is_object_dtype,
@@ -802,9 +802,12 @@ def unique1d(values):
802802
table = _hash.Int64HashTable(len(values))
803803
uniques = table.unique(_ensure_int64(values))
804804
uniques = uniques.view('m8[ns]')
805-
elif np.issubdtype(values.dtype, np.integer):
805+
elif np.issubdtype(values.dtype, np.signedinteger):
806806
table = _hash.Int64HashTable(len(values))
807807
uniques = table.unique(_ensure_int64(values))
808+
elif np.issubdtype(values.dtype, np.unsignedinteger):
809+
table = _hash.UInt64HashTable(len(values))
810+
uniques = table.unique(_ensure_uint64(values))
808811
else:
809812
table = _hash.PyObjectHashTable(len(values))
810813
uniques = table.unique(_ensure_object(values))

pandas/hashtable.pxd

+8-1
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,17 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
1+
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
2+
kh_str_t, uint64_t, int64_t, float64_t)
23

34
# prototypes for sharing
45

56
cdef class HashTable:
67
pass
78

9+
cdef class UInt64HashTable(HashTable):
10+
cdef kh_uint64_t *table
11+
12+
cpdef get_item(self, uint64_t val)
13+
cpdef set_item(self, uint64_t key, Py_ssize_t val)
14+
815
cdef class Int64HashTable(HashTable):
916
cdef kh_int64_t *table
1017

pandas/src/algos_common_helper.pxi.in

+1
Original file line number | Diff line number | Diff line change
@@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
553553
('int16', 'INT16', 'int16'),
554554
('int32', 'INT32', 'int32'),
555555
('int64', 'INT64', 'int64'),
556+
('uint64', 'UINT64', 'uint64'),
556557
# ('platform_int', 'INT', 'int_'),
557558
# ('object', 'OBJECT', 'object_'),
558559
]

pandas/src/hashtable_class_helper.pxi.in

+5-1
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1717

1818
dtypes = [('Float64', 'float64', 'float64_t'),
1919
('Int64', 'int64', 'int64_t'),
20-
('String', 'string', 'char *')]
20+
('String', 'string', 'char *'),
21+
('UInt64', 'uint64', 'uint64_t')]
2122
}}
2223

2324
{{for name, dtype, arg in dtypes}}
@@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
4041

4142
ctypedef fused vector_data:
4243
Int64VectorData
44+
UInt64VectorData
4345
Float64VectorData
4446
StringVectorData
4547

@@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil:
5456

5557
# name, dtype, arg, idtype
5658
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
59+
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
5760
('Int64', 'int64', 'int64_t', 'np.int64')]
5861

5962
}}
@@ -201,6 +204,7 @@ cdef class HashTable:
201204

202205
# name, dtype, null_condition, float_group
203206
dtypes = [('Float64', 'float64', 'val != val', True),
207+
('UInt64', 'uint64', 'val == 0', False),
204208
('Int64', 'int64', 'val == iNaT', False)]
205209

206210
}}

pandas/src/hashtable_func_helper.pxi.in

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1111
{{py:
1212

1313
# name
14-
dtypes = ['float64', 'int64']
14+
dtypes = ['float64', 'int64', 'uint64']
1515

1616
}}
1717

pandas/src/khash.pxd

+19-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
from cpython cimport PyObject
2-
from numpy cimport int64_t, int32_t, uint32_t, float64_t
2+
from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t
33

44
cdef extern from "khash_python.h":
55
ctypedef uint32_t khint_t
@@ -55,7 +55,6 @@ cdef extern from "khash_python.h":
5555

5656
bint kh_exist_str(kh_str_t*, khiter_t) nogil
5757

58-
5958
ctypedef struct kh_int64_t:
6059
khint_t n_buckets, size, n_occupied, upper_bound
6160
uint32_t *flags
@@ -72,6 +71,24 @@ cdef extern from "khash_python.h":
7271

7372
bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
7473

74+
ctypedef uint64_t khuint64_t
75+
76+
ctypedef struct kh_uint64_t:
77+
khint_t n_buckets, size, n_occupied, upper_bound
78+
uint32_t *flags
79+
khuint64_t *keys
80+
size_t *vals
81+
82+
inline kh_uint64_t* kh_init_uint64() nogil
83+
inline void kh_destroy_uint64(kh_uint64_t*) nogil
84+
inline void kh_clear_uint64(kh_uint64_t*) nogil
85+
inline khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil
86+
inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
87+
inline khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil
88+
inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil
89+
90+
bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
91+
7592
ctypedef struct kh_float64_t:
7693
khint_t n_buckets, size, n_occupied, upper_bound
7794
uint32_t *flags

pandas/src/klib/khash.h

+2
Original file line number | Diff line number | Diff line change
@@ -567,12 +567,14 @@ typedef const char *kh_cstr_t;
567567

568568
#define kh_exist_str(h, k) (kh_exist(h, k))
569569
#define kh_exist_float64(h, k) (kh_exist(h, k))
570+
#define kh_exist_uint64(h, k) (kh_exist(h, k))
570571
#define kh_exist_int64(h, k) (kh_exist(h, k))
571572
#define kh_exist_int32(h, k) (kh_exist(h, k))
572573

573574
KHASH_MAP_INIT_STR(str, size_t)
574575
KHASH_MAP_INIT_INT(int32, size_t)
575576
KHASH_MAP_INIT_INT64(int64, size_t)
577+
KHASH_MAP_INIT_UINT64(uint64, size_t)
576578

577579

578580
#endif /* __AC_KHASH_H */

pandas/tests/test_base.py

+16
Original file line number | Diff line number | Diff line change
@@ -1051,6 +1051,22 @@ def test_searchsorted(self):
10511051
self.assertTrue(0 <= index <= len(o))
10521052

10531053

1054+
class TestUInt64HashTable(tm.TestCase):
1055+
1056+
def test_lookup_overflow(self):
1057+
from pandas.hashtable import UInt64HashTable
1058+
xs = np.array([1, 2, 2**63], dtype=np.uint64)
1059+
m = UInt64HashTable()
1060+
m.map_locations(xs)
1061+
self.assert_numpy_array_equal(m.lookup(xs),
1062+
np.arange(len(xs), dtype=np.int64))
1063+
1064+
def test_get_unique(self):
1065+
s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
1066+
exp = np.array([1, 2, 2**63], dtype=np.uint64)
1067+
self.assert_numpy_array_equal(s.unique(), exp)
1068+
1069+
10541070
class TestFloat64HashTable(tm.TestCase):
10551071

10561072
def test_lookup_nan(self):

pandas/types/common.py

+2
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@ def _ensure_float(arr):
3232
arr = arr.astype(float)
3333
return arr
3434

35+
36+
_ensure_uint64 = algos.ensure_uint64
3537
_ensure_int64 = algos.ensure_int64
3638
_ensure_int32 = algos.ensure_int32
3739
_ensure_int16 = algos.ensure_int16

0 commit comments

Comments
 (0)