Skip to content

Commit 03e926e

Browse files
committed
BUG: Prevent uint64 overflow in Series.unique
Introduces a UInt64HashTable class to hash uint64 elements and prevent overflow in functions like Series.unique. Closes gh-14721.
1 parent 39efbbc commit 03e926e

10 files changed

+59
-7
lines changed

doc/source/whatsnew/v0.20.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -257,4 +257,6 @@ Bug Fixes
257257

258258

259259

260+
261+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
260262
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)

pandas/core/algorithms.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
_ensure_platform_int,
2626
_ensure_object,
2727
_ensure_float64,
28+
_ensure_uint64,
2829
_ensure_int64,
2930
is_list_like)
3031
from pandas.compat.numpy import _np_version_under1p10
@@ -129,9 +130,12 @@ def unique1d(values):
129130
table = htable.Int64HashTable(len(values))
130131
uniques = table.unique(_ensure_int64(values))
131132
uniques = uniques.view('m8[ns]')
132-
elif np.issubdtype(values.dtype, np.integer):
133+
elif np.issubdtype(values.dtype, np.signedinteger):
133134
table = htable.Int64HashTable(len(values))
134135
uniques = table.unique(_ensure_int64(values))
136+
elif np.issubdtype(values.dtype, np.unsignedinteger):
137+
table = htable.UInt64HashTable(len(values))
138+
uniques = table.unique(_ensure_uint64(values))
135139
else:
136140

137141
# it's cheaper to use a String Hash Table than Object

pandas/hashtable.pxd

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
1-
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
1+
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
2+
kh_str_t, uint64_t, int64_t, float64_t)
23

34
# prototypes for sharing
45

56
cdef class HashTable:
67
pass
78

9+
cdef class UInt64HashTable(HashTable):
10+
cdef kh_uint64_t *table
11+
12+
cpdef get_item(self, uint64_t val)
13+
cpdef set_item(self, uint64_t key, Py_ssize_t val)
14+
815
cdef class Int64HashTable(HashTable):
916
cdef kh_int64_t *table
1017

pandas/src/algos_common_helper.pxi.in

+1
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
553553
('int16', 'INT16', 'int16'),
554554
('int32', 'INT32', 'int32'),
555555
('int64', 'INT64', 'int64'),
556+
('uint64', 'UINT64', 'uint64'),
556557
# ('platform_int', 'INT', 'int_'),
557558
# ('object', 'OBJECT', 'object_'),
558559
]

pandas/src/hashtable_class_helper.pxi.in

+5-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1717

1818
dtypes = [('Float64', 'float64', 'float64_t'),
1919
('Int64', 'int64', 'int64_t'),
20-
('String', 'string', 'char *')]
20+
('String', 'string', 'char *'),
21+
('UInt64', 'uint64', 'uint64_t')]
2122
}}
2223

2324
{{for name, dtype, arg in dtypes}}
@@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
4041

4142
ctypedef fused vector_data:
4243
Int64VectorData
44+
UInt64VectorData
4345
Float64VectorData
4446
StringVectorData
4547

@@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil:
5456

5557
# name, dtype, arg, idtype
5658
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
59+
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
5760
('Int64', 'int64', 'int64_t', 'np.int64')]
5861

5962
}}
@@ -201,6 +204,7 @@ cdef class HashTable:
201204

202205
# name, dtype, null_condition, float_group
203206
dtypes = [('Float64', 'float64', 'val != val', True),
207+
('UInt64', 'uint64', 'val == 0', False),
204208
('Int64', 'int64', 'val == iNaT', False)]
205209

206210
}}

pandas/src/hashtable_func_helper.pxi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1111
{{py:
1212

1313
# name
14-
dtypes = ['float64', 'int64']
14+
dtypes = ['float64', 'int64', 'uint64']
1515

1616
}}
1717

pandas/src/khash.pxd

+19-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from cpython cimport PyObject
2-
from numpy cimport int64_t, int32_t, uint32_t, float64_t
2+
from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t
33

44
cdef extern from "khash_python.h":
55
ctypedef uint32_t khint_t
@@ -55,7 +55,6 @@ cdef extern from "khash_python.h":
5555

5656
bint kh_exist_str(kh_str_t*, khiter_t) nogil
5757

58-
5958
ctypedef struct kh_int64_t:
6059
khint_t n_buckets, size, n_occupied, upper_bound
6160
uint32_t *flags
@@ -72,6 +71,24 @@ cdef extern from "khash_python.h":
7271

7372
bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
7473

74+
ctypedef uint64_t khuint64_t
75+
76+
ctypedef struct kh_uint64_t:
77+
khint_t n_buckets, size, n_occupied, upper_bound
78+
uint32_t *flags
79+
khuint64_t *keys
80+
size_t *vals
81+
82+
inline kh_uint64_t* kh_init_uint64() nogil
83+
inline void kh_destroy_uint64(kh_uint64_t*) nogil
84+
inline void kh_clear_uint64(kh_uint64_t*) nogil
85+
inline khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil  # key is unsigned, matching the khuint64_t keys of kh_uint64_t
86+
inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
87+
inline khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil  # key is unsigned, matching the khuint64_t keys of kh_uint64_t
88+
inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil
89+
90+
bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
91+
7592
ctypedef struct kh_float64_t:
7693
khint_t n_buckets, size, n_occupied, upper_bound
7794
uint32_t *flags

pandas/src/klib/khash.h

+2
Original file line numberDiff line numberDiff line change
@@ -567,12 +567,14 @@ typedef const char *kh_cstr_t;
567567

568568
#define kh_exist_str(h, k) (kh_exist(h, k))
569569
#define kh_exist_float64(h, k) (kh_exist(h, k))
570+
#define kh_exist_uint64(h, k) (kh_exist(h, k))
570571
#define kh_exist_int64(h, k) (kh_exist(h, k))
571572
#define kh_exist_int32(h, k) (kh_exist(h, k))
572573

573574
KHASH_MAP_INIT_STR(str, size_t)
574575
KHASH_MAP_INIT_INT(int32, size_t)
575576
KHASH_MAP_INIT_INT64(int64, size_t)
577+
KHASH_MAP_INIT_UINT64(uint64, size_t)
576578

577579

578580
#endif /* __AC_KHASH_H */

pandas/tests/test_algos.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -899,6 +899,18 @@ def test_lookup_nan(self):
899899
self.assert_numpy_array_equal(m.lookup(xs),
900900
np.arange(len(xs), dtype=np.int64))
901901

902+
def test_lookup_overflow(self):
903+
xs = np.array([1, 2, 2**63], dtype=np.uint64)
904+
m = hashtable.UInt64HashTable()
905+
m.map_locations(xs)
906+
self.assert_numpy_array_equal(m.lookup(xs),
907+
np.arange(len(xs), dtype=np.int64))
908+
909+
def test_get_unique(self):
910+
s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
911+
exp = np.array([1, 2, 2**63], dtype=np.uint64)
912+
self.assert_numpy_array_equal(s.unique(), exp)
913+
902914
def test_vector_resize(self):
903915
# Test for memory errors after internal vector
904916
# reallocations (pull request #7157)
@@ -915,7 +927,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals):
915927
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
916928
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
917929
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
918-
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
930+
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
931+
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]
919932

920933
for (tbl, vect, dtype) in test_cases:
921934
# resizing to empty is a special case

pandas/types/common.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def _ensure_float(arr):
3232
arr = arr.astype(float)
3333
return arr
3434

35+
36+
_ensure_uint64 = algos.ensure_uint64
3537
_ensure_int64 = algos.ensure_int64
3638
_ensure_int32 = algos.ensure_int32
3739
_ensure_int16 = algos.ensure_int16

0 commit comments

Comments
 (0)