Skip to content

BUG: Prevent uint64 overflow in Series.unique #14915

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -258,4 +258,6 @@ Bug Fixes




- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
6 changes: 5 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ensure_platform_int,
_ensure_object,
_ensure_float64,
_ensure_uint64,
_ensure_int64,
is_list_like)
from pandas.compat.numpy import _np_version_under1p10
Expand Down Expand Up @@ -129,9 +130,12 @@ def unique1d(values):
table = htable.Int64HashTable(len(values))
uniques = table.unique(_ensure_int64(values))
uniques = uniques.view('m8[ns]')
elif np.issubdtype(values.dtype, np.integer):
elif np.issubdtype(values.dtype, np.signedinteger):
table = htable.Int64HashTable(len(values))
uniques = table.unique(_ensure_int64(values))
elif np.issubdtype(values.dtype, np.unsignedinteger):
table = htable.UInt64HashTable(len(values))
uniques = table.unique(_ensure_uint64(values))
else:

# its cheaper to use a String Hash Table than Object
Expand Down
9 changes: 8 additions & 1 deletion pandas/hashtable.pxd
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
kh_str_t, uint64_t, int64_t, float64_t)

# prototypes for sharing

cdef class HashTable:
pass

# Shared prototype of the hash table for unsigned 64-bit integers.
# It derives from HashTable and holds a pointer to a khash ``kh_uint64_t``.
cdef class UInt64HashTable(HashTable):
# Pointer to the underlying khash table structure.
cdef kh_uint64_t *table

# get_item takes a ``uint64_t`` argument; set_item takes a ``uint64_t`` key
# together with a ``Py_ssize_t`` value.
cpdef get_item(self, uint64_t val)
cpdef set_item(self, uint64_t key, Py_ssize_t val)

cdef class Int64HashTable(HashTable):
cdef kh_int64_t *table

Expand Down
1 change: 1 addition & 0 deletions pandas/src/algos_common_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
('int16', 'INT16', 'int16'),
('int32', 'INT32', 'int32'),
('int64', 'INT64', 'int64'),
('uint64', 'UINT64', 'uint64'),
# ('platform_int', 'INT', 'int_'),
# ('object', 'OBJECT', 'object_'),
]
Expand Down
6 changes: 5 additions & 1 deletion pandas/src/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in

dtypes = [('Float64', 'float64', 'float64_t'),
('Int64', 'int64', 'int64_t'),
('String', 'string', 'char *')]
('String', 'string', 'char *'),
('UInt64', 'uint64', 'uint64_t')]
}}

{{for name, dtype, arg in dtypes}}
Expand All @@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data,

# Fused type spanning the per-dtype ``*VectorData`` types, so a helper that
# takes a ``vector_data *`` argument can be specialized for each of them.
ctypedef fused vector_data:
Int64VectorData
UInt64VectorData
Float64VectorData
StringVectorData

Expand All @@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil:

# One tuple per numeric type, in the field order given on the next line.
# NOTE(review): presumably iterated by the template code that follows this
# ``{{py: ...}}`` section -- confirm against the full .pxi.in file.
# name, dtype, arg, idtype
dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'),
('UInt64', 'uint64', 'uint64_t', 'np.uint64'),
('Int64', 'int64', 'int64_t', 'np.int64')]

}}
Expand Down Expand Up @@ -201,6 +204,7 @@ cdef class HashTable:

# name, dtype, null_condition, float_group
#
# ``null_condition`` is an expression over ``val`` that is true when the
# value should be treated as missing: NaN for floats (``val != val``) and the
# ``iNaT`` sentinel for int64.
# NOTE: unsigned 64-bit integers have no sentinel value, so their condition
# is the constant ``False``; ``val == 0`` would flag the valid value 0 as
# missing.
dtypes = [('Float64', 'float64', 'val != val', True),
          ('UInt64', 'uint64', 'False', False),
          ('Int64', 'int64', 'val == iNaT', False)]

}}
Expand Down
2 changes: 1 addition & 1 deletion pandas/src/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
{{py:

# name
dtypes = ['float64', 'int64']
dtypes = ['float64', 'int64', 'uint64']

}}

Expand Down
21 changes: 19 additions & 2 deletions pandas/src/khash.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from cpython cimport PyObject
from numpy cimport int64_t, int32_t, uint32_t, float64_t
from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t

cdef extern from "khash_python.h":
ctypedef uint32_t khint_t
Expand Down Expand Up @@ -55,7 +55,6 @@ cdef extern from "khash_python.h":

bint kh_exist_str(kh_str_t*, khiter_t) nogil


ctypedef struct kh_int64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
Expand All @@ -72,6 +71,24 @@ cdef extern from "khash_python.h":

bint kh_exist_int64(kh_int64_t*, khiter_t) nogil

# khash map with unsigned 64-bit keys and ``size_t`` values.
ctypedef uint64_t khuint64_t

ctypedef struct kh_uint64_t:
    khint_t n_buckets, size, n_occupied, upper_bound
    uint32_t *flags
    khuint64_t *keys
    size_t *vals

inline kh_uint64_t* kh_init_uint64() nogil
inline void kh_destroy_uint64(kh_uint64_t*) nogil
inline void kh_clear_uint64(kh_uint64_t*) nogil
# The key parameter of get/put is ``uint64_t`` so that it matches the
# ``khuint64_t *keys`` storage above; declaring it as ``int64_t`` would make
# every key >= 2**63 pass through a signed conversion at the call site.
inline khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil
inline khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil

bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil

ctypedef struct kh_float64_t:
khint_t n_buckets, size, n_occupied, upper_bound
uint32_t *flags
Expand Down
2 changes: 2 additions & 0 deletions pandas/src/klib/khash.h
Original file line number Diff line number Diff line change
Expand Up @@ -567,12 +567,14 @@ typedef const char *kh_cstr_t;

/* Per-type existence checks; each one forwards to the generic kh_exist. */
#define kh_exist_str(h, k) (kh_exist(h, k))
#define kh_exist_float64(h, k) (kh_exist(h, k))
#define kh_exist_uint64(h, k) (kh_exist(h, k))
#define kh_exist_int64(h, k) (kh_exist(h, k))
#define kh_exist_int32(h, k) (kh_exist(h, k))

/* Map instantiations named str, int32, int64 and uint64; each one is given
 * size_t as its value-type argument. */
KHASH_MAP_INIT_STR(str, size_t)
KHASH_MAP_INIT_INT(int32, size_t)
KHASH_MAP_INIT_INT64(int64, size_t)
KHASH_MAP_INIT_UINT64(uint64, size_t)


#endif /* __AC_KHASH_H */
15 changes: 14 additions & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,18 @@ def test_lookup_nan(self):
self.assert_numpy_array_equal(m.lookup(xs),
np.arange(len(xs), dtype=np.int64))

def test_lookup_overflow(self):
    """Check that uint64 keys beyond the int64 range map to their positions.

    The keys include ``2**63``, which does not fit in a signed 64-bit
    integer. After ``map_locations``, looking the same keys up must give
    ``0 .. len(keys) - 1`` as an int64 array.
    """
    keys = np.array([1, 2, 2**63], dtype=np.uint64)
    table = hashtable.UInt64HashTable()
    table.map_locations(keys)

    positions = table.lookup(keys)
    expected = np.arange(len(keys), dtype=np.int64)
    self.assert_numpy_array_equal(positions, expected)

def test_get_unique(self):
    """Check that ``Series.unique`` deduplicates uint64 values without overflow.

    The input repeats ``2**63``; the result must contain it once, after the
    smaller values, and keep the uint64 dtype.
    """
    large = 2**63
    series = pd.Series([1, 2, large, large], dtype=np.uint64)
    expected = np.array([1, 2, large], dtype=np.uint64)

    result = series.unique()
    self.assert_numpy_array_equal(result, expected)

def test_vector_resize(self):
# Test for memory errors after internal vector
# reallocations (pull request #7157)
Expand All @@ -915,7 +927,8 @@ def _test_vector_resize(htable, uniques, dtype, nvals):
(hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'),
(hashtable.StringHashTable, hashtable.ObjectVector, 'object'),
(hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'),
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')]
(hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'),
(hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')]

for (tbl, vect, dtype) in test_cases:
# resizing to empty is a special case
Expand Down
2 changes: 2 additions & 0 deletions pandas/types/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def _ensure_float(arr):
arr = arr.astype(float)
return arr


# Private module-level aliases for the ``algos.ensure_*`` functions.
_ensure_uint64 = algos.ensure_uint64
_ensure_int64 = algos.ensure_int64
_ensure_int32 = algos.ensure_int32
_ensure_int16 = algos.ensure_int16
Expand Down