Skip to content

Commit d9b7c84

Browse files
committed
BUG: Use UInt64HashTable in pd.unique
Uses UInt64HashTable to patch a uint64 overflow bug in pd.unique, analogous to the one seen in Series.unique (patched in pandas-dev/pandas#14915).
1 parent 0725916 commit d9b7c84

File tree

4 files changed

+24
-2
lines changed

4 files changed

+24
-2
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -289,5 +289,6 @@ Bug Fixes
289289

290290

291291
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
292+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
292293
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
293294
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/core/algorithms.py

+6 -2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from pandas import compat, lib, tslib, _np_version_under1p8
1010
from pandas.types.cast import _maybe_promote
1111
from pandas.types.generic import ABCSeries, ABCIndex
12-
from pandas.types.common import (is_integer_dtype,
12+
from pandas.types.common import (is_unsigned_integer_dtype,
13+
is_signed_integer_dtype,
14+
is_integer_dtype,
1315
is_int64_dtype,
1416
is_categorical_dtype,
1517
is_extension_type,
@@ -913,8 +915,10 @@ def _hashtable_algo(f, values, return_dtype=None):
913915
dtype = values.dtype
914916
if is_float_dtype(dtype):
915917
return f(htable.Float64HashTable, _ensure_float64)
916-
elif is_integer_dtype(dtype):
918+
elif is_signed_integer_dtype(dtype):
917919
return f(htable.Int64HashTable, _ensure_int64)
920+
elif is_unsigned_integer_dtype(dtype):
921+
return f(htable.UInt64HashTable, _ensure_uint64)
918922
elif is_datetime64_dtype(dtype):
919923
return_dtype = return_dtype or 'M8[ns]'
920924
return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

pandas/tests/test_algos.py

+5
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,11 @@ def test_timedelta64_dtype_array_returned(self):
365365
tm.assert_numpy_array_equal(result, expected)
366366
self.assertEqual(result.dtype, expected.dtype)
367367

368+
def test_uint64_overflow(self):
369+
s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
370+
exp = np.array([1, 2, 2**63], dtype=np.uint64)
371+
tm.assert_numpy_array_equal(algos.unique(s), exp)
372+
368373

369374
class TestIsin(tm.TestCase):
370375
_multiprocess_can_split_ = True

pandas/types/common.py

+12
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype):
155155
not issubclass(tipo, (np.datetime64, np.timedelta64)))
156156

157157

158+
def is_signed_integer_dtype(arr_or_dtype):
159+
tipo = _get_dtype_type(arr_or_dtype)
160+
return (issubclass(tipo, np.signedinteger) and
161+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
162+
163+
164+
def is_unsigned_integer_dtype(arr_or_dtype):
165+
tipo = _get_dtype_type(arr_or_dtype)
166+
return (issubclass(tipo, np.unsignedinteger) and
167+
not issubclass(tipo, (np.datetime64, np.timedelta64)))
168+
169+
158170
def is_int64_dtype(arr_or_dtype):
159171
tipo = _get_dtype_type(arr_or_dtype)
160172
return issubclass(tipo, np.int64)

0 commit comments

Comments
 (0)