BUG: Use UInt64HashTable in pd.unique

gfyoung · gfyoung · commit d9b7c841d2b6 · 2016-12-20T17:38:39.000-05:00
Uses UInt64HashTable to patch a uint64 overflow bug in pd.unique analogous to that seen in Series.unique (patched in pandas-devgh-14915).
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -289,5 +289,6 @@ Bug Fixes
 
 
 - Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
+- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
 - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -9,7 +9,9 @@
 from pandas import compat, lib, tslib, _np_version_under1p8
 from pandas.types.cast import _maybe_promote
 from pandas.types.generic import ABCSeries, ABCIndex
-from pandas.types.common import (is_integer_dtype,
+from pandas.types.common import (is_unsigned_integer_dtype,
+                                 is_signed_integer_dtype,
+                                 is_integer_dtype,
                                  is_int64_dtype,
                                  is_categorical_dtype,
                                  is_extension_type,
@@ -913,8 +915,10 @@ def _hashtable_algo(f, values, return_dtype=None):
     dtype = values.dtype
     if is_float_dtype(dtype):
         return f(htable.Float64HashTable, _ensure_float64)
-    elif is_integer_dtype(dtype):
+    elif is_signed_integer_dtype(dtype):
         return f(htable.Int64HashTable, _ensure_int64)
+    elif is_unsigned_integer_dtype(dtype):
+        return f(htable.UInt64HashTable, _ensure_uint64)
     elif is_datetime64_dtype(dtype):
         return_dtype = return_dtype or 'M8[ns]'
         return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -365,6 +365,11 @@ def test_timedelta64_dtype_array_returned(self):
         tm.assert_numpy_array_equal(result, expected)
         self.assertEqual(result.dtype, expected.dtype)
 
+    def test_uint64_overflow(self):
+        s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+        exp = np.array([1, 2, 2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(algos.unique(s), exp)
+
 
 class TestIsin(tm.TestCase):
     _multiprocess_can_split_ = True
diff --git a/pandas/types/common.py b/pandas/types/common.py
@@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype):
             not issubclass(tipo, (np.datetime64, np.timedelta64)))
 
 
+def is_signed_integer_dtype(arr_or_dtype):
+    tipo = _get_dtype_type(arr_or_dtype)
+    return (issubclass(tipo, np.signedinteger) and
+            not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
+def is_unsigned_integer_dtype(arr_or_dtype):
+    tipo = _get_dtype_type(arr_or_dtype)
+    return (issubclass(tipo, np.unsignedinteger) and
+            not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
 def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)