DOC, TST, BUG: Improve uint64 core/algos behavior

gfyoung · gfyoung · commit b47353a6dc2f · 2016-12-21T12:10:13.000-05:00
1) duplicated() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use duplicated_uint64. 2) mode() Updates documentation to describe the "values" parameter in the signature, adds tests for uint64, and refactors to use mode_uint64. 3) unique() Uses UInt64HashTable to patch a uint64 overflow bug analogous to that seen in Series.unique (patched in pandas-dev gh-14915). 4) Types API Introduces "is_signed_integer_dtype" and "is_unsigned_integer_dtype" to the public API. Used in refactoring/patching of 1-3.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -298,5 +298,6 @@ Bug Fixes
 
 
 - Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
+- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
 - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
@@ -153,10 +153,10 @@ class TestTypes(Base, tm.TestCase):
                'is_floating_dtype', 'is_int64_dtype', 'is_integer',
                'is_integer_dtype', 'is_number', 'is_numeric_dtype',
                'is_object_dtype', 'is_scalar', 'is_sparse',
-               'is_string_dtype',
+               'is_string_dtype', 'is_signed_integer_dtype',
                'is_timedelta64_dtype', 'is_timedelta64_ns_dtype',
-               'is_period', 'is_period_dtype',
-               'is_re', 'is_re_compilable',
+               'is_unsigned_integer_dtype', 'is_period',
+               'is_period_dtype', 'is_re', 'is_re_compilable',
                'is_dict_like', 'is_iterator',
                'is_list_like', 'is_hashable',
                'is_named_tuple', 'is_sequence',
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -9,7 +9,9 @@
 from pandas import compat, lib, tslib, _np_version_under1p8
 from pandas.types.cast import _maybe_promote
 from pandas.types.generic import ABCSeries, ABCIndex
-from pandas.types.common import (is_integer_dtype,
+from pandas.types.common import (is_unsigned_integer_dtype,
+                                 is_signed_integer_dtype,
+                                 is_integer_dtype,
                                  is_int64_dtype,
                                  is_categorical_dtype,
                                  is_extension_type,
@@ -490,12 +492,14 @@ def _value_counts_arraylike(values, dropna=True):
 
 def duplicated(values, keep='first'):
     """
-    Return boolean ndarray denoting duplicate values
+    Return boolean ndarray denoting duplicate values.
 
     .. versionadded:: 0.19.0
 
     Parameters
     ----------
+    values : ndarray-like
+        Array over which to check for duplicate values.
     keep : {'first', 'last', False}, default 'first'
         - ``first`` : Mark duplicates as ``True`` except for the first
           occurrence.
@@ -521,9 +525,12 @@ def duplicated(values, keep='first'):
     elif isinstance(values, (ABCSeries, ABCIndex)):
         values = values.values
 
-    if is_integer_dtype(dtype):
+    if is_signed_integer_dtype(dtype):
         values = _ensure_int64(values)
         duplicated = htable.duplicated_int64(values, keep=keep)
+    elif is_unsigned_integer_dtype(dtype):
+        values = _ensure_uint64(values)
+        duplicated = htable.duplicated_uint64(values, keep=keep)
     elif is_float_dtype(dtype):
         values = _ensure_float64(values)
         duplicated = htable.duplicated_float64(values, keep=keep)
@@ -535,7 +542,19 @@ def duplicated(values, keep='first'):
 
 
 def mode(values):
-    """Returns the mode or mode(s) of the passed Series or ndarray (sorted)"""
+    """
+    Returns the mode(s) of an array.
+
+    Parameters
+    ----------
+    values : array-like
+        Array over which to compute the mode(s).
+
+    Returns
+    -------
+    mode : Series
+    """
+
     # must sort because hash order isn't necessarily defined.
     from pandas.core.series import Series
 
@@ -547,10 +566,12 @@ def mode(values):
         constructor = Series
 
     dtype = values.dtype
-    if is_integer_dtype(values):
+    if is_signed_integer_dtype(values):
         values = _ensure_int64(values)
         result = constructor(sorted(htable.mode_int64(values)), dtype=dtype)
-
+    elif is_unsigned_integer_dtype(values):
+        values = _ensure_uint64(values)
+        result = constructor(sorted(htable.mode_uint64(values)), dtype=dtype)
     elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
         dtype = values.dtype
         values = values.view(np.int64)
@@ -893,8 +914,10 @@ def _hashtable_algo(f, values, return_dtype=None):
     dtype = values.dtype
     if is_float_dtype(dtype):
         return f(htable.Float64HashTable, _ensure_float64)
-    elif is_integer_dtype(dtype):
+    elif is_signed_integer_dtype(dtype):
         return f(htable.Int64HashTable, _ensure_int64)
+    elif is_unsigned_integer_dtype(dtype):
+        return f(htable.UInt64HashTable, _ensure_uint64)
     elif is_datetime64_dtype(dtype):
         return_dtype = return_dtype or 'M8[ns]'
         return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -199,41 +199,6 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask):
     return modes[:j + 1]
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def mode_int64(int64_t[:] values):
-    cdef:
-        int count, max_count = 2
-        int j = -1 # so you can do +=
-        int k
-        kh_int64_t *table
-        ndarray[int64_t] modes
-
-    table = kh_init_int64()
-
-    build_count_table_int64(values, table, 0)
-
-    modes = np.empty(table.n_buckets, dtype=np.int64)
-
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_int64(table, k):
-                count = table.vals[k]
-
-                if count == max_count:
-                    j += 1
-                elif count > max_count:
-                    max_count = count
-                    j = 0
-                else:
-                    continue
-                modes[j] = table.keys[k]
-
-    kh_destroy_int64(table)
-
-    return modes[:j + 1]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def duplicated_object(ndarray[object] values, object keep='first'):
diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in
@@ -112,3 +112,55 @@ def duplicated_{{dtype}}({{dtype}}_t[:] values,
     return out
 
 {{endfor}}
+
+#----------------------------------------------------------------------
+# Mode Computations
+#----------------------------------------------------------------------
+
+{{py:
+
+# Note that mode is also implemented for object,
+# but it takes different parameters.
+
+# dtype, ctype
+dtypes = [('int64', 'int64_t'),
+          ('uint64', 'uint64_t')]
+}}
+
+{{for dtype, ctype in dtypes}}
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def mode_{{dtype}}({{ctype}}[:] values):
+    cdef:
+        int count, max_count = 2
+        int j = -1 # so you can do +=
+        int k
+        kh_{{ctype}} *table
+        ndarray[{{ctype}}] modes
+
+    table = kh_init_{{dtype}}()
+
+    build_count_table_{{dtype}}(values, table, 0)
+
+    modes = np.empty(table.n_buckets, dtype=np.{{dtype}})
+
+    with nogil:
+        for k in range(table.n_buckets):
+            if kh_exist_{{dtype}}(table, k):
+                count = table.vals[k]
+
+                if count == max_count:
+                    j += 1
+                elif count > max_count:
+                    max_count = count
+                    j = 0
+                else:
+                    continue
+                modes[j] = table.keys[k]
+
+    kh_destroy_{{dtype}}(table)
+
+    return modes[:j + 1]
+
+{{endfor}}
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -365,6 +365,11 @@ def test_timedelta64_dtype_array_returned(self):
         tm.assert_numpy_array_equal(result, expected)
         self.assertEqual(result.dtype, expected.dtype)
 
+    def test_uint64_overflow(self):
+        s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+        exp = np.array([1, 2, 2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(algos.unique(s), exp)
+
 
 class TestIsin(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -672,7 +677,9 @@ def test_numeric_object_likes(self):
                  np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
                            2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
                  np.array(['a', 'b', 'a', 'e', 'c',
-                           'b', 'd', 'a', 'e', 'f'], dtype=object)]
+                           'b', 'd', 'a', 'e', 'f'], dtype=object),
+                 np.array([1, 2**63, 1, 3**5, 10,
+                           2**63, 39, 1, 3**5, 7], dtype=np.uint64)]
 
         exp_first = np.array([False, False, True, False, False,
                               True, False, True, True, False])
@@ -1202,6 +1209,60 @@ def test_int64_add_overflow():
                                        b_mask=np.array([False, True]))
 
 
+class TestMode(tm.TestCase):
+
+    def test_basic(self):
+        s = Series([1, 2], dtype=np.intp)
+        exp = Series([], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(s), exp)
+
+        s = Series([1, 2, 2], dtype=np.intp)
+        exp = Series([2], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(s), exp)
+
+        s = Series([1, 1, 2, 3, 3], dtype=np.intp)
+        exp = Series([1, 3], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(s), exp)
+
+    def test_categorical(self):
+        c = Categorical([1, 2])
+        exp = Series([], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(c), exp)
+
+        c = Categorical([1, 2, 2])
+        exp = Series([2], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(c), exp)
+
+        c = Categorical([1, 1, 2, 3, 3])
+        exp = Series([1, 3], dtype=np.intp)
+        tm.assert_series_equal(algos.mode(c), exp)
+
+    def test_nonnumeric(self):
+        s = Series([1, 'foo', 'foo'])
+        exp = Series(['foo'])
+        tm.assert_series_equal(algos.mode(s), exp)
+
+        s = Series([1, 2, 'foo', 'foo', 2])
+        exp = Series(['foo', 2])
+
+        # Cannot sort "int" and "str" together
+        with tm.assert_produces_warning(UserWarning):
+            tm.assert_series_equal(algos.mode(s), exp)
+
+    def test_uint64(self):
+        s = Series([1, 2**63, 2**63], dtype=np.uint64)
+        exp = Series([2**63], dtype=np.uint64)
+
+        tm.assert_series_equal(algos.mode(s), exp)
+        tm.assert_series_equal(s.mode(), exp)
+
+        s = Series([1, 2**63], dtype=np.uint64)
+        exp = Series([], dtype=np.uint64)
+
+        tm.assert_series_equal(algos.mode(s), exp)
+        tm.assert_series_equal(s.mode(), exp)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/types/api.py b/pandas/types/api.py
@@ -44,6 +44,8 @@
                      is_floating_dtype,
                      is_bool_dtype,
                      is_complex_dtype,
+                     is_signed_integer_dtype,
+                     is_unsigned_integer_dtype,
 
                      # like
                      is_re,
diff --git a/pandas/types/common.py b/pandas/types/common.py
@@ -155,6 +155,18 @@ def is_integer_dtype(arr_or_dtype):
             not issubclass(tipo, (np.datetime64, np.timedelta64)))
 
 
+def is_signed_integer_dtype(arr_or_dtype):
+    tipo = _get_dtype_type(arr_or_dtype)
+    return (issubclass(tipo, np.signedinteger) and
+            not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
+def is_unsigned_integer_dtype(arr_or_dtype):
+    tipo = _get_dtype_type(arr_or_dtype)
+    return (issubclass(tipo, np.unsignedinteger) and
+            not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
 def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)