BUG, TST: Patch uint64 behavior in value_counts and factorize

gfyoung · gfyoung · commit 1fb256be46a3 · 2017-01-21T01:59:44.000-08:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
 - Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
+- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
 .. _whatsnew_0200.enhancements.other:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     if isinstance(values, Index):
         uniques = values._shallow_copy(uniques, name=None)
     elif isinstance(values, Series):
-        # TODO: This constructor is bugged for uint's, especially
-        # np.uint64 due to overflow. Test this for uint behavior
-        # once constructor has been fixed.
         uniques = Index(uniques)
     return labels, uniques
 
@@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
         if is_period_type:
             keys = PeriodIndex._simple_new(keys, freq=freq)
 
-    elif is_integer_dtype(dtype):
+    elif is_signed_integer_dtype(dtype):
         values = _ensure_int64(values)
         keys, counts = htable.value_count_int64(values, dropna)
+    elif is_unsigned_integer_dtype(dtype):
+        values = _ensure_uint64(values)
+        keys, counts = htable.value_count_uint64(values, dropna)
     elif is_float_dtype(dtype):
         values = _ensure_float64(values)
         keys, counts = htable.value_count_float64(values, dropna)
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                         elif inferred in ['floating', 'mixed-integer-float']:
 
                             # If we are actually all equal to integers,
-                            # then coerce to integer
-                            from .numeric import (Int64Index, UInt64Index,
-                                                  Float64Index)
-                            try:
-                                res = data.astype('i8', copy=False)
-                                if (res == data).all():
-                                    return Int64Index(res, copy=copy,
-                                                      name=name)
-                            except (OverflowError, TypeError, ValueError):
-                                pass
+                            # then coerce to integer.
+                            out = cls._convert_to_int_index(data, copy, name)
 
-                            # Conversion to int64 failed (possibly due to
-                            # overflow), so let's try now with uint64.
-                            try:
-                                res = data.astype('u8', copy=False)
-                                if (res == data).all():
-                                    return UInt64Index(res, copy=copy,
-                                                       name=name)
-                            except (TypeError, ValueError):
-                                pass
+                            # Conversion was successful.
+                            if out is not None:
+                                return out
 
-                            # return an actual float index
+                            # Return an actual float index.
+                            from .numeric import Float64Index
                             return Float64Index(data, copy=copy, dtype=dtype,
                                                 name=name)
 
@@ -270,13 +257,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
             if dtype is None:
                 inferred = lib.infer_dtype(subarr)
                 if inferred == 'integer':
-                    from .numeric import Int64Index, UInt64Index
-                    try:
-                        return Int64Index(subarr.astype('i8'), copy=copy,
-                                          name=name)
-                    except OverflowError:
-                        return UInt64Index(subarr.astype('u8'), copy=copy,
-                                           name=name)
+                    out = cls._convert_to_int_index(subarr, copy, name)
+
+                    if out is not None:
+                        return out
+
+                    return Index(subarr, copy=copy,
+                                 dtype=object, name=name)
                 elif inferred in ['floating', 'mixed-integer-float']:
                     from .numeric import Float64Index
                     return Float64Index(subarr, copy=copy, name=name)
@@ -351,6 +338,42 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
     See each method's docstring.
     """
 
+    @classmethod
+    def _convert_to_int_index(cls, data, copy, name):
+        """
+        Attempt to convert an array of data into an integer index.
+
+        Parameters
+        ----------
+        data : Array to convert; must support ``astype`` and ``==``.
+        copy : Whether to copy the data or not.
+        name : The name of the index returned.
+
+        Returns
+        -------
+        int_index : data converted to either an Int64Index or a
+                    UInt64Index, or None if neither cast round-trips
+                    (including when a cast raises on overflow).
+        """
+        from .numeric import Int64Index, UInt64Index
+        try:
+            res = data.astype('i8', copy=False)
+            if (res == data).all():
+                return Int64Index(res, copy=copy, name=name)
+        except (OverflowError, TypeError, ValueError):
+            pass
+
+        # int64 did not round-trip; retry with uint64. OverflowError is
+        # caught here too: values too big for uint64 can raise it as well.
+        try:
+            res = data.astype('u8', copy=False)
+            if (res == data).all():
+                return UInt64Index(res, copy=copy, name=name)
+        except (OverflowError, TypeError, ValueError):
+            pass
+
+        return None
+
     @classmethod
     def _simple_new(cls, values, name=None, dtype=None, **kwargs):
         """
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -919,6 +919,10 @@ def test_constructor(self):
         res = Index([1, 2**63])
         tm.assert_index_equal(res, idx)
 
+        idx = Index([-1, 2**63], dtype=object)
+        res = Index(np.array([-1, 2**63], dtype=object))
+        tm.assert_index_equal(res, idx)
+
     def test_get_indexer(self):
         target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
         indexer = self.index.get_indexer(target)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -287,6 +287,23 @@ def test_complex_sorting(self):
 
         self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True)
 
+    def test_uint64_factorize(self):
+        data = np.array([2**63, 1, 2**63], dtype=np.uint64)  # above int64 max
+        exp_labels = np.array([0, 1, 0], dtype=np.intp)
+        exp_uniques = np.array([2**63, 1], dtype=np.uint64)
+
+        labels, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(labels, exp_labels)
+        tm.assert_numpy_array_equal(uniques, exp_uniques)
+
+        data = np.array([2**63, -1, 2**63], dtype=object)  # no 64-bit int fits
+        exp_labels = np.array([0, 1, 0], dtype=np.intp)
+        exp_uniques = np.array([2**63, -1], dtype=object)
+
+        labels, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(labels, exp_labels)
+        tm.assert_numpy_array_equal(uniques, exp_uniques)
+
 
 class TestUnique(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -626,6 +643,19 @@ def test_value_counts_normalized(self):
                               index=Series([2.0, 1.0], dtype=t))
             tm.assert_series_equal(result, expected)
 
+    def test_value_counts_uint64(self):
+        arr = np.array([2**63], dtype=np.uint64)  # 2**63 is above int64 max
+        expected = Series([1], index=[2**63])
+        result = algos.value_counts(arr)
+
+        tm.assert_series_equal(result, expected)
+
+        arr = np.array([-1, 2**63], dtype=object)  # fits no 64-bit int dtype
+        expected = Series([1, 1], index=[-1, 2**63])
+        result = algos.value_counts(arr)
+
+        tm.assert_series_equal(result, expected)
+
 
 class TestDuplicated(tm.TestCase):