BUG: Use value_count_uint64 in value_counts

gfyoung · gfyoung · commit f1748699b2e4 · 2017-01-19T02:14:49.000-08:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
 - Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
+- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
 .. _whatsnew_0200.enhancements.other:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     if isinstance(values, Index):
         uniques = values._shallow_copy(uniques, name=None)
     elif isinstance(values, Series):
-        # TODO: This constructor is bugged for uint's, especially
-        # np.uint64 due to overflow. Test this for uint behavior
-        # once constructor has been fixed.
         uniques = Index(uniques)
     return labels, uniques
 
@@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
         if is_period_type:
             keys = PeriodIndex._simple_new(keys, freq=freq)
 
-    elif is_integer_dtype(dtype):
+    elif is_signed_integer_dtype(dtype):
         values = _ensure_int64(values)
         keys, counts = htable.value_count_int64(values, dropna)
+    elif is_unsigned_integer_dtype(dtype):
+        values = _ensure_uint64(values)
+        keys, counts = htable.value_count_uint64(values, dropna)
     elif is_float_dtype(dtype):
         values = _ensure_float64(values)
         keys, counts = htable.value_count_float64(values, dropna)
diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -144,6 +144,41 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject):
     def __new__(cls, data=None, dtype=None, copy=False, name=None,
                 fastpath=False, tupleize_cols=True, **kwargs):
 
+        def convert_to_int_index(_data, _copy, _name):
+            """
+            Attempt to convert an array of data into an integer index.
+
+            Parameters
+            ----------
+            _data : The array of data to convert.
+            _copy : Whether to copy the data or not.
+            _name : The name of the index returned.
+
+            Returns
+            -------
+            int_index : An Int64Index or a UInt64Index holding _data when
+                        every element survives the cast unchanged, and
+                        None otherwise.
+            """
+            from .numeric import Int64Index, UInt64Index
+            try:
+                res = _data.astype('i8', copy=False)
+                if (res == _data).all():
+                    return Int64Index(res, copy=_copy, name=_name)
+            except (OverflowError, TypeError, ValueError):
+                pass
+
+            # Conversion to int64 failed (possibly due to
+            # overflow), so let's try now with uint64.
+            try:
+                res = _data.astype('u8', copy=False)
+                if (res == _data).all():
+                    return UInt64Index(res, copy=_copy, name=_name)
+            except (OverflowError, TypeError, ValueError):
+                pass
+            # Neither integer dtype represents the data exactly.
+            return None
+
         if name is None and hasattr(data, 'name'):
             name = data.name
 
@@ -202,28 +237,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                         elif inferred in ['floating', 'mixed-integer-float']:
 
                             # If we are actually all equal to integers,
-                            # then coerce to integer
-                            from .numeric import (Int64Index, UInt64Index,
-                                                  Float64Index)
-                            try:
-                                res = data.astype('i8', copy=False)
-                                if (res == data).all():
-                                    return Int64Index(res, copy=copy,
-                                                      name=name)
-                            except (OverflowError, TypeError, ValueError):
-                                pass
+                            # then coerce to integer.
+                            out = convert_to_int_index(data, copy, name)
 
-                            # Conversion to int64 failed (possibly due to
-                            # overflow), so let's try now with uint64.
-                            try:
-                                res = data.astype('u8', copy=False)
-                                if (res == data).all():
-                                    return UInt64Index(res, copy=copy,
-                                                       name=name)
-                            except (TypeError, ValueError):
-                                pass
+                            # Conversion was successful.
+                            if out is not None:
+                                return out
 
-                            # return an actual float index
+                            # Return an actual float index.
+                            from .numeric import Float64Index
                             return Float64Index(data, copy=copy, dtype=dtype,
                                                 name=name)
 
@@ -270,13 +292,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
             if dtype is None:
                 inferred = lib.infer_dtype(subarr)
                 if inferred == 'integer':
-                    from .numeric import Int64Index, UInt64Index
-                    try:
-                        return Int64Index(subarr.astype('i8'), copy=copy,
-                                          name=name)
-                    except OverflowError:
-                        return UInt64Index(subarr.astype('u8'), copy=copy,
-                                           name=name)
+                    out = convert_to_int_index(subarr, copy, name)
+
+                    if out is not None:
+                        return out
+                    else:
+                        return Index(subarr, copy=copy,
+                                     dtype=object, name=name)
                 elif inferred in ['floating', 'mixed-integer-float']:
                     from .numeric import Float64Index
                     return Float64Index(subarr, copy=copy, name=name)
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
@@ -919,6 +919,10 @@ def test_constructor(self):
         res = Index([1, 2**63])
         tm.assert_index_equal(res, idx)
 
+        idx = Index([-1, 2**63], dtype=object)
+        res = Index(np.array([-1, 2**63], dtype=object))
+        tm.assert_index_equal(res, idx)
+
     def test_get_indexer(self):
         target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
         indexer = self.index.get_indexer(target)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -626,6 +626,19 @@ def test_value_counts_normalized(self):
                               index=Series([2.0, 1.0], dtype=t))
             tm.assert_series_equal(result, expected)
 
+    def test_value_counts_uint64(self):
+        # GH 14934: 2**63 is one past the largest int64 value.
+        values = np.array([2**63], dtype=np.uint64)
+        result = algos.value_counts(values)
+        expected = Series([1], index=[2**63])
+        tm.assert_series_equal(result, expected)
+
+        # A negative value next to 2**63 fits neither int64 nor uint64.
+        values = np.array([-1, 2**63], dtype=object)
+        result = algos.value_counts(values)
+        expected = Series([1, 1], index=[-1, 2**63])
+        tm.assert_series_equal(result, expected)
+
 
 class TestDuplicated(tm.TestCase):