diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 798151971363e..0a6f37fef209b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`) - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) +- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) .. _whatsnew_0200.enhancements.other: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b4a61b26aceb3..05cfb1bd9ec27 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): - # TODO: This constructor is bugged for uint's, especially - # np.uint64 due to overflow. Test this for uint behavior - # once constructor has been fixed. 
uniques = Index(uniques) return labels, uniques @@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True): if is_period_type: keys = PeriodIndex._simple_new(keys, freq=freq) - elif is_integer_dtype(dtype): + elif is_signed_integer_dtype(dtype): values = _ensure_int64(values) keys, counts = htable.value_count_int64(values, dropna) + elif is_unsigned_integer_dtype(dtype): + values = _ensure_uint64(values) + keys, counts = htable.value_count_uint64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) keys, counts = htable.value_count_float64(values, dropna) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index d0bf4edfbc5d2..422f7b96c4462 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif inferred in ['floating', 'mixed-integer-float']: # If we are actually all equal to integers, - # then coerce to integer - from .numeric import (Int64Index, UInt64Index, - Float64Index) - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, - name=name) - except (OverflowError, TypeError, ValueError): - pass + # then coerce to integer. + out = cls._convert_to_int_index(data, copy, name) - # Conversion to int64 failed (possibly due to - # overflow), so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, - name=name) - except (TypeError, ValueError): - pass + # Conversion was successful. + if out is not None: + return out - # return an actual float index + # Return an actual float index. 
+ from .numeric import Float64Index return Float64Index(data, copy=copy, dtype=dtype, name=name) @@ -270,13 +257,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if dtype is None: inferred = lib.infer_dtype(subarr) if inferred == 'integer': - from .numeric import Int64Index, UInt64Index - try: - return Int64Index(subarr.astype('i8'), copy=copy, - name=name) - except OverflowError: - return UInt64Index(subarr.astype('u8'), copy=copy, - name=name) + out = cls._convert_to_int_index(subarr, copy, name) + + if out is not None: + return out + + return Index(subarr, copy=copy, + dtype=object, name=name) elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) @@ -351,6 +338,42 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, See each method's docstring. """ + @classmethod + def _convert_to_int_index(cls, data, copy, name): + """ + Attempt to convert an array of data into an integer index. + + Parameters + ---------- + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. + + Returns + ------- + int_index : data converted to either an Int64Index or a + UInt64Index, or None, if the conversion was + not successful. + """ + from .numeric import Int64Index, UInt64Index + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to + # overflow), so let's try now with uint64. 
+ try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + return None + + @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 044d3477271ad..c7acbf51a17e5 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -919,6 +919,10 @@ def test_constructor(self): res = Index([1, 2**63]) tm.assert_index_equal(res, idx) + idx = Index([-1, 2**63], dtype=object) + res = Index(np.array([-1, 2**63], dtype=object)) + tm.assert_index_equal(res, idx) + def test_get_indexer(self): target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) indexer = self.index.get_indexer(target) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 75dd887c9d290..99453b9793007 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -287,6 +287,23 @@ def test_complex_sorting(self): self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True) + def test_uint64_factorize(self): + data = np.array([2**63, 1, 2**63], dtype=np.uint64) + exp_labels = np.array([0, 1, 0], dtype=np.intp) + exp_uniques = np.array([2**63, 1], dtype=np.uint64) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + + data = np.array([2**63, -1, 2**63], dtype=object) + exp_labels = np.array([0, 1, 0], dtype=np.intp) + exp_uniques = np.array([2**63, -1], dtype=object) + + labels, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(labels, exp_labels) + tm.assert_numpy_array_equal(uniques, exp_uniques) + class TestUnique(tm.TestCase): _multiprocess_can_split_ = True @@ -626,6 +643,19 @@ def test_value_counts_normalized(self): index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) + def 
test_value_counts_uint64(self): + arr = np.array([2**63], dtype=np.uint64) + expected = Series([1], index=[2**63]) + result = algos.value_counts(arr) + + tm.assert_series_equal(result, expected) + + arr = np.array([-1, 2**63], dtype=object) + expected = Series([1, 1], index=[-1, 2**63]) + result = algos.value_counts(arr) + + tm.assert_series_equal(result, expected) + class TestDuplicated(tm.TestCase):