Skip to content

Commit f174869

Browse files
committed
BUG: Use value_count_uint64 in value_counts
1 parent 684c4d5 commit f174869

File tree

5 files changed

+71
-31
lines changed

5 files changed

+71
-31
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
110110
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
111111
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
112112
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
113+
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
113114

114115
.. _whatsnew_0200.enhancements.other:
115116

pandas/core/algorithms.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
366366
if isinstance(values, Index):
367367
uniques = values._shallow_copy(uniques, name=None)
368368
elif isinstance(values, Series):
369-
# TODO: This constructor is bugged for uint's, especially
370-
# np.uint64 due to overflow. Test this for uint behavior
371-
# once constructor has been fixed.
372369
uniques = Index(uniques)
373370
return labels, uniques
374371

@@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
477474
if is_period_type:
478475
keys = PeriodIndex._simple_new(keys, freq=freq)
479476

480-
elif is_integer_dtype(dtype):
477+
elif is_signed_integer_dtype(dtype):
481478
values = _ensure_int64(values)
482479
keys, counts = htable.value_count_int64(values, dropna)
480+
elif is_unsigned_integer_dtype(dtype):
481+
values = _ensure_uint64(values)
482+
keys, counts = htable.value_count_uint64(values, dropna)
483483
elif is_float_dtype(dtype):
484484
values = _ensure_float64(values)
485485
keys, counts = htable.value_count_float64(values, dropna)

pandas/indexes/base.py

+49-27
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,41 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject):
144144
def __new__(cls, data=None, dtype=None, copy=False, name=None,
145145
fastpath=False, tupleize_cols=True, **kwargs):
146146

147+
def convert_to_int_index(_data, _copy, _name):
148+
"""
149+
Attempt to convert an array of data into an integer index.
150+
151+
Parameters
152+
----------
153+
_data : The data to convert.
154+
_copy : Whether to copy the data or not.
155+
_name : The name of the index returned.
156+
157+
Returns
158+
-------
159+
int_index : _data converted to either an Int64Index or a
160+
UInt64Index, or None, if the conversion was
161+
not successful.
162+
"""
163+
from .numeric import Int64Index, UInt64Index
164+
try:
165+
res = _data.astype('i8', copy=False)
166+
if (res == _data).all():
167+
return Int64Index(res, copy=_copy,
168+
name=_name)
169+
except (OverflowError, TypeError, ValueError):
170+
pass
171+
172+
# Conversion to int64 failed (possibly due to
173+
# overflow), so let's try now with uint64.
174+
try:
175+
res = _data.astype('u8', copy=False)
176+
if (res == _data).all():
177+
return UInt64Index(res, copy=_copy,
178+
name=_name)
179+
except (TypeError, ValueError):
180+
return None
181+
147182
if name is None and hasattr(data, 'name'):
148183
name = data.name
149184

@@ -202,28 +237,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
202237
elif inferred in ['floating', 'mixed-integer-float']:
203238

204239
# If we are actually all equal to integers,
205-
# then coerce to integer
206-
from .numeric import (Int64Index, UInt64Index,
207-
Float64Index)
208-
try:
209-
res = data.astype('i8', copy=False)
210-
if (res == data).all():
211-
return Int64Index(res, copy=copy,
212-
name=name)
213-
except (OverflowError, TypeError, ValueError):
214-
pass
240+
# then coerce to integer.
241+
out = convert_to_int_index(data, copy, name)
215242

216-
# Conversion to int64 failed (possibly due to
217-
# overflow), so let's try now with uint64.
218-
try:
219-
res = data.astype('u8', copy=False)
220-
if (res == data).all():
221-
return UInt64Index(res, copy=copy,
222-
name=name)
223-
except (TypeError, ValueError):
224-
pass
243+
# Conversion was successful.
244+
if out is not None:
245+
return out
225246

226-
# return an actual float index
247+
# Return an actual float index.
248+
from .numeric import Float64Index
227249
return Float64Index(data, copy=copy, dtype=dtype,
228250
name=name)
229251

@@ -270,13 +292,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
270292
if dtype is None:
271293
inferred = lib.infer_dtype(subarr)
272294
if inferred == 'integer':
273-
from .numeric import Int64Index, UInt64Index
274-
try:
275-
return Int64Index(subarr.astype('i8'), copy=copy,
276-
name=name)
277-
except OverflowError:
278-
return UInt64Index(subarr.astype('u8'), copy=copy,
279-
name=name)
295+
out = convert_to_int_index(subarr, copy, name)
296+
297+
if out is not None:
298+
return out
299+
else:
300+
return Index(subarr, copy=copy,
301+
dtype=object, name=name)
280302
elif inferred in ['floating', 'mixed-integer-float']:
281303
from .numeric import Float64Index
282304
return Float64Index(subarr, copy=copy, name=name)

pandas/tests/indexes/test_numeric.py

+4
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,10 @@ def test_constructor(self):
919919
res = Index([1, 2**63])
920920
tm.assert_index_equal(res, idx)
921921

922+
idx = Index([-1, 2**63], dtype=object)
923+
res = Index(np.array([-1, 2**63], dtype=object))
924+
tm.assert_index_equal(res, idx)
925+
922926
def test_get_indexer(self):
923927
target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
924928
indexer = self.index.get_indexer(target)

pandas/tests/test_algos.py

+13
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,19 @@ def test_value_counts_normalized(self):
626626
index=Series([2.0, 1.0], dtype=t))
627627
tm.assert_series_equal(result, expected)
628628

629+
def test_value_counts_uint64(self):
630+
arr = np.array([2**63], dtype=np.uint64)
631+
expected = Series([1], index=[2**63])
632+
result = algos.value_counts(arr)
633+
634+
tm.assert_series_equal(result, expected)
635+
636+
arr = np.array([-1, 2**63], dtype=object)
637+
expected = Series([1, 1], index=[-1, 2**63])
638+
result = algos.value_counts(arr)
639+
640+
tm.assert_series_equal(result, expected)
641+
629642

630643
class TestDuplicated(tm.TestCase):
631644

0 commit comments

Comments
 (0)