Skip to content

Commit 1fb256b

Browse files
committed
BUG, TST: Patch uint64 behavior in value_counts and factorize
1 parent 97fd744 commit 1fb256b

File tree

5 files changed

+89
-31
lines changed

5 files changed

+89
-31
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
110110
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
111111
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
112112
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
113+
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
113114

114115
.. _whatsnew_0200.enhancements.other:
115116

pandas/core/algorithms.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
366366
if isinstance(values, Index):
367367
uniques = values._shallow_copy(uniques, name=None)
368368
elif isinstance(values, Series):
369-
# TODO: This constructor is bugged for uint's, especially
370-
# np.uint64 due to overflow. Test this for uint behavior
371-
# once constructor has been fixed.
372369
uniques = Index(uniques)
373370
return labels, uniques
374371

@@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
477474
if is_period_type:
478475
keys = PeriodIndex._simple_new(keys, freq=freq)
479476

480-
elif is_integer_dtype(dtype):
477+
elif is_signed_integer_dtype(dtype):
481478
values = _ensure_int64(values)
482479
keys, counts = htable.value_count_int64(values, dropna)
480+
elif is_unsigned_integer_dtype(dtype):
481+
values = _ensure_uint64(values)
482+
keys, counts = htable.value_count_uint64(values, dropna)
483483
elif is_float_dtype(dtype):
484484
values = _ensure_float64(values)
485485
keys, counts = htable.value_count_float64(values, dropna)

pandas/indexes/base.py

+50-27
Original file line numberDiff line numberDiff line change
@@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
202202
elif inferred in ['floating', 'mixed-integer-float']:
203203

204204
# If we are actually all equal to integers,
205-
# then coerce to integer
206-
from .numeric import (Int64Index, UInt64Index,
207-
Float64Index)
208-
try:
209-
res = data.astype('i8', copy=False)
210-
if (res == data).all():
211-
return Int64Index(res, copy=copy,
212-
name=name)
213-
except (OverflowError, TypeError, ValueError):
214-
pass
205+
# then coerce to integer.
206+
out = cls._convert_to_int_index(data, copy, name)
215207

216-
# Conversion to int64 failed (possibly due to
217-
# overflow), so let's try now with uint64.
218-
try:
219-
res = data.astype('u8', copy=False)
220-
if (res == data).all():
221-
return UInt64Index(res, copy=copy,
222-
name=name)
223-
except (TypeError, ValueError):
224-
pass
208+
# Conversion was successful.
209+
if out is not None:
210+
return out
225211

226-
# return an actual float index
212+
# Return an actual float index.
213+
from .numeric import Float64Index
227214
return Float64Index(data, copy=copy, dtype=dtype,
228215
name=name)
229216

@@ -270,13 +257,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
270257
if dtype is None:
271258
inferred = lib.infer_dtype(subarr)
272259
if inferred == 'integer':
273-
from .numeric import Int64Index, UInt64Index
274-
try:
275-
return Int64Index(subarr.astype('i8'), copy=copy,
276-
name=name)
277-
except OverflowError:
278-
return UInt64Index(subarr.astype('u8'), copy=copy,
279-
name=name)
260+
out = cls._convert_to_int_index(subarr, copy, name)
261+
262+
if out is not None:
263+
return out
264+
265+
return Index(subarr, copy=copy,
266+
dtype=object, name=name)
280267
elif inferred in ['floating', 'mixed-integer-float']:
281268
from .numeric import Float64Index
282269
return Float64Index(subarr, copy=copy, name=name)
@@ -351,6 +338,42 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
351338
See each method's docstring.
352339
"""
353340

341+
@classmethod
def _convert_to_int_index(cls, data, copy, name):
    """
    Attempt to convert an array of data into an integer index.

    A conversion to ``Int64Index`` is attempted first, and a conversion
    to ``UInt64Index`` second. A conversion only counts as successful
    when every converted element compares equal to the corresponding
    element of the original data.

    Parameters
    ----------
    data : The data to convert.
    copy : Whether to copy the data or not.
    name : The name of the index returned.

    Returns
    -------
    int_index : data converted to either an Int64Index or a
                UInt64Index, or None, if the conversion was
                not successful.
    """
    from .numeric import Int64Index, UInt64Index

    # Signed 64-bit is tried before unsigned 64-bit.
    candidates = (('i8', Int64Index), ('u8', UInt64Index))

    for dtype_code, index_cls in candidates:
        try:
            res = data.astype(dtype_code, copy=False)
            if (res == data).all():
                return index_cls(res, copy=copy, name=name)
        except (OverflowError, TypeError, ValueError):
            # OverflowError must be caught for *both* dtypes: values
            # that fit in neither int64 nor uint64 would otherwise
            # propagate out of the uint64 attempt instead of letting
            # this method report failure by returning None.
            pass

    return None
376+
354377
@classmethod
355378
def _simple_new(cls, values, name=None, dtype=None, **kwargs):
356379
"""

pandas/tests/indexes/test_numeric.py

+4
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,10 @@ def test_constructor(self):
919919
res = Index([1, 2**63])
920920
tm.assert_index_equal(res, idx)
921921

922+
idx = Index([-1, 2**63], dtype=object)
923+
res = Index(np.array([-1, 2**63], dtype=object))
924+
tm.assert_index_equal(res, idx)
925+
922926
def test_get_indexer(self):
923927
target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
924928
indexer = self.index.get_indexer(target)

pandas/tests/test_algos.py

+30
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,23 @@ def test_complex_sorting(self):
287287

288288
self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True)
289289

290+
def test_uint64_factorize(self):
    """
    Check ``algos.factorize`` on inputs containing ``2**63``.

    Two inputs are exercised: a ``uint64`` array, and an ``object``
    array that mixes ``2**63`` with ``-1``. In both cases the expected
    labels are ``[0, 1, 0]`` and the expected uniques keep the dtype
    of the input.
    """
    big = 2**63
    expected_codes = np.array([0, 1, 0], dtype=np.intp)

    # (input values, expected uniques) pairs.
    cases = [
        (np.array([big, 1, big], dtype=np.uint64),
         np.array([big, 1], dtype=np.uint64)),
        (np.array([big, -1, big], dtype=object),
         np.array([big, -1], dtype=object)),
    ]

    for values, expected_uniques in cases:
        codes, found = algos.factorize(values)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(found, expected_uniques)
306+
290307

291308
class TestUnique(tm.TestCase):
292309
_multiprocess_can_split_ = True
@@ -626,6 +643,19 @@ def test_value_counts_normalized(self):
626643
index=Series([2.0, 1.0], dtype=t))
627644
tm.assert_series_equal(result, expected)
628645

646+
def test_value_counts_uint64(self):
    """
    Check ``algos.value_counts`` on inputs containing ``2**63``.

    Covers a single-element ``uint64`` array and an ``object`` array
    mixing ``-1`` with ``2**63``; each distinct value is expected to
    appear in the result index with a count of one.
    """
    big = 2**63

    # uint64 input.
    counts = algos.value_counts(np.array([big], dtype=np.uint64))
    tm.assert_series_equal(counts, Series([1], index=[big]))

    # object input holding a negative value next to 2**63.
    counts = algos.value_counts(np.array([-1, big], dtype=object))
    tm.assert_series_equal(counts, Series([1, 1], index=[-1, big]))
658+
629659

630660
class TestDuplicated(tm.TestCase):
631661

0 commit comments

Comments
 (0)