Skip to content

Commit 13f16c4

Browse files
gfyoung authored and AnkurDedania committed
BUG, TST: Patch handling of uint64 in algorithms
1) Patches handling of `uint64` in `value_counts`. 2) Adds tests for handling of `uint64` in `factorize`. xref pandas-dev#14934. Author: gfyoung <[email protected]>. Closes pandas-dev#15162 from gfyoung/core-algorithms-uint64-three and squashes the following commits: 1fb256b [gfyoung] BUG, TST: Patch uint64 behavior in value_counts and factorize.
1 parent 7c6b73e commit 13f16c4

File tree

5 files changed

+92
-29
lines changed

5 files changed

+92
-29
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
111111
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
112112
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
113113
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
114+
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
114115

115116
.. _whatsnew_0200.enhancements.other:
116117

pandas/core/algorithms.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
366366
if isinstance(values, Index):
367367
uniques = values._shallow_copy(uniques, name=None)
368368
elif isinstance(values, Series):
369-
# TODO: This constructor is bugged for uint's, especially
370-
# np.uint64 due to overflow. Test this for uint behavior
371-
# once constructor has been fixed.
372369
uniques = Index(uniques)
373370
return labels, uniques
374371

@@ -477,9 +474,12 @@ def _value_counts_arraylike(values, dropna=True):
477474
if is_period_type:
478475
keys = PeriodIndex._simple_new(keys, freq=freq)
479476

480-
elif is_integer_dtype(dtype):
477+
elif is_signed_integer_dtype(dtype):
481478
values = _ensure_int64(values)
482479
keys, counts = htable.value_count_int64(values, dropna)
480+
elif is_unsigned_integer_dtype(dtype):
481+
values = _ensure_uint64(values)
482+
keys, counts = htable.value_count_uint64(values, dropna)
483483
elif is_float_dtype(dtype):
484484
values = _ensure_float64(values)
485485
keys, counts = htable.value_count_float64(values, dropna)

pandas/indexes/base.py

+53-25
Original file line numberDiff line numberDiff line change
@@ -202,28 +202,15 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
202202
elif inferred in ['floating', 'mixed-integer-float']:
203203

204204
# If we are actually all equal to integers,
205-
# then coerce to integer
206-
from .numeric import (Int64Index, UInt64Index,
207-
Float64Index)
205+
# then coerce to integer.
208206
try:
209-
res = data.astype('i8', copy=False)
210-
if (res == data).all():
211-
return Int64Index(res, copy=copy,
212-
name=name)
213-
except (OverflowError, TypeError, ValueError):
207+
return cls._try_convert_to_int_index(
208+
data, copy, name)
209+
except ValueError:
214210
pass
215211

216-
# Conversion to int64 failed (possibly due to
217-
# overflow), so let's try now with uint64.
218-
try:
219-
res = data.astype('u8', copy=False)
220-
if (res == data).all():
221-
return UInt64Index(res, copy=copy,
222-
name=name)
223-
except (TypeError, ValueError):
224-
pass
225-
226-
# return an actual float index
212+
# Return an actual float index.
213+
from .numeric import Float64Index
227214
return Float64Index(data, copy=copy, dtype=dtype,
228215
name=name)
229216

@@ -270,13 +257,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
270257
if dtype is None:
271258
inferred = lib.infer_dtype(subarr)
272259
if inferred == 'integer':
273-
from .numeric import Int64Index, UInt64Index
274260
try:
275-
return Int64Index(subarr.astype('i8'), copy=copy,
276-
name=name)
277-
except OverflowError:
278-
return UInt64Index(subarr.astype('u8'), copy=copy,
279-
name=name)
261+
return cls._try_convert_to_int_index(
262+
subarr, copy, name)
263+
except ValueError:
264+
pass
265+
266+
return Index(subarr, copy=copy,
267+
dtype=object, name=name)
280268
elif inferred in ['floating', 'mixed-integer-float']:
281269
from .numeric import Float64Index
282270
return Float64Index(subarr, copy=copy, name=name)
@@ -597,6 +585,46 @@ def ravel(self, order='C'):
597585
return self._values.ravel(order=order)
598586

599587
# construction helpers
588+
@classmethod
def _try_convert_to_int_index(cls, data, copy, name):
    """
    Attempt to convert an array of data into an integer index.

    The conversion is only accepted when it is lossless, i.e. when the
    converted values compare equal to the original ones element-wise.
    Signed 64-bit integers are tried first, then unsigned 64-bit ones.

    Parameters
    ----------
    data : The data to convert. Must expose ``astype`` and support
           element-wise ``==`` (e.g. a numpy array).
    copy : Whether to copy the data or not.
    name : The name of the index returned.

    Returns
    -------
    int_index : data converted to either an Int64Index or a
                UInt64Index

    Raises
    ------
    ValueError if the conversion was not successful.
    """

    for dtype in ('i8', 'u8'):
        try:
            res = data.astype(dtype, copy=False)
            if (res == data).all():
                # Function-local import of the concrete index classes,
                # only needed once a lossless conversion was found.
                from .numeric import Int64Index, UInt64Index
                klass = Int64Index if dtype == 'i8' else UInt64Index
                return klass(res, copy=copy, name=name)
        except (OverflowError, TypeError, ValueError):
            # OverflowError has to be handled for 'u8' as well as for
            # 'i8': values that do not fit into 64 bits at all (for
            # instance 2**64 in an object array) overflow in both
            # attempts, and callers only expect ValueError from here.
            pass

    raise ValueError('data could not be losslessly converted to an '
                     'Int64Index or a UInt64Index')
627+
600628
@classmethod
601629
def _scalar_data_error(cls, data):
602630
raise TypeError('{0}(...) must be called with a collection of some '

pandas/tests/indexes/test_numeric.py

+4
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,10 @@ def test_constructor(self):
919919
res = Index([1, 2**63])
920920
tm.assert_index_equal(res, idx)
921921

922+
idx = Index([-1, 2**63], dtype=object)
923+
res = Index(np.array([-1, 2**63], dtype=object))
924+
tm.assert_index_equal(res, idx)
925+
922926
def test_get_indexer(self):
923927
target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63)
924928
indexer = self.index.get_indexer(target)

pandas/tests/test_algos.py

+30
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,23 @@ def test_complex_sorting(self):
287287

288288
self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True)
289289

290+
def test_uint64_factorize(self):
    """
    Check ``algos.factorize`` on values at and above ``2**63``.

    Two inputs are exercised: a ``uint64`` array, and an ``object``
    array that mixes ``2**63`` with a negative integer. In both cases
    the labels and the uniques must match the expected arrays exactly.
    """
    cases = [
        (np.array([2**63, 1, 2**63], dtype=np.uint64),
         np.array([2**63, 1], dtype=np.uint64)),
        (np.array([2**63, -1, 2**63], dtype=object),
         np.array([2**63, -1], dtype=object)),
    ]

    for data, exp_uniques in cases:
        # First and third element are equal, hence the repeated label.
        exp_labels = np.array([0, 1, 0], dtype=np.intp)

        labels, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(labels, exp_labels)
        tm.assert_numpy_array_equal(uniques, exp_uniques)
306+
290307

291308
class TestUnique(tm.TestCase):
292309
_multiprocess_can_split_ = True
@@ -626,6 +643,19 @@ def test_value_counts_normalized(self):
626643
index=Series([2.0, 1.0], dtype=t))
627644
tm.assert_series_equal(result, expected)
628645

646+
def test_value_counts_uint64(self):
    """
    Check ``algos.value_counts`` on values at and above ``2**63``.

    Two inputs are exercised: a ``uint64`` array holding ``2**63``,
    and an ``object`` array that mixes ``-1`` with ``2**63``. The
    resulting Series must equal the expected one, index included.
    """
    cases = [
        (np.array([2**63], dtype=np.uint64),
         Series([1], index=[2**63])),
        (np.array([-1, 2**63], dtype=object),
         Series([1, 1], index=[-1, 2**63])),
    ]

    for arr, expected in cases:
        result = algos.value_counts(arr)
        tm.assert_series_equal(result, expected)
658+
629659

630660
class TestDuplicated(tm.TestCase):
631661

0 commit comments

Comments
 (0)