Skip to content

Commit cd9c6eb

Browse files
committed
ENH: Create and propagate UInt64Index
1 parent 6bea827 commit cd9c6eb

19 files changed

+370
-257
lines changed

doc/source/whatsnew/v0.20.0.txt

+18-4
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,24 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
8888
df = pd.read_table(url, compression='bz2') # explicitly specify compression
8989
df.head(2)
9090

91+
.. _whatsnew_0200.enhancements.uint64_support:
92+
93+
Pandas has significantly improved support for operations involving unsigned,
94+
or purely non-negative, integers. Previously, handling these integers would
95+
result in improper rounding or data-type casting, leading to incorrect results.
96+
Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`).
97+
98+
.. ipython:: python
99+
100+
idx = pd.UInt64Index([1, 2, 3])
101+
df = pd.DataFrame(['a', 'b', 'c'], index=idx)
102+
df.index
103+
104+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
105+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
106+
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
107+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
108+
91109
.. _whatsnew_0200.enhancements.other:
92110

93111
Other enhancements
@@ -279,7 +297,6 @@ Bug Fixes
279297
~~~~~~~~~
280298

281299
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
282-
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
283300
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
284301
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type ``decimal.Decimal`` (:issue:`14827`)
285302
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -299,8 +316,6 @@ Bug Fixes
299316

300317

301318

302-
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
303-
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
304319

305320

306321

@@ -316,5 +331,4 @@ Bug Fixes
316331

317332

318333
- Require at least version 0.23 of Cython to avoid problems with character encodings (:issue:`14699`)
319-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
320334
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/core/indexing.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import numpy as np
44
from pandas.compat import range, zip
55
import pandas.compat as compat
6-
from pandas.types.generic import ABCDataFrame, ABCPanel, ABCSeries
6+
from pandas.types.generic import (ABCDataFrame, ABCPanel,
7+
ABCSeries, ABCUInt64Index)
78
from pandas.types.common import (is_integer_dtype,
89
is_integer, is_float,
910
is_categorical_dtype,
@@ -859,15 +860,22 @@ def _convert_for_reindex(self, key, axis=0):
859860
return labels[key]
860861
else:
861862
if isinstance(key, Index):
862-
# want Index objects to pass through untouched
863-
keyarr = key
863+
if isinstance(labels, ABCUInt64Index) and key.is_integer():
864+
keyarr = key.astype(np.uint64)
865+
else:
866+
keyarr = key
864867
else:
865868
# asarray can be unsafe, NumPy strings are weird
866869
keyarr = _asarray_tuplesafe(key)
867870

868-
if is_integer_dtype(keyarr) and not labels.is_integer():
869-
keyarr = _ensure_platform_int(keyarr)
870-
return labels.take(keyarr)
871+
if is_integer_dtype(keyarr):
872+
if isinstance(labels, ABCUInt64Index):
873+
keyarr = _asarray_tuplesafe(keyarr, dtype=np.uint64)
874+
return keyarr
875+
876+
if not labels.is_integer():
877+
keyarr = _ensure_platform_int(keyarr)
878+
return labels.take(keyarr)
871879

872880
return keyarr
873881

@@ -1043,12 +1051,17 @@ def _getitem_iterable(self, key, axis=0):
10431051
return self.obj.take(inds, axis=axis, convert=False)
10441052
else:
10451053
if isinstance(key, Index):
1046-
# want Index objects to pass through untouched
1047-
keyarr = key
1054+
if isinstance(labels, ABCUInt64Index) and key.is_integer():
1055+
keyarr = key.astype(np.uint64)
1056+
else:
1057+
keyarr = key
10481058
else:
1049-
# asarray can be unsafe, NumPy strings are weird
10501059
keyarr = _asarray_tuplesafe(key)
10511060

1061+
if (is_integer_dtype(keyarr) and
1062+
isinstance(labels, ABCUInt64Index)):
1063+
keyarr = _asarray_tuplesafe(key, dtype=np.uint64)
1064+
10521065
if is_categorical_dtype(labels):
10531066
keyarr = labels._shallow_copy(keyarr)
10541067

pandas/index.pyx

+4-125
Original file line numberDiff line numberDiff line change
@@ -363,115 +363,6 @@ cdef class IndexEngine:
363363

364364
return result[0:count], missing[0:count_missing]
365365

366-
cdef class Int64Engine(IndexEngine):
367-
368-
cdef _get_index_values(self):
369-
return algos.ensure_int64(self.vgetter())
370-
371-
cdef _make_hash_table(self, n):
372-
return _hash.Int64HashTable(n)
373-
374-
def _call_monotonic(self, values):
375-
return algos.is_monotonic_int64(values, timelike=False)
376-
377-
def get_pad_indexer(self, other, limit=None):
378-
return algos.pad_int64(self._get_index_values(), other,
379-
limit=limit)
380-
381-
def get_backfill_indexer(self, other, limit=None):
382-
return algos.backfill_int64(self._get_index_values(), other,
383-
limit=limit)
384-
385-
cdef _check_type(self, object val):
386-
hash(val)
387-
if util.is_bool_object(val):
388-
raise KeyError(val)
389-
elif util.is_float_object(val):
390-
raise KeyError(val)
391-
392-
cdef _maybe_get_bool_indexer(self, object val):
393-
cdef:
394-
ndarray[uint8_t, cast=True] indexer
395-
ndarray[int64_t] values
396-
int count = 0
397-
Py_ssize_t i, n
398-
int64_t ival
399-
int last_true
400-
401-
if not util.is_integer_object(val):
402-
raise KeyError(val)
403-
404-
ival = val
405-
406-
values = self._get_index_values()
407-
n = len(values)
408-
409-
result = np.empty(n, dtype=bool)
410-
indexer = result.view(np.uint8)
411-
412-
for i in range(n):
413-
if values[i] == val:
414-
count += 1
415-
indexer[i] = 1
416-
last_true = i
417-
else:
418-
indexer[i] = 0
419-
420-
if count == 0:
421-
raise KeyError(val)
422-
if count == 1:
423-
return last_true
424-
425-
return result
426-
427-
cdef class Float64Engine(IndexEngine):
428-
429-
cdef _make_hash_table(self, n):
430-
return _hash.Float64HashTable(n)
431-
432-
cdef _get_index_values(self):
433-
return algos.ensure_float64(self.vgetter())
434-
435-
cdef _maybe_get_bool_indexer(self, object val):
436-
cdef:
437-
ndarray[uint8_t] indexer
438-
ndarray[float64_t] values
439-
int count = 0
440-
Py_ssize_t i, n
441-
int last_true
442-
443-
values = self._get_index_values()
444-
n = len(values)
445-
446-
result = np.empty(n, dtype=bool)
447-
indexer = result.view(np.uint8)
448-
449-
for i in range(n):
450-
if values[i] == val:
451-
count += 1
452-
indexer[i] = 1
453-
last_true = i
454-
else:
455-
indexer[i] = 0
456-
457-
if count == 0:
458-
raise KeyError(val)
459-
if count == 1:
460-
return last_true
461-
462-
return result
463-
464-
def _call_monotonic(self, values):
465-
return algos.is_monotonic_float64(values, timelike=False)
466-
467-
def get_pad_indexer(self, other, limit=None):
468-
return algos.pad_float64(self._get_index_values(), other,
469-
limit=limit)
470-
471-
def get_backfill_indexer(self, other, limit=None):
472-
return algos.backfill_float64(self._get_index_values(), other,
473-
limit=limit)
474-
475366

476367
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
477368
cdef:
@@ -510,22 +401,6 @@ _backfill_functions = {
510401
'float64': algos.backfill_float64
511402
}
512403

513-
cdef class ObjectEngine(IndexEngine):
514-
515-
cdef _make_hash_table(self, n):
516-
return _hash.PyObjectHashTable(n)
517-
518-
def _call_monotonic(self, values):
519-
return algos.is_monotonic_object(values, timelike=False)
520-
521-
def get_pad_indexer(self, other, limit=None):
522-
return algos.pad_object(self._get_index_values(), other,
523-
limit=limit)
524-
525-
def get_backfill_indexer(self, other, limit=None):
526-
return algos.backfill_object(self._get_index_values(), other,
527-
limit=limit)
528-
529404

530405
cdef class DatetimeEngine(Int64Engine):
531406

@@ -668,3 +543,7 @@ cdef inline _to_i8(object val):
668543

669544
cdef inline bint _is_utc(object tz):
670545
return tz is UTC or isinstance(tz, _du_utc)
546+
547+
548+
# Generated from template.
549+
include "index_class_helper.pxi"

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+24-6
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
199199
data = np.array(data, copy=copy, dtype=dtype)
200200
elif inferred in ['floating', 'mixed-integer-float']:
201201

202-
# if we are actually all equal to integers
202+
# If we are actually all equal to integers,
203203
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
204+
from .numeric import (Int64Index, UInt64Index,
205+
Float64Index)
205206
try:
206207
res = data.astype('i8')
207208
if (res == data).all():
208209
return Int64Index(res, copy=copy,
209210
name=name)
211+
except (OverflowError, TypeError, ValueError):
212+
pass
213+
214+
# Conversion to int64 failed (possibly due to
215+
# overflow), so let's try now with uint64.
216+
try:
217+
res = data.astype('u8')
218+
if (res == data).all():
219+
return UInt64Index(res, copy=copy,
220+
name=name)
210221
except (TypeError, ValueError):
211222
pass
212223

@@ -235,9 +246,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235246
IncompatibleFrequency)
236247
if isinstance(data, PeriodIndex):
237248
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
249+
if issubclass(data.dtype.type, np.signedinteger):
239250
from .numeric import Int64Index
240251
return Int64Index(data, copy=copy, dtype=dtype, name=name)
252+
elif issubclass(data.dtype.type, np.unsignedinteger):
253+
from .numeric import UInt64Index
254+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
241255
elif issubclass(data.dtype.type, np.floating):
242256
from .numeric import Float64Index
243257
return Float64Index(data, copy=copy, dtype=dtype, name=name)
@@ -254,9 +268,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254268
if dtype is None:
255269
inferred = lib.infer_dtype(subarr)
256270
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
271+
from .numeric import Int64Index, UInt64Index
272+
try:
273+
return Int64Index(subarr.astype('i8'), copy=copy,
274+
name=name)
275+
except OverflowError:
276+
return UInt64Index(subarr.astype('u8'), copy=copy,
277+
name=name)
260278
elif inferred in ['floating', 'mixed-integer-float']:
261279
from .numeric import Float64Index
262280
return Float64Index(subarr, copy=copy, name=name)

0 commit comments

Comments
 (0)