Skip to content

Commit 4f59a12

Browse files
committed
ENH: Create and propagate UInt64Index
1 parent 8b497e4 commit 4f59a12

17 files changed

+279
-192
lines changed

doc/source/whatsnew/v0.20.0.txt

+17-3
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,23 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
8888
df = pd.read_table(url, compression='bz2') # explicitly specify compression
8989
df.head(2)
9090

91+
.. _whatsnew_0200.enhancements.uint64_support:
92+
93+
Pandas has significantly improved support for operations involving unsigned,
94+
or purely non-negative, integers. Previously, handling these integers would
95+
result in improper rounding or data-type casting, leading to incorrect results.
96+
Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`).
97+
98+
.. ipython:: python
99+
100+
idx = pd.UInt64Index([1, 2, 3])
101+
df = pd.DataFrame(['a', 'b', 'c'], index=idx)
102+
df.index
103+
104+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
105+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
106+
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
107+
91108
.. _whatsnew_0200.enhancements.other:
92109

93110
Other enhancements
@@ -279,7 +296,6 @@ Bug Fixes
279296
~~~~~~~~~
280297

281298
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
282-
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
283299
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. ``astype()`` now raises an error for Series and DataFrames (:issue:`14265`)
284300
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type ``decimal.Decimal`` (:issue:`14827`)
285301
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -297,6 +313,4 @@ Bug Fixes
297313

298314

299315

300-
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301316
- Require at least version 0.23 of Cython to avoid problems with character encodings (:issue:`14699`)
302-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/index.pyx

+4-125
Original file line numberDiff line numberDiff line change
@@ -363,115 +363,6 @@ cdef class IndexEngine:
363363

364364
return result[0:count], missing[0:count_missing]
365365

366-
cdef class Int64Engine(IndexEngine):
367-
368-
cdef _get_index_values(self):
369-
return algos.ensure_int64(self.vgetter())
370-
371-
cdef _make_hash_table(self, n):
372-
return _hash.Int64HashTable(n)
373-
374-
def _call_monotonic(self, values):
375-
return algos.is_monotonic_int64(values, timelike=False)
376-
377-
def get_pad_indexer(self, other, limit=None):
378-
return algos.pad_int64(self._get_index_values(), other,
379-
limit=limit)
380-
381-
def get_backfill_indexer(self, other, limit=None):
382-
return algos.backfill_int64(self._get_index_values(), other,
383-
limit=limit)
384-
385-
cdef _check_type(self, object val):
386-
hash(val)
387-
if util.is_bool_object(val):
388-
raise KeyError(val)
389-
elif util.is_float_object(val):
390-
raise KeyError(val)
391-
392-
cdef _maybe_get_bool_indexer(self, object val):
393-
cdef:
394-
ndarray[uint8_t, cast=True] indexer
395-
ndarray[int64_t] values
396-
int count = 0
397-
Py_ssize_t i, n
398-
int64_t ival
399-
int last_true
400-
401-
if not util.is_integer_object(val):
402-
raise KeyError(val)
403-
404-
ival = val
405-
406-
values = self._get_index_values()
407-
n = len(values)
408-
409-
result = np.empty(n, dtype=bool)
410-
indexer = result.view(np.uint8)
411-
412-
for i in range(n):
413-
if values[i] == val:
414-
count += 1
415-
indexer[i] = 1
416-
last_true = i
417-
else:
418-
indexer[i] = 0
419-
420-
if count == 0:
421-
raise KeyError(val)
422-
if count == 1:
423-
return last_true
424-
425-
return result
426-
427-
cdef class Float64Engine(IndexEngine):
428-
429-
cdef _make_hash_table(self, n):
430-
return _hash.Float64HashTable(n)
431-
432-
cdef _get_index_values(self):
433-
return algos.ensure_float64(self.vgetter())
434-
435-
cdef _maybe_get_bool_indexer(self, object val):
436-
cdef:
437-
ndarray[uint8_t] indexer
438-
ndarray[float64_t] values
439-
int count = 0
440-
Py_ssize_t i, n
441-
int last_true
442-
443-
values = self._get_index_values()
444-
n = len(values)
445-
446-
result = np.empty(n, dtype=bool)
447-
indexer = result.view(np.uint8)
448-
449-
for i in range(n):
450-
if values[i] == val:
451-
count += 1
452-
indexer[i] = 1
453-
last_true = i
454-
else:
455-
indexer[i] = 0
456-
457-
if count == 0:
458-
raise KeyError(val)
459-
if count == 1:
460-
return last_true
461-
462-
return result
463-
464-
def _call_monotonic(self, values):
465-
return algos.is_monotonic_float64(values, timelike=False)
466-
467-
def get_pad_indexer(self, other, limit=None):
468-
return algos.pad_float64(self._get_index_values(), other,
469-
limit=limit)
470-
471-
def get_backfill_indexer(self, other, limit=None):
472-
return algos.backfill_float64(self._get_index_values(), other,
473-
limit=limit)
474-
475366

476367
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
477368
cdef:
@@ -510,22 +401,6 @@ _backfill_functions = {
510401
'float64': algos.backfill_float64
511402
}
512403

513-
cdef class ObjectEngine(IndexEngine):
514-
515-
cdef _make_hash_table(self, n):
516-
return _hash.PyObjectHashTable(n)
517-
518-
def _call_monotonic(self, values):
519-
return algos.is_monotonic_object(values, timelike=False)
520-
521-
def get_pad_indexer(self, other, limit=None):
522-
return algos.pad_object(self._get_index_values(), other,
523-
limit=limit)
524-
525-
def get_backfill_indexer(self, other, limit=None):
526-
return algos.backfill_object(self._get_index_values(), other,
527-
limit=limit)
528-
529404

530405
cdef class DatetimeEngine(Int64Engine):
531406

@@ -668,3 +543,7 @@ cdef inline _to_i8(object val):
668543

669544
cdef inline bint _is_utc(object tz):
670545
return tz is UTC or isinstance(tz, _du_utc)
546+
547+
548+
# Generated from template.
549+
include "index_class_helper.pxi"

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+24-6
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
199199
data = np.array(data, copy=copy, dtype=dtype)
200200
elif inferred in ['floating', 'mixed-integer-float']:
201201

202-
# if we are actually all equal to integers
202+
# If we are actually all equal to integers,
203203
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
204+
from .numeric import (Int64Index, UInt64Index,
205+
Float64Index)
205206
try:
206207
res = data.astype('i8')
207208
if (res == data).all():
208209
return Int64Index(res, copy=copy,
209210
name=name)
211+
except (OverflowError, TypeError, ValueError):
212+
pass
213+
214+
# Conversion to int64 failed (possibly due to
215+
# overflow), so let's try now with uint64.
216+
try:
217+
res = data.astype('u8')
218+
if (res == data).all():
219+
return UInt64Index(res, copy=copy,
220+
name=name)
210221
except (TypeError, ValueError):
211222
pass
212223

@@ -235,9 +246,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235246
IncompatibleFrequency)
236247
if isinstance(data, PeriodIndex):
237248
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
249+
if issubclass(data.dtype.type, np.signedinteger):
239250
from .numeric import Int64Index
240251
return Int64Index(data, copy=copy, dtype=dtype, name=name)
252+
elif issubclass(data.dtype.type, np.unsignedinteger):
253+
from .numeric import UInt64Index
254+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
241255
elif issubclass(data.dtype.type, np.floating):
242256
from .numeric import Float64Index
243257
return Float64Index(data, copy=copy, dtype=dtype, name=name)
@@ -254,9 +268,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254268
if dtype is None:
255269
inferred = lib.infer_dtype(subarr)
256270
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
271+
from .numeric import Int64Index, UInt64Index
272+
try:
273+
return Int64Index(subarr.astype('i8'), copy=copy,
274+
name=name)
275+
except OverflowError:
276+
return UInt64Index(subarr.astype('u8'), copy=copy,
277+
name=name)
260278
elif inferred in ['floating', 'mixed-integer-float']:
261279
from .numeric import Float64Index
262280
return Float64Index(subarr, copy=copy, name=name)

0 commit comments

Comments
 (0)