Skip to content

Commit 22402d6

Browse files
committed
ENH: Create and propagate UInt64Index
[ci skip]
1 parent f79bc7a commit 22402d6

17 files changed

+275
-68
lines changed

doc/source/whatsnew/v0.20.0.txt

+40-3
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,46 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
8888
df = pd.read_table(url, compression='bz2') # explicitly specify compression
8989
df.head(2)
9090

91+
.. _whatsnew_0200.enhancements.uint64_support:
92+
93+
Pandas has significantly improved support for operations involving unsigned,
94+
or purely non-negative, integers. Previously, handling these integers would
95+
result in improper rounding or data-type casting, leading to incorrect results.
96+
One notable place where this improved was in ``DataFrame`` creation (:issue:`14917`):
97+
98+
.. ipython:: python
99+
100+
arr = np.array([1, 2**63, 5], dtype=np.uint64)
101+
arr
102+
103+
**Previous behavior**:
104+
105+
.. code-block:: ipython
106+
107+
In [3]: df = DataFrame({'a': arr})
108+
109+
In [4]: df['a']
110+
Out[4]:
111+
a
112+
0 1
113+
1 9223372036854775808
114+
2 5
115+
Name: a, dtype: object
116+
117+
The correct data type should be unsigned 64-bit integer, not object. This release
118+
rectifies this behavior as seen below:
119+
120+
**New behavior**:
121+
122+
.. ipython:: python
123+
124+
df = DataFrame({'a': arr})
125+
df['a']
126+
127+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
128+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
129+
- New ``UInt64Index`` (subclass of ``NumericIndex``) for specifically indexing unsigned integers (:issue:`14935`)
130+
91131
.. _whatsnew_0200.enhancements.other:
92132

93133
Other enhancements
@@ -279,7 +319,6 @@ Bug Fixes
279319
~~~~~~~~~
280320

281321
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
282-
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
283322
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
284323
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
285324
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -297,6 +336,4 @@ Bug Fixes
297336

298337

299338

300-
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301339
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
302-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/index.pyx

+61
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,67 @@ cdef class Int64Engine(IndexEngine):
424424

425425
return result
426426

427+
cdef class UInt64Engine(IndexEngine):
428+
429+
cdef _get_index_values(self):
430+
return algos.ensure_uint64(self.vgetter())
431+
432+
cdef _make_hash_table(self, n):
433+
return _hash.UInt64HashTable(n)
434+
435+
def _call_monotonic(self, values):
436+
return algos.is_monotonic_uint64(values, timelike=False)
437+
438+
def get_pad_indexer(self, other, limit=None):
439+
return algos.pad_uint64(self._get_index_values(), other,
440+
limit=limit)
441+
442+
def get_backfill_indexer(self, other, limit=None):
443+
return algos.backfill_uint64(self._get_index_values(), other,
444+
limit=limit)
445+
446+
cdef _check_type(self, object val):
447+
hash(val)
448+
if util.is_bool_object(val):
449+
raise KeyError(val)
450+
elif util.is_float_object(val):
451+
raise KeyError(val)
452+
453+
cdef _maybe_get_bool_indexer(self, object val):
454+
cdef:
455+
ndarray[uint8_t, cast=True] indexer
456+
ndarray[uint64_t] values
457+
int count = 0
458+
Py_ssize_t i, n
459+
uint64_t uval
460+
int last_true
461+
462+
if not util.is_integer_object(val):
463+
raise KeyError(val)
464+
465+
uval = val
466+
467+
values = self._get_index_values()
468+
n = len(values)
469+
470+
result = np.empty(n, dtype=bool)
471+
indexer = result.view(np.uint8)
472+
473+
for i in range(n):
474+
if values[i] == uval:
475+
count += 1
476+
indexer[i] = 1
477+
last_true = i
478+
else:
479+
indexer[i] = 0
480+
481+
if count == 0:
482+
raise KeyError(val)
483+
if count == 1:
484+
return last_true
485+
486+
return result
487+
427488
cdef class Float64Engine(IndexEngine):
428489

429490
cdef _make_hash_table(self, n):

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+24-6
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
199199
data = np.array(data, copy=copy, dtype=dtype)
200200
elif inferred in ['floating', 'mixed-integer-float']:
201201

202-
# if we are actually all equal to integers
202+
# If we are actually all equal to integers,
203203
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
204+
from .numeric import (Int64Index, UInt64Index,
205+
Float64Index)
205206
try:
206207
res = data.astype('i8')
207208
if (res == data).all():
208209
return Int64Index(res, copy=copy,
209210
name=name)
211+
except (OverflowError, TypeError, ValueError):
212+
pass
213+
214+
# Conversion to int64 failed (possibly due to
215+
# overflow), so let's try now with uint64.
216+
try:
217+
res = data.astype('u8')
218+
if (res == data).all():
219+
return UInt64Index(res, copy=copy,
220+
name=name)
210221
except (TypeError, ValueError):
211222
pass
212223

@@ -235,9 +246,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235246
IncompatibleFrequency)
236247
if isinstance(data, PeriodIndex):
237248
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
249+
if issubclass(data.dtype.type, np.signedinteger):
239250
from .numeric import Int64Index
240251
return Int64Index(data, copy=copy, dtype=dtype, name=name)
252+
elif issubclass(data.dtype.type, np.unsignedinteger):
253+
from .numeric import UInt64Index
254+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
241255
elif issubclass(data.dtype.type, np.floating):
242256
from .numeric import Float64Index
243257
return Float64Index(data, copy=copy, dtype=dtype, name=name)
@@ -254,9 +268,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254268
if dtype is None:
255269
inferred = lib.infer_dtype(subarr)
256270
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
271+
from .numeric import Int64Index, UInt64Index
272+
try:
273+
return Int64Index(subarr.astype('i8'), copy=copy,
274+
name=name)
275+
except OverflowError:
276+
return UInt64Index(subarr.astype('u8'), copy=copy,
277+
name=name)
260278
elif inferred in ['floating', 'mixed-integer-float']:
261279
from .numeric import Float64Index
262280
return Float64Index(subarr, copy=copy, name=name)

0 commit comments

Comments
 (0)