Skip to content

Commit bb82670

Browse files
committed
ENH: Create and propagate UInt64Index
1 parent 4c3d4d4 commit bb82670

17 files changed

+215
-20
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ Other enhancements
9999
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
100100
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
101101

102+
- New ``UInt64Index`` (subclass of ``NumericIndex``) specifically for indexing unsigned integers (:issue:`14935`)
102103
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
103104
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
104105
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/index.pyx

+61
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,67 @@ cdef class Int64Engine(IndexEngine):
426426

427427
return result
428428

429+
cdef class UInt64Engine(IndexEngine):
    """
    Index engine for labels stored as unsigned 64-bit integers.

    Routes value retrieval, hash-table creation, monotonicity checks and
    pad/backfill indexers to their ``uint64`` specialisations.
    """

    cdef _get_index_values(self):
        # Coerce whatever the value getter returns to uint64.
        return algos.ensure_uint64(self.vgetter())

    cdef _make_hash_table(self, n):
        return _hash.UInt64HashTable(n)

    def _call_monotonic(self, values):
        return algos.is_monotonic_uint64(values, timelike=False)

    def get_pad_indexer(self, other, limit=None):
        return algos.pad_uint64(self._get_index_values(), other,
                                limit=limit)

    def get_backfill_indexer(self, other, limit=None):
        return algos.backfill_uint64(self._get_index_values(), other,
                                     limit=limit)

    cdef _check_type(self, object val):
        # hash() raises TypeError for unhashable keys; bools and floats
        # are rejected with KeyError because they are never valid labels
        # for this engine.
        hash(val)
        if util.is_bool_object(val):
            raise KeyError(val)
        elif util.is_float_object(val):
            raise KeyError(val)

    cdef _maybe_get_bool_indexer(self, object val):
        # Linear scan for ``val``.
        #
        # Returns the integer position when exactly one element matches,
        # otherwise a boolean ndarray marking every match.
        # Raises KeyError when ``val`` is not an integer object or when
        # nothing matches.
        cdef:
            ndarray[uint8_t, cast=True] indexer
            ndarray[uint64_t] values
            # Py_ssize_t (not C int) so the match counter and the stored
            # position cannot truncate on arrays longer than INT_MAX.
            Py_ssize_t count = 0
            Py_ssize_t i, n
            uint64_t uval
            Py_ssize_t last_true

        if not util.is_integer_object(val):
            raise KeyError(val)

        # Convert once to a C uint64 so the loop compares native values.
        uval = val

        values = self._get_index_values()
        n = len(values)

        # ``indexer`` is a uint8 view onto the same buffer as ``result``,
        # so writing 0/1 through it fills the boolean mask.
        result = np.empty(n, dtype=bool)
        indexer = result.view(np.uint8)

        for i in range(n):
            if values[i] == uval:
                count += 1
                indexer[i] = 1
                last_true = i
            else:
                indexer[i] = 0

        if count == 0:
            raise KeyError(val)
        if count == 1:
            return last_true

        return result
489+
429490
cdef class Float64Engine(IndexEngine):
430491

431492
cdef _make_hash_table(self, n):

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,21 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
201201

202202
# if we are actually all equal to integers
203203
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
204+
from .numeric import (Int64Index, UInt64Index,
205+
Float64Index)
205206
try:
206207
res = data.astype('i8')
207208
if (res == data).all():
208209
return Int64Index(res, copy=copy,
209210
name=name)
211+
except (OverflowError, TypeError, ValueError):
212+
pass
213+
214+
try:
215+
res = data.astype('u8')
216+
if (res == data).all():
217+
return UInt64Index(res, copy=copy,
218+
name=name)
210219
except (TypeError, ValueError):
211220
pass
212221

@@ -235,9 +244,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235244
IncompatibleFrequency)
236245
if isinstance(data, PeriodIndex):
237246
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
247+
if issubclass(data.dtype.type, np.signedinteger):
239248
from .numeric import Int64Index
240249
return Int64Index(data, copy=copy, dtype=dtype, name=name)
250+
elif issubclass(data.dtype.type, np.unsignedinteger):
251+
from .numeric import UInt64Index
252+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
241253
elif issubclass(data.dtype.type, np.floating):
242254
from .numeric import Float64Index
243255
return Float64Index(data, copy=copy, dtype=dtype, name=name)
@@ -254,9 +266,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254266
if dtype is None:
255267
inferred = lib.infer_dtype(subarr)
256268
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
269+
from .numeric import Int64Index, UInt64Index
270+
try:
271+
return Int64Index(subarr.astype('i8'), copy=copy,
272+
name=name)
273+
except OverflowError:
274+
return UInt64Index(subarr.astype('u8'), copy=copy,
275+
name=name)
260276
elif inferred in ['floating', 'mixed-integer-float']:
261277
from .numeric import Float64Index
262278
return Float64Index(subarr, copy=copy, name=name)

pandas/indexes/numeric.py

+85
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,91 @@ def _assert_safe_casting(cls, data, subarr):
177177
Int64Index._add_logical_methods()
178178

179179

180+
class UInt64Index(NumericIndex):
    """
    Immutable ndarray implementing an ordered, sliceable set. The basic object
    storing axis labels for all pandas objects. UInt64Index is a special case
    of `Index` with purely unsigned integer labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: uint64)
    copy : bool
        Make a copy of input ndarray
    name : object
        Name to be stored in the index

    Notes
    -----
    An Index instance can **only** contain hashable objects
    """

    _typ = 'uint64index'
    _arrmap = _algos.arrmap_uint64
    _left_indexer_unique = _join.left_join_indexer_unique_uint64
    _left_indexer = _join.left_join_indexer_uint64
    _inner_indexer = _join.inner_join_indexer_uint64
    _outer_indexer = _join.outer_join_indexer_uint64

    _can_hold_na = False

    _engine_type = _index.UInt64Engine

    _default_dtype = np.uint64

    @property
    def inferred_type(self):
        """Inferred kind of the labels; always ``'integer'``."""
        return 'integer'

    @property
    def asi8(self):
        """The underlying values viewed as ``uint64`` (``'u8'``)."""
        # do not cache or you'll create a memory leak
        return self.values.view('u8')

    @property
    def is_all_dates(self):
        """
        Checks that all the labels are datetime objects; always False
        for this index type.
        """
        return False

    def _convert_scalar_indexer(self, key, kind=None):
        """
        convert a scalar indexer

        Parameters
        ----------
        key : label of the slice bound
        kind : {'ix', 'loc', 'getitem', 'iloc'} or None
        """

        assert kind in ['ix', 'loc', 'getitem', 'iloc', None]

        # don't coerce ilocs to integers
        if kind != 'iloc':
            key = self._maybe_cast_indexer(key)
        return (super(UInt64Index, self)
                ._convert_scalar_indexer(key, kind=kind))

    def _wrap_joined_index(self, joined, other):
        """
        Wrap the joined values in a new UInt64Index.

        The name is kept only when both operands share it.
        """
        name = self.name if self.name == other.name else None
        # Stay unsigned: wrapping in Int64Index would change the index
        # type and cannot represent labels >= 2**63.
        return UInt64Index(joined, name=name)

    @classmethod
    def _assert_safe_casting(cls, data, subarr):
        """
        Ensure incoming data can be represented as uints.

        Raises
        ------
        TypeError
            If ``data`` is not already unsigned and its values differ
            from ``subarr``, the result of casting.
        """
        # Only unsigned input is exempt from the round-trip check; signed
        # integers may hold negatives that wrap around when cast.
        if not issubclass(data.dtype.type, np.unsignedinteger):
            if not np.array_equal(data, subarr):
                raise TypeError('Unsafe NumPy casting, you must '
                                'explicitly cast')
260+
261+
UInt64Index._add_numeric_methods()
262+
UInt64Index._add_logical_methods()
263+
264+
180265
class Float64Index(NumericIndex):
181266
"""
182267
Immutable ndarray implementing an ordered, sliceable set. The basic object

pandas/src/algos_common_helper.pxi.in

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dtypes = [('float64', 'float64_t', 'np.float64', True, True),
2727
('object', 'object', 'object', True, False),
2828
('int32', 'int32_t', 'np.int32', False, True),
2929
('int64', 'int64_t', 'np.int64', False, True),
30+
('uint64', 'uint64_t', 'np.uint64', False, True),
3031
('bool', 'uint8_t', 'np.bool', False, True)]
3132

3233
def get_dispatch(dtypes):

pandas/src/inference.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
810810
floats[i] = <float64_t> val
811811
complexes[i] = <double complex> val
812812
if not seen_null:
813-
seen_uint = seen_uint or (val > npy_int64_max)
813+
seen_uint = seen_uint or (int(val) > npy_int64_max)
814814
seen_sint = seen_sint or (val < 0)
815815

816816
if seen_uint and seen_sint:

pandas/src/join_helper.pxi.in

+3-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ dtypes = [('float64', 'float64_t', 'np.float64'),
1515
('float32', 'float32_t', 'np.float32'),
1616
('object', 'object', 'object'),
1717
('int32', 'int32_t', 'np.int32'),
18-
('int64', 'int64_t', 'np.int64')]
18+
('int64', 'int64_t', 'np.int64'),
19+
('uint64', 'uint64_t', 'np.uint64')]
1920

2021
def get_dispatch(dtypes):
2122

@@ -404,4 +405,4 @@ def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left,
404405

405406
return result, lindexer, rindexer
406407

407-
{{endfor}}
408+
{{endfor}}

pandas/src/joins_func_helper.pxi.in

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1212
{{py:
1313

1414
# table_type, by_dtype
15-
by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')]
15+
by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t'),
16+
('UInt64HashTable', 'uint64_t')]
1617

1718
# on_dtype
1819
on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',

pandas/tests/indexes/common.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
import numpy as np
77

8-
from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex,
9-
MultiIndex, CategoricalIndex, DatetimeIndex,
8+
from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index,
9+
RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex,
1010
TimedeltaIndex, PeriodIndex, notnull)
1111
from pandas.types.common import needs_i8_conversion
1212
from pandas.util.testing import assertRaisesRegexp
@@ -744,7 +744,7 @@ def test_numpy_ufuncs(self):
744744
with tm.assertRaises(Exception):
745745
with np.errstate(all='ignore'):
746746
func(idx)
747-
elif isinstance(idx, (Float64Index, Int64Index)):
747+
elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)):
748748
# coerces to float (e.g. np.sin)
749749
with np.errstate(all='ignore'):
750750
result = func(idx)
@@ -765,7 +765,7 @@ def test_numpy_ufuncs(self):
765765
# raise TypeError or ValueError (PeriodIndex)
766766
with tm.assertRaises(Exception):
767767
func(idx)
768-
elif isinstance(idx, (Float64Index, Int64Index)):
768+
elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)):
769769
# results in bool array
770770
result = func(idx)
771771
exp = func(idx.values)
@@ -798,7 +798,7 @@ def test_hasnans_isnans(self):
798798
continue
799799
elif isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin):
800800
values[1] = pd.tslib.iNaT
801-
elif isinstance(index, Int64Index):
801+
elif isinstance(index, (Int64Index, UInt64Index)):
802802
continue
803803
else:
804804
values[1] = np.nan
@@ -838,7 +838,7 @@ def test_fillna(self):
838838

839839
if isinstance(index, pd.tseries.base.DatetimeIndexOpsMixin):
840840
values[1] = pd.tslib.iNaT
841-
elif isinstance(index, Int64Index):
841+
elif isinstance(index, (Int64Index, UInt64Index)):
842842
continue
843843
else:
844844
values[1] = np.nan

pandas/tests/indexes/test_base.py

+14
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def setUp(self):
4040
periodIndex=tm.makePeriodIndex(100),
4141
tdIndex=tm.makeTimedeltaIndex(100),
4242
intIndex=tm.makeIntIndex(100),
43+
uintIndex=tm.makeUIntIndex(100),
4344
rangeIndex=tm.makeIntIndex(100),
4445
floatIndex=tm.makeFloatIndex(100),
4546
boolIndex=Index([True, False]),
@@ -363,6 +364,19 @@ def test_constructor_dtypes_timedelta(self):
363364
pd.TimedeltaIndex(list(values), dtype=dtype)]:
364365
tm.assert_index_equal(res, idx)
365366

367+
def test_constructor_uint64(self):
    # Both an explicit uint64 dtype and plain construction from values
    # that exceed the int64 range must produce a UInt64Index.
    small = [1, 2, 3]
    large = [1, 2**63]  # 2**63 is one past the int64 maximum

    expected = pd.UInt64Index(small)
    result = pd.Index(small, dtype=np.uint64)
    tm.assert_index_equal(result, expected)

    # First with the dtype given explicitly, then relying on inference.
    for kwargs in ({'dtype': np.uint64}, {}):
        expected = pd.UInt64Index(large)
        result = pd.Index(large, **kwargs)
        tm.assert_index_equal(result, expected)
379+
366380
def test_view_with_args(self):
367381

368382
restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',

pandas/tests/types/test_generic.py

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class TestABCClasses(tm.TestCase):
2424
def test_abc_types(self):
2525
self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
2626
self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index)
27+
self.assertIsInstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index)
2728
self.assertIsInstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index)
2829
self.assertIsInstance(self.multi_index, gt.ABCMultiIndex)
2930
self.assertIsInstance(self.datetime_index, gt.ABCDatetimeIndex)

pandas/tests/types/test_inference.py

+7
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,13 @@ def test_maybe_convert_objects_uint64(self):
260260
exp = np.array([2**63], dtype=np.uint64)
261261
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
262262

263+
# NumPy bug: can't compare uint64 to int64, as that
264+
# results in both casting to float64, so we should
265+
# make sure that this function is robust against it
266+
arr = np.array([np.uint64(2**63)], dtype=object)
267+
exp = np.array([2**63], dtype=np.uint64)
268+
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
269+
263270
arr = np.array([2, -1], dtype=object)
264271
exp = np.array([2, -1], dtype=np.int64)
265272
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)

0 commit comments

Comments
 (0)