Skip to content

Commit dde6312

Browse files
committed
ENH: Create and propagate UInt64Index
[ci skip]
1 parent f79bc7a commit dde6312

17 files changed

+257
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

+40-3
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,46 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
8888
df = pd.read_table(url, compression='bz2') # explicitly specify compression
8989
df.head(2)
9090

91+
.. _whatsnew_0200.enhancements.uint64_support:
92+
93+
Pandas has significantly improved support for operations involving unsigned,
94+
or purely non-negative, integers. Previously, handling these integers would
95+
result in improper rounding or data-type casting, leading to incorrect results.
96+
One notable place where this improved was in ``DataFrame`` creation (:issue:`14917`):
97+
98+
.. ipython:: python
99+
100+
arr = np.array([1, 2**63, 5], dtype=np.uint64)
101+
arr
102+
103+
**Previous behavior**:
104+
105+
.. code-block:: ipython
106+
107+
In [3]: df = DataFrame({'a': arr})
108+
109+
In [4]: df['a']
110+
Out[4]:
111+
a
112+
0 1
113+
1 9223372036854775808
114+
2 5
115+
Name: a, dtype: object
116+
117+
The correct data type should be unsigned 64-bit integer, not object. This release
118+
rectifies this behavior as seen below:
119+
120+
**New behavior**:
121+
122+
.. ipython:: python
123+
124+
df = DataFrame({'a': arr})
125+
df['a']
126+
127+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
128+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
129+
- New ``UInt64Index`` (subclass of ``NumericIndex``) for specifically indexing unsigned integers (:issue:`14935`)
130+
91131
.. _whatsnew_0200.enhancements.other:
92132

93133
Other enhancements
@@ -279,7 +319,6 @@ Bug Fixes
279319
~~~~~~~~~
280320

281321
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
282-
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
283322
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
284323
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
285324
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -297,6 +336,4 @@ Bug Fixes
297336

298337

299338

300-
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
301339
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
302-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/index.pyx

+61
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,67 @@ cdef class Int64Engine(IndexEngine):
424424

425425
return result
426426

427+
cdef class UInt64Engine(IndexEngine):
    """
    Index engine for labels stored as unsigned 64-bit integers.

    Mirrors the other typed engines in this module, but dispatches to the
    ``uint64`` variants of the hash table and of the ``algos`` helpers.
    """

    cdef _get_index_values(self):
        # NOTE(review): presumably coerces the underlying values to a
        # uint64 ndarray -- confirm against algos.ensure_uint64.
        return algos.ensure_uint64(self.vgetter())

    cdef _make_hash_table(self, n):
        return _hash.UInt64HashTable(n)

    def _call_monotonic(self, values):
        return algos.is_monotonic_uint64(values, timelike=False)

    def get_pad_indexer(self, other, limit=None):
        return algos.pad_uint64(self._get_index_values(), other,
                                limit=limit)

    def get_backfill_indexer(self, other, limit=None):
        return algos.backfill_uint64(self._get_index_values(), other,
                                     limit=limit)

    cdef _check_type(self, object val):
        # Raises TypeError for unhashable keys before any lookup happens.
        hash(val)
        # Booleans and floats are never valid labels for this engine.
        if util.is_bool_object(val):
            raise KeyError(val)
        elif util.is_float_object(val):
            raise KeyError(val)

    cdef _maybe_get_bool_indexer(self, object val):
        """
        Locate ``val`` by a linear scan over the index values.

        Returns the integer position when exactly one element matches,
        or a boolean ndarray marking every match when there are several.

        Raises
        ------
        KeyError
            If ``val`` is not an integer, cannot be represented as an
            unsigned 64-bit integer, or is not present in the index.
        """
        cdef:
            ndarray[uint8_t, cast=True] indexer
            ndarray[uint64_t] values
            int count = 0
            Py_ssize_t i, n
            uint64_t uval
            # Same width as the loop counter so large positions are not
            # truncated; initialised so it is never read undefined.
            Py_ssize_t last_true = 0

        if not util.is_integer_object(val):
            raise KeyError(val)

        # A negative or too-large Python integer cannot be stored in a
        # uint64_t, so it cannot be a label of this index.  Report it as a
        # missing key instead of leaking OverflowError to the caller.
        try:
            uval = val
        except OverflowError:
            raise KeyError(val)

        values = self._get_index_values()
        n = len(values)

        result = np.empty(n, dtype=bool)
        # uint8 view over the same buffer: writes to ``indexer`` fill
        # ``result``.
        indexer = result.view(np.uint8)

        for i in range(n):
            if values[i] == uval:
                count += 1
                indexer[i] = 1
                last_true = i
            else:
                indexer[i] = 0

        if count == 0:
            raise KeyError(val)
        if count == 1:
            return last_true

        return result
487+
427488
cdef class Float64Engine(IndexEngine):
428489

429490
cdef _make_hash_table(self, n):

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+24-6
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
199199
data = np.array(data, copy=copy, dtype=dtype)
200200
elif inferred in ['floating', 'mixed-integer-float']:
201201

202-
# if we are actually all equal to integers
202+
# If we are actually all equal to integers,
203203
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
204+
from .numeric import (Int64Index, UInt64Index,
205+
Float64Index)
205206
try:
206207
res = data.astype('i8')
207208
if (res == data).all():
208209
return Int64Index(res, copy=copy,
209210
name=name)
211+
except (OverflowError, TypeError, ValueError):
212+
pass
213+
214+
# Conversion to int64 failed (possibly due to
215+
# overflow), so let's try now with uint64.
216+
try:
217+
res = data.astype('u8')
218+
if (res == data).all():
219+
return UInt64Index(res, copy=copy,
220+
name=name)
210221
except (TypeError, ValueError):
211222
pass
212223

@@ -235,9 +246,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235246
IncompatibleFrequency)
236247
if isinstance(data, PeriodIndex):
237248
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
249+
if issubclass(data.dtype.type, np.signedinteger):
239250
from .numeric import Int64Index
240251
return Int64Index(data, copy=copy, dtype=dtype, name=name)
252+
elif issubclass(data.dtype.type, np.unsignedinteger):
253+
from .numeric import UInt64Index
254+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
241255
elif issubclass(data.dtype.type, np.floating):
242256
from .numeric import Float64Index
243257
return Float64Index(data, copy=copy, dtype=dtype, name=name)
@@ -254,9 +268,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254268
if dtype is None:
255269
inferred = lib.infer_dtype(subarr)
256270
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
271+
from .numeric import Int64Index, UInt64Index
272+
try:
273+
return Int64Index(subarr.astype('i8'), copy=copy,
274+
name=name)
275+
except OverflowError:
276+
return UInt64Index(subarr.astype('u8'), copy=copy,
277+
name=name)
260278
elif inferred in ['floating', 'mixed-integer-float']:
261279
from .numeric import Float64Index
262280
return Float64Index(subarr, copy=copy, name=name)

pandas/indexes/numeric.py

+85
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,91 @@ def _assert_safe_casting(cls, data, subarr):
177177
Int64Index._add_logical_methods()
178178

179179

180+
class UInt64Index(NumericIndex):
    """
    Immutable ndarray implementing an ordered, sliceable set. The basic object
    storing axis labels for all pandas objects. UInt64Index is a special case
    of `Index` with purely unsigned integer labels.

    Parameters
    ----------
    data : array-like (1-dimensional)
    dtype : NumPy dtype (default: uint64)
    copy : bool
        Make a copy of input ndarray
    name : object
        Name to be stored in the index

    Notes
    -----
    An Index instance can **only** contain hashable objects
    """

    _typ = 'uint64index'

    # uint64 variants of the array-map and join helpers
    _arrmap = _algos.arrmap_uint64
    _left_indexer_unique = _join.left_join_indexer_unique_uint64
    _left_indexer = _join.left_join_indexer_uint64
    _inner_indexer = _join.inner_join_indexer_uint64
    _outer_indexer = _join.outer_join_indexer_uint64

    _can_hold_na = False

    _engine_type = _index.UInt64Engine

    _default_dtype = np.uint64

    @property
    def inferred_type(self):
        """Return the inferred type label, always ``'integer'``."""
        return 'integer'

    @property
    def asi8(self):
        """Return the index values viewed as unsigned 64-bit integers."""
        # do not cache or you'll create a memory leak
        return self.values.view('u8')

    @property
    def is_all_dates(self):
        """
        Checks that all the labels are datetime objects
        """
        return False

    def _convert_scalar_indexer(self, key, kind=None):
        """
        convert a scalar indexer

        Parameters
        ----------
        key : label of the slice bound
        kind : {'ix', 'loc', 'getitem'} or None
        """

        assert kind in ['ix', 'loc', 'getitem', 'iloc', None]

        # don't coerce ilocs to integers
        if kind != 'iloc':
            key = self._maybe_cast_indexer(key)
        return (super(UInt64Index, self)
                ._convert_scalar_indexer(key, kind=kind))

    def _wrap_joined_index(self, joined, other):
        """
        Wrap the values produced by a join in a new UInt64Index.

        The name is kept only when both operands share the same name.
        """
        name = self.name if self.name == other.name else None
        # The joined values come from the uint64 join helpers, so they must
        # stay unsigned: an Int64Index cannot hold labels >= 2**63.
        return UInt64Index(joined, name=name)

    @classmethod
    def _assert_safe_casting(cls, data, subarr):
        """
        Ensure incoming data can be represented as uints.

        Raises
        ------
        TypeError
            If ``data`` is not already of an unsigned integer dtype and
            the converted ``subarr`` does not compare equal to it.
        """
        # Only unsigned input is safe by construction.  Signed integers
        # (which may be negative) and every other dtype must round-trip
        # through the cast unchanged.
        if not issubclass(data.dtype.type, np.unsignedinteger):
            if not np.array_equal(data, subarr):
                raise TypeError('Unsafe NumPy casting, you must '
                                'explicitly cast')
260+
261+
UInt64Index._add_numeric_methods()
262+
UInt64Index._add_logical_methods()
263+
264+
180265
class Float64Index(NumericIndex):
181266
"""
182267
Immutable ndarray implementing an ordered, sliceable set. The basic object

pandas/src/algos_common_helper.pxi.in

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dtypes = [('float64', 'float64_t', 'np.float64', True, True),
2727
('object', 'object', 'object', True, False),
2828
('int32', 'int32_t', 'np.int32', False, True),
2929
('int64', 'int64_t', 'np.int64', False, True),
30+
('uint64', 'uint64_t', 'np.uint64', False, True),
3031
('bool', 'uint8_t', 'np.bool', False, True)]
3132

3233
def get_dispatch(dtypes):

pandas/src/inference.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
810810
floats[i] = <float64_t> val
811811
complexes[i] = <double complex> val
812812
if not seen_null:
813-
seen_uint = seen_uint or (val > npy_int64_max)
813+
seen_uint = seen_uint or (int(val) > npy_int64_max)
814814
seen_sint = seen_sint or (val < 0)
815815

816816
if seen_uint and seen_sint:

pandas/src/join_helper.pxi.in

+3-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ dtypes = [('float64', 'float64_t', 'np.float64'),
1515
('float32', 'float32_t', 'np.float32'),
1616
('object', 'object', 'object'),
1717
('int32', 'int32_t', 'np.int32'),
18-
('int64', 'int64_t', 'np.int64')]
18+
('int64', 'int64_t', 'np.int64'),
19+
('uint64', 'uint64_t', 'np.uint64')]
1920

2021
def get_dispatch(dtypes):
2122

@@ -404,4 +405,4 @@ def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left,
404405

405406
return result, lindexer, rindexer
406407

407-
{{endfor}}
408+
{{endfor}}

pandas/src/joins_func_helper.pxi.in

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1212
{{py:
1313

1414
# table_type, by_dtype
15-
by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')]
15+
by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t'),
16+
('UInt64HashTable', 'uint64_t')]
1617

1718
# on_dtype
1819
on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',

0 commit comments

Comments
 (0)