Skip to content

Commit 488dfd7

Browse files
committed
ENH: Create and propagate UInt64Index
1 parent 1f82f18 commit 488dfd7

20 files changed

+854
-388
lines changed

doc/source/whatsnew/v0.20.0.txt

+19-5
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,25 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
9191
df = pd.read_table(url, compression='bz2') # explicitly specify compression
9292
df.head(2)
9393

94+
.. _whatsnew_0200.enhancements.uint64_support:
95+
96+
Pandas has significantly improved support for operations involving unsigned,
97+
or purely non-negative, integers. Previously, handling these integers would
98+
result in improper rounding or data-type casting, leading to incorrect results.
99+
Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`).
100+
101+
.. ipython:: python
102+
103+
idx = pd.UInt64Index([1, 2, 3])
104+
df = pd.DataFrame(['a', 'b', 'c'], index=idx)
105+
df.index
106+
107+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
108+
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
109+
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
110+
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
111+
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
112+
94113
.. _whatsnew_0200.enhancements.other:
95114

96115
Other enhancements
@@ -298,8 +317,6 @@ Bug Fixes
298317

299318
- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
300319
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
301-
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
302-
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
303320
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. ``astype()`` now raises an error for Series and DataFrames (:issue:`14265`)
304321
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
305322
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -324,8 +341,6 @@ Bug Fixes
324341

325342

326343

327-
- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
328-
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
329344

330345

331346

@@ -350,7 +365,6 @@ Bug Fixes
350365

351366

352367
- Require at least version 0.23 of Cython to avoid problems with character encodings (:issue:`14699`)
353-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
354368
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
355369
- Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
356370

pandas/api/tests/test_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
5353
classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
5454
'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
5555
'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
56-
'Period', 'PeriodIndex', 'RangeIndex',
56+
'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
5757
'Series', 'SparseArray', 'SparseDataFrame',
5858
'SparseSeries', 'TimeGrouper', 'Timedelta',
5959
'TimedeltaIndex', 'Timestamp']

pandas/core/api.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
from pandas.core.groupby import Grouper
1111
from pandas.formats.format import set_eng_float_format
1212
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
13-
RangeIndex, Float64Index, MultiIndex)
13+
UInt64Index, RangeIndex, Float64Index,
14+
MultiIndex)
1415

1516
from pandas.core.series import Series, TimeSeries
1617
from pandas.core.frame import DataFrame

pandas/core/indexing.py

+12-8
Original file line numberDiff line numberDiff line change
@@ -860,15 +860,20 @@ def _convert_for_reindex(self, key, axis=0):
860860
return labels[key]
861861
else:
862862
if isinstance(key, Index):
863-
# want Index objects to pass through untouched
864-
keyarr = key
863+
keyarr = labels._convert_index_indexer(key)
865864
else:
866865
# asarray can be unsafe, NumPy strings are weird
867866
keyarr = _asarray_tuplesafe(key)
868867

869-
if is_integer_dtype(keyarr) and not labels.is_integer():
870-
keyarr = _ensure_platform_int(keyarr)
871-
return labels.take(keyarr)
868+
if is_integer_dtype(keyarr):
869+
# Cast the indexer to uint64 if possible so
870+
# that the values returned from indexing are
871+
# also uint64.
872+
keyarr = labels._convert_arr_indexer(keyarr)
873+
874+
if not labels.is_integer():
875+
keyarr = _ensure_platform_int(keyarr)
876+
return labels.take(keyarr)
872877

873878
return keyarr
874879

@@ -1044,11 +1049,10 @@ def _getitem_iterable(self, key, axis=0):
10441049
return self.obj.take(inds, axis=axis, convert=False)
10451050
else:
10461051
if isinstance(key, Index):
1047-
# want Index objects to pass through untouched
1048-
keyarr = key
1052+
keyarr = labels._convert_index_indexer(key)
10491053
else:
1050-
# asarray can be unsafe, NumPy strings are weird
10511054
keyarr = _asarray_tuplesafe(key)
1055+
keyarr = labels._convert_arr_indexer(keyarr)
10521056

10531057
if is_categorical_dtype(labels):
10541058
keyarr = labels._shallow_copy(keyarr)

pandas/indexes/api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from pandas.indexes.category import CategoricalIndex # noqa
55
from pandas.indexes.multi import MultiIndex # noqa
66
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
7-
Int64Index)
7+
Int64Index, UInt64Index)
88
from pandas.indexes.range import RangeIndex # noqa
99

1010
import pandas.core.common as com
@@ -13,7 +13,7 @@
1313
# TODO: there are many places that rely on these private methods existing in
1414
# pandas.core.index
1515
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
16-
'CategoricalIndex', 'RangeIndex',
16+
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
1717
'InvalidIndexError',
1818
'_new_Index',
1919
'_ensure_index', '_get_na_value', '_get_combined_index',

pandas/indexes/base.py

+63-9
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
is_object_dtype,
2828
is_categorical_dtype,
2929
is_bool_dtype,
30+
is_signed_integer_dtype,
31+
is_unsigned_integer_dtype,
3032
is_integer_dtype, is_float_dtype,
3133
is_datetime64_any_dtype,
3234
is_timedelta64_dtype,
@@ -199,14 +201,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
199201
data = np.array(data, copy=copy, dtype=dtype)
200202
elif inferred in ['floating', 'mixed-integer-float']:
201203

202-
# if we are actually all equal to integers
204+
# If we are actually all equal to integers,
203205
# then coerce to integer
204-
from .numeric import Int64Index, Float64Index
206+
from .numeric import (Int64Index, UInt64Index,
207+
Float64Index)
205208
try:
206-
res = data.astype('i8')
209+
res = data.astype('i8', copy=False)
207210
if (res == data).all():
208211
return Int64Index(res, copy=copy,
209212
name=name)
213+
except (OverflowError, TypeError, ValueError):
214+
pass
215+
216+
# Conversion to int64 failed (possibly due to
217+
# overflow), so let's try now with uint64.
218+
try:
219+
res = data.astype('u8', copy=False)
220+
if (res == data).all():
221+
return UInt64Index(res, copy=copy,
222+
name=name)
210223
except (TypeError, ValueError):
211224
pass
212225

@@ -235,10 +248,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
235248
IncompatibleFrequency)
236249
if isinstance(data, PeriodIndex):
237250
return PeriodIndex(data, copy=copy, name=name, **kwargs)
238-
if issubclass(data.dtype.type, np.integer):
251+
if is_signed_integer_dtype(data.dtype):
239252
from .numeric import Int64Index
240253
return Int64Index(data, copy=copy, dtype=dtype, name=name)
241-
elif issubclass(data.dtype.type, np.floating):
254+
elif is_unsigned_integer_dtype(data.dtype):
255+
from .numeric import UInt64Index
256+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
257+
elif is_float_dtype(data.dtype):
242258
from .numeric import Float64Index
243259
return Float64Index(data, copy=copy, dtype=dtype, name=name)
244260
elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
@@ -254,9 +270,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
254270
if dtype is None:
255271
inferred = lib.infer_dtype(subarr)
256272
if inferred == 'integer':
257-
from .numeric import Int64Index
258-
return Int64Index(subarr.astype('i8'), copy=copy,
259-
name=name)
273+
from .numeric import Int64Index, UInt64Index
274+
try:
275+
return Int64Index(subarr.astype('i8'), copy=copy,
276+
name=name)
277+
except OverflowError:
278+
return UInt64Index(subarr.astype('u8'), copy=copy,
279+
name=name)
260280
elif inferred in ['floating', 'mixed-integer-float']:
261281
from .numeric import Float64Index
262282
return Float64Index(subarr, copy=copy, name=name)
@@ -1253,6 +1273,40 @@ def is_int(v):
12531273

12541274
return indexer
12551275

1276+
_index_shared_docs['_convert_arr_indexer'] = """
1277+
Convert an array-like indexer to the appropriate dtype.
1278+
1279+
Parameters
1280+
----------
1281+
keyarr : array-like
1282+
Indexer to convert.
1283+
1284+
Returns
1285+
-------
1286+
converted_keyarr : array-like
1287+
"""
1288+
1289+
@Appender(_index_shared_docs['_convert_arr_indexer'])
1290+
def _convert_arr_indexer(self, keyarr):
1291+
return keyarr
1292+
1293+
_index_shared_docs['_convert_index_indexer'] = """
1294+
Convert an Index indexer to the appropriate dtype.
1295+
1296+
Parameters
1297+
----------
1298+
keyarr : Index (or sub-class)
1299+
Indexer to convert.
1300+
1301+
Returns
1302+
-------
1303+
converted_keyarr : Index (or sub-class)
1304+
"""
1305+
1306+
@Appender(_index_shared_docs['_convert_index_indexer'])
1307+
def _convert_index_indexer(self, keyarr):
1308+
return keyarr
1309+
12561310
def _convert_list_indexer(self, keyarr, kind=None):
12571311
"""
12581312
passed a key that is tuplesafe that is integer based
@@ -3489,7 +3543,7 @@ def _validate_for_numeric_binop(self, other, op, opstr):
34893543
raise ValueError("cannot evaluate a numeric op with "
34903544
"unequal lengths")
34913545
other = _values_from_object(other)
3492-
if other.dtype.kind not in ['f', 'i']:
3546+
if other.dtype.kind not in ['f', 'i', 'u']:
34933547
raise TypeError("cannot evaluate a numeric op "
34943548
"with a non-numeric dtype")
34953549
elif isinstance(other, (DateOffset, np.timedelta64,

0 commit comments

Comments
 (0)