pandas-dev · gfyoung · Dec 21, 2016 · jreback · Dec 22, 2016 · gfyoung
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -91,6 +91,25 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
    df = pd.read_table(url, compression='bz2')  # explicitly specify compression
    df.head(2)
 
+.. _whatsnew_0200.enhancements.uint64_support:
+
+Pandas has significantly improved support for operations involving unsigned,
+or purely non-negative, integers. Previously, handling these integers would
+result in improper rounding or data-type casting, leading to incorrect results.
+Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`).
+
+.. ipython:: python
+
+   idx = pd.UInt64Index([1, 2, 3])
+   df = pd.DataFrame({'A': ['a', 'b', 'c']}, index=idx)
+   df.index
+
+- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
+- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
+- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
+- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being converted to the wrong data types (:issue:`14983`)
+- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
@@ -298,8 +317,6 @@ Bug Fixes
 
 - Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
 - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
-- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
-- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
 - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
@@ -324,8 +341,6 @@ Bug Fixes
 
 
 
-- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`)
-- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 
 
 
@@ -350,7 +365,6 @@ Bug Fixes
 
 
 - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
-- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
 - Bug in ``.to_json()`` where ``lines=True`` and contents (keys or values) contain escaped characters (:issue:`15096`)
 
@@ -369,4 +383,4 @@ Bug Fixes
 - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`)
 - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
 
-- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
+- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
@@ -53,7 +53,7 @@ class TestPDApi(Base, tm.TestCase):
     classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset',
                'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index',
                'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex',
-               'Period', 'PeriodIndex', 'RangeIndex',
+               'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
                'Series', 'SparseArray', 'SparseDataFrame',
                'SparseSeries', 'TimeGrouper', 'Timedelta',
                'TimedeltaIndex', 'Timestamp']

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -10,7 +10,8 @@
 from pandas.core.groupby import Grouper
 from pandas.formats.format import set_eng_float_format
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
-                               RangeIndex, Float64Index, MultiIndex)
+                               UInt64Index, RangeIndex, Float64Index,
+                               MultiIndex)
 
 from pandas.core.series import Series, TimeSeries
 from pandas.core.frame import DataFrame

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -860,15 +860,20 @@ def _convert_for_reindex(self, key, axis=0):
             return labels[key]
         else:
             if isinstance(key, Index):
-                # want Index objects to pass through untouched
-                keyarr = key
+                keyarr = labels._convert_index_indexer(key)
             else:
                 # asarray can be unsafe, NumPy strings are weird
                 keyarr = _asarray_tuplesafe(key)
 
-            if is_integer_dtype(keyarr) and not labels.is_integer():
-                keyarr = _ensure_platform_int(keyarr)
-                return labels.take(keyarr)
+            if is_integer_dtype(keyarr):
+                # Cast the indexer to uint64 if possible so
+                # that the values returned from indexing are
+                # also uint64.
+                keyarr = labels._convert_arr_indexer(keyarr)
+
+                if not labels.is_integer():
+                    keyarr = _ensure_platform_int(keyarr)
+                    return labels.take(keyarr)
 
             return keyarr
 
@@ -1044,11 +1049,10 @@ def _getitem_iterable(self, key, axis=0):
             return self.obj.take(inds, axis=axis, convert=False)
         else:
             if isinstance(key, Index):
-                # want Index objects to pass through untouched
-                keyarr = key
+                keyarr = labels._convert_index_indexer(key)
             else:
-                # asarray can be unsafe, NumPy strings are weird
                 keyarr = _asarray_tuplesafe(key)
+                keyarr = labels._convert_arr_indexer(keyarr)
 
             if is_categorical_dtype(labels):
                 keyarr = labels._shallow_copy(keyarr)

diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py
@@ -4,7 +4,7 @@
 from pandas.indexes.category import CategoricalIndex  # noqa
 from pandas.indexes.multi import MultiIndex  # noqa
 from pandas.indexes.numeric import (NumericIndex, Float64Index,  # noqa
-                                    Int64Index)
+                                    Int64Index, UInt64Index)
 from pandas.indexes.range import RangeIndex  # noqa
 
 import pandas.core.common as com
@@ -13,7 +13,7 @@
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
-           'CategoricalIndex', 'RangeIndex',
+           'CategoricalIndex', 'RangeIndex', 'UInt64Index',
            'InvalidIndexError',
            '_new_Index',
            '_ensure_index', '_get_na_value', '_get_combined_index',

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -27,6 +27,8 @@
                                  is_object_dtype,
                                  is_categorical_dtype,
                                  is_bool_dtype,
+                                 is_signed_integer_dtype,
+                                 is_unsigned_integer_dtype,
                                  is_integer_dtype, is_float_dtype,
                                  is_datetime64_any_dtype,
                                  is_timedelta64_dtype,
@@ -199,14 +201,25 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                             data = np.array(data, copy=copy, dtype=dtype)
                         elif inferred in ['floating', 'mixed-integer-float']:
 
-                            # if we are actually all equal to integers
+                            # If we are actually all equal to integers,
                             # then coerce to integer
-                            from .numeric import Int64Index, Float64Index
+                            from .numeric import (Int64Index, UInt64Index,
+                                                  Float64Index)
                             try:
-                                res = data.astype('i8')
+                                res = data.astype('i8', copy=False)
                                 if (res == data).all():
                                     return Int64Index(res, copy=copy,
                                                       name=name)
+                            except (OverflowError, TypeError, ValueError):
+                                pass
+
+                            # Conversion to int64 failed (possibly due to
+                            # overflow), so let's try now with uint64.
+                            try:
+                                res = data.astype('u8', copy=False)
+                                if (res == data).all():
+                                    return UInt64Index(res, copy=copy,
+                                                       name=name)
                             except (TypeError, ValueError):
                                 pass
 
@@ -235,10 +248,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                                                IncompatibleFrequency)
             if isinstance(data, PeriodIndex):
                 return PeriodIndex(data, copy=copy, name=name, **kwargs)
-            if issubclass(data.dtype.type, np.integer):
+            if is_signed_integer_dtype(data.dtype):
                 from .numeric import Int64Index
                 return Int64Index(data, copy=copy, dtype=dtype, name=name)
-            elif issubclass(data.dtype.type, np.floating):
+            elif is_unsigned_integer_dtype(data.dtype):
+                from .numeric import UInt64Index
+                return UInt64Index(data, copy=copy, dtype=dtype, name=name)
+            elif is_float_dtype(data.dtype):
                 from .numeric import Float64Index
                 return Float64Index(data, copy=copy, dtype=dtype, name=name)
             elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
@@ -254,9 +270,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
             if dtype is None:
                 inferred = lib.infer_dtype(subarr)
                 if inferred == 'integer':
-                    from .numeric import Int64Index
-                    return Int64Index(subarr.astype('i8'), copy=copy,
-                                      name=name)
+                    from .numeric import Int64Index, UInt64Index
+                    try:
+                        return Int64Index(subarr.astype('i8'), copy=copy,
+                                          name=name)
+                    except OverflowError:
+                        return UInt64Index(subarr.astype('u8'), copy=copy,
+                                           name=name)
                 elif inferred in ['floating', 'mixed-integer-float']:
                     from .numeric import Float64Index
                     return Float64Index(subarr, copy=copy, name=name)
@@ -1253,6 +1273,40 @@ def is_int(v):
 
         return indexer
 
+    _index_shared_docs['_convert_arr_indexer'] = """
+        Convert an array-like indexer to the appropriate dtype.
+
+        Parameters
+        ----------
+        keyarr : array-like
+            Indexer to convert.
+
+        Returns
+        -------
+        converted_keyarr : array-like
+    """
+
+    @Appender(_index_shared_docs['_convert_arr_indexer'])
+    def _convert_arr_indexer(self, keyarr):
+        return keyarr
+
+    _index_shared_docs['_convert_index_indexer'] = """
+        Convert an Index indexer to the appropriate dtype.
+
+        Parameters
+        ----------
+        keyarr : Index (or sub-class)
+            Indexer to convert.
+
+        Returns
+        -------
+        converted_keyarr : Index (or sub-class)
+    """
+
+    @Appender(_index_shared_docs['_convert_index_indexer'])
+    def _convert_index_indexer(self, keyarr):
+        return keyarr
+
     def _convert_list_indexer(self, keyarr, kind=None):
         """
         passed a key that is tuplesafe that is integer based
@@ -3489,7 +3543,7 @@ def _validate_for_numeric_binop(self, other, op, opstr):
                 raise ValueError("cannot evaluate a numeric op with "
                                  "unequal lengths")
             other = _values_from_object(other)
-            if other.dtype.kind not in ['f', 'i']:
+            if other.dtype.kind not in ['f', 'i', 'u']:
                 raise TypeError("cannot evaluate a numeric op "
                                 "with a non-numeric dtype")
         elif isinstance(other, (DateOffset, np.timedelta64,