diff --git a/pandas/core/api.py b/pandas/core/api.py index fde9bc77c4bd9..103fe740cfa36 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, RangeIndex, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/common.py b/pandas/core/common.py index 3d23aeff942dc..ab5f2f221ad66 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -86,6 +86,7 @@ def _check(cls, inst): ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", + "rangeindex", "float64index", "multiindex", "datetimeindex", @@ -2142,9 +2143,8 @@ def is_bool_indexer(key): def _default_index(n): - from pandas.core.index import Int64Index - values = np.arange(n, dtype=np.int64) - result = Int64Index(values,name=None) + from pandas.core.index import RangeIndex + result = RangeIndex(0, int(n), name=None) result.is_unique = True return result @@ -2498,6 +2498,11 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, (np.datetime64, np.timedelta64))) +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 272c401c18761..aa1bcd7dc182a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4826,7 +4826,7 @@ def extract_index(data): % (lengths[0], len(index))) raise ValueError(msg) else: - index = Index(np.arange(lengths[0])) 
+ index = _default_index(lengths[0]) return _ensure_index(index) @@ -5043,11 +5043,11 @@ def convert(arr): def _get_names_from_index(data): - index = lrange(len(data)) has_some_name = any([getattr(s, 'name', None) is not None for s in data]) if not has_some_name: - return index + return _default_index(len(data)) + index = lrange(len(data)) count = 0 for i, s in enumerate(data): n = getattr(s, 'name', None) diff --git a/pandas/core/index.py b/pandas/core/index.py index 8b650fea9b440..4b63322bb516c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,6 +7,7 @@ from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat import numpy as np +from math import ceil, floor from sys import getsizeof import pandas.tslib as tslib @@ -21,7 +22,7 @@ from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) + is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, is_int64_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -111,9 +112,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if fastpath: return cls._simple_new(data, name) + if isinstance(data, int) and isinstance(dtype, int): + if copy == False: + copy = None + range_constructor = True + elif isinstance(copy, int): + range_constructor = True + + if range_constructor: + return RangeIndex(data, dtype, copy, name) + from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, Index, ABCSeries)): - if issubclass(data.dtype.type, np.datetime64): + if (isinstance(data, RangeIndex) and + (dtype is None or is_int64_dtype(dtype))): + # copy passed-in RangeIndex + return data.copy(name=name) + elif 
class RangeIndex(Int64Index):

    """
    Immutable Index implementing a monotonic range. RangeIndex is a
    memory-saving special case of `Int64Index` limited to representing
    monotonic ranges: only ``start``, ``stop`` and ``step`` are stored and
    the int64 values are materialised lazily on first access.

    Parameters
    ----------
    start : int (default: 0)
    stop : int (default: 0)
    step : int (default: 1)
    name : object, optional
        Name to be stored in the index
    fastpath : boolean, default False
        Store ``start``, ``stop`` and ``step`` without any validation

    Notes
    -----
    As with the builtin ``range``, a single positional argument is
    interpreted as ``stop``. Arguments that are not integral are handed
    over to the ``Int64Index`` constructor as ``data``, ``dtype`` and
    ``copy``.
    """

    _typ = 'rangeindex'
    _engine_type = _index.Int64Engine
    _attributes = ['name', 'start', 'stop', 'step']

    def __new__(cls, start=None, stop=None, step=None, name=None,
                fastpath=False, **kwargs):
        if fastpath:
            return cls._simple_new(start, stop, step, name=name)

        # RangeIndex() constructor
        if start is None and stop is None and step is None:
            return cls._simple_new(0, 0, 1, name=name)

        # mimic range(): a lone first argument is the stop value
        if stop is None:
            new_start, new_stop = 0, start
        else:
            new_start, new_stop = start, stop
        new_step = 1 if step is None else step

        try:
            new_start = cls._ensure_int(new_start)
            new_stop = cls._ensure_int(new_stop)
            new_step = cls._ensure_int(new_step)
        except TypeError:
            # pass all invalid inputs to Int64Index to handle; the three
            # positional slots then mean (data, dtype, copy)
            if step is None:
                step = False
            # name Int64Index explicitly: cls.__bases__[0] would be
            # RangeIndex itself for any subclass of RangeIndex
            return super(RangeIndex, cls).__new__(Int64Index,
                                                  data=start,
                                                  dtype=stop,
                                                  copy=step,
                                                  name=name,
                                                  fastpath=fastpath,
                                                  **kwargs)

        if new_step == 0:
            raise ValueError("Step must not be zero")
        return cls._simple_new(new_start, new_stop, new_step, name)

    @classmethod
    def _simple_new(cls, start, stop, step, name=None):
        """ build a RangeIndex from already validated range parameters """
        result = object.__new__(cls)
        result._start = start
        result._stop = stop
        result._step = step
        result.name = name
        result.is_unique = True
        return result

    @classmethod
    def _ensure_int(cls, value):
        """
        Return ``value`` converted to ``int``.

        Raises
        ------
        TypeError
            if ``value`` is not integral (e.g. ``1.5``, ``None``, a string)
            or is a sized container such as a list or an ndarray
        """
        try:
            int_value = int(value)
            # don't allow casting 1-element arrays to int!
            if int_value != value or hasattr(value, '__len__'):
                raise TypeError
        except Exception:
            # any failure to convert or compare means "not an integer"
            raise TypeError("Need to pass integral values")
        return int_value

    @cache_readonly
    def _data(self):
        # materialised lazily, then cached
        return np.arange(self.start, self.stop, self.step, dtype=np.int64)

    @cache_readonly
    def _int64index(self):
        # materialised Int64Index used for operations without a range
        # specific implementation
        return Int64Index(self._data, name=self.name, fastpath=True)

    @property
    def dtype(self):
        return np.dtype(np.int64)

    @property
    def start(self):
        return self._start

    @property
    def stop(self):
        return self._stop

    @property
    def step(self):
        return self._step

    # TODO: make read-only
    @cache_readonly(allow_setting=True)
    def is_unique(self):
        """ return if the index has unique values """
        return True

    @property
    def has_duplicates(self):
        return not self.is_unique

    def tolist(self):
        """ return the elements of the range as a list of python ints """
        return list(range(self.start, self.stop, self.step))

    def _shallow_copy(self, values=None, **kwargs):
        """ create a new Index, don't copy the data, use the same object attributes
            with passed in attributes taking precedence """
        if values is None:
            return RangeIndex(self.start, self.stop, self.step,
                              name=self.name, fastpath=True)

        # arbitrary values cannot be described by a range anymore
        name = kwargs.pop('name', self.name)
        # TODO: check a test exists which checks for name preservation
        return self._int64index._shallow_copy(values, name=name, **kwargs)

    def copy(self, names=None, name=None, dtype=None, deep=False):
        """
        Make a copy of this object. Name and dtype sets those attributes on
        the new object.

        Parameters
        ----------
        name : string, optional
        dtype : numpy dtype or pandas type

        Returns
        -------
        copy : Index
            a RangeIndex unless a non-int64 ``dtype`` is requested, in which
            case the parent class performs the copy

        Notes
        -----
        When the result stays a RangeIndex, ``names`` and ``deep`` are not
        used: only the three range parameters and the name are copied.
        """
        if dtype is not None and not is_int64_dtype(dtype):
            return super(RangeIndex, self).copy(names, name, dtype, deep)

        if name is None:
            name = self.name
        return RangeIndex(self.start, self.stop, self.step, name,
                          fastpath=True)

    # TODO: return arange instead of sorting
    def argsort(self, *args, **kwargs):
        """
        return an ndarray indexer of the underlying data

        See also
        --------
        numpy.ndarray.argsort
        """
        return self._data.argsort(*args, **kwargs)

    def __repr__(self):
        attrs = [('start', default_pprint(self.start)),
                 ('stop', default_pprint(self.stop)),
                 ('step', default_pprint(self.step)),
                 ('name', default_pprint(self.name))]

        prepr = u(", ").join([u("%s=%s") % (k, v)
                              for k, v in attrs])
        res = u("%s(%s)") % (self.__class__.__name__, prepr)

        if not compat.PY3:
            # needs to be str in Python 2
            encoding = get_option('display.encoding')
            res = res.encode(encoding)
        return res

    def __unicode__(self):
        """
        Return a string representation for this object.

        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
        py2/py3.
        """
        # leave out every argument that still has its default value
        if self.start != 0 or self.step != 1:
            start = u('%s, ') % default_pprint(self.start)
        else:
            start = u('')
        stop = default_pprint(self.stop)
        step = u('') if self.step == 1 else u(', %s') % default_pprint(self.step)
        if self.name is None:
            name = u('')
        else:
            name = u(', name=%s') % default_pprint(self.name)

        res = u("%s(%s%s%s%s)") % (self.__class__.__name__,
                                   start, stop, step, name)
        return res

    # TODO: re-arrange case checking & delegation
    def equals(self, other):
        """
        Determines if two Index objects contain the same elements.
        """
        if self.is_(other):
            return True

        elif isinstance(other, RangeIndex):
            # compare the described elements, not the raw parameters:
            # (0, 10, 3) and (0, 12, 3) both describe [0, 3, 6, 9]
            length = len(self)
            if length != len(other):
                return False
            if length == 0:
                return True
            if self.start != other.start:
                return False
            return length == 1 or self.step == other.step

        try:
            return array_equivalent(_values_from_object(self),
                                    _values_from_object(other))
        except TypeError:
            # e.g. fails in numpy 1.6 with DatetimeIndex #1681
            return False

    def __reduce__(self):
        d = self._get_attributes_dict()
        return _new_Index, (self.__class__, d), None

    # TODO: decide whether ``view`` needs an override; it is unclear what a
    # "view" of a non-materialised array should mean.

    def _ascending_bounds(self):
        """
        Return (smallest element, largest element, abs(step)).

        Only meaningful for a non-empty index.
        """
        last = self.start + (len(self) - 1) * self.step
        if self.step > 0:
            return self.start, last, self.step
        return last, self.start, -self.step

    def intersection(self, other):
        """
        Form the intersection of two Index objects. Sortedness of the result is
        not guaranteed

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : Index
            for two RangeIndex objects the result is an increasing
            RangeIndex without a name
        """
        if not isinstance(other, RangeIndex):
            return super(RangeIndex, self).intersection(other)

        if len(self) == 0 or len(other) == 0:
            return RangeIndex()

        # work on increasing ranges with positive steps only
        first_s, last_s, step_s = self._ascending_bounds()
        first_o, last_o, step_o = other._ascending_bounds()

        # check whether intervals intersect
        low = max(first_s, first_o)
        high = min(last_s, last_o)
        if high < low:
            return RangeIndex()

        ### Method hint: linear Diophantine equation
        # step_s * s + step_o * t == gcd
        gcd, s, t = self._extended_gcd(step_s, step_o)

        # check whether element sets intersect
        if (first_s - first_o) % gcd:
            return RangeIndex()

        # one value that lies on both grids (possibly outside [low, high]);
        # all common values are spaced by the least common multiple
        common = first_s + (first_o - first_s) // gcd * s * step_s
        new_step = step_s * step_o // gcd

        # smallest common value that is not below the lower bound
        new_start = low + (common - low) % new_step
        if new_start > high:
            return RangeIndex()
        return RangeIndex(new_start, high + 1, new_step, fastpath=True)

    def _min_fitting_element(self, lower_limit):
        """Returns the smallest element greater than or equal to the limit"""
        delta = lower_limit - self.start
        # integer (floor/ceil) division avoids float rounding for large ints
        if self.step > 0:
            no_steps = -(-delta // self.step)
        else:
            no_steps = delta // self.step
        return self.start + self.step * int(no_steps)

    def _max_fitting_element(self, upper_limit):
        """Returns the largest element smaller than or equal to the limit"""
        delta = upper_limit - self.start
        if self.step > 0:
            no_steps = delta // self.step
        else:
            no_steps = -(-delta // self.step)
        return self.start + self.step * int(no_steps)

    def _extended_gcd(self, a, b):
        """
        Extended Euclidean algorithms to solve Bezout's identity:
           a*s + b*t = gcd(a, b)
        Finds one particular solution for s, t
        Returns: gcd, s, t
        """
        s, old_s = 0, 1
        t, old_t = 1, 0
        r, old_r = b, a
        while r:
            quotient = old_r // r
            old_r, r = r, old_r - quotient * r
            old_s, s = s, old_s - quotient * s
            old_t, t = t, old_t - quotient * t
        return old_r, old_s, old_t

    def union(self, other):
        """
        Form the union of two Index objects and sorts if possible

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        union : Index
        """
        # note: could return a RangeIndex in some circumstances
        return self._int64index.union(other)

    def join(self, other, how='left', level=None, return_indexers=False):
        """
        *this is an internal non-public method*

        Compute join_index and indexers to conform data
        structures to the new index.

        Parameters
        ----------
        other : Index
        how : {'left', 'right', 'inner', 'outer'}
        level : int or level name, default None
        return_indexers : boolean, default False

        Returns
        -------
        join_index, (left_indexer, right_indexer)
        """
        if how == 'outer' and self is not other:
            # note: could return RangeIndex in more circumstances
            return self._int64index.join(other, how, level, return_indexers)

        return super(RangeIndex, self).join(other, how, level, return_indexers)

    def _mul(self, other):
        "__mul__() implementation"
        try:
            # rejects non-integral values and sized containers, so a
            # 1-element array is no longer mistaken for a scalar
            factor = self._ensure_int(other)
        except TypeError:
            factor = 0

        if factor != 0:
            # scaling all three parameters scales every element
            # NOTE(review): the name is not carried over here -- confirm
            # this matches the generic multiplication path
            return RangeIndex(self.start * factor, self.stop * factor,
                              self.step * factor, fastpath=True)
        return super(RangeIndex, self).__mul__(other)

    def __len__(self):
        """
        return the length of the RangeIndex
        """
        # ceiling division; floor division would drop the last element when
        # the span is not a multiple of the step, e.g. (0, 10, 3)
        count = -((self.start - self.stop) // self.step)
        # ranges pointing the "wrong way", e.g. (5, 0, 1), are empty
        return max(0, count)

    @property
    def size(self):
        return len(self)

    # TODO: implement __getitem__ so that scalar and slice keys conserve the
    # RangeIndex type instead of materialising the values.

RangeIndex._add_numeric_methods()
RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul
RangeIndex._add_logical_methods()
sparsify=None, adjoin=True, names=False, diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 75ca44fd1ef3e..33946a29a9dee 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,7 +49,7 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, DatetimeIndex, Float64Index, NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -257,7 +257,14 @@ def encode(obj): tobj = type(obj) if isinstance(obj, Index): - if isinstance(obj, PeriodIndex): + if isinstance(obj, RangeIndex): + return {'typ': 'range_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'start': getattr(obj, 'start', None), + 'stop': getattr(obj, 'stop', None), + 'step': getattr(obj, 'step', None)} + elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), @@ -447,6 +454,8 @@ def decode(obj): data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) + elif typ == 'range_index': + return globals()[obj['klass']](obj['start'], obj['stop'], obj['step'], name=obj['name']) elif typ == 'multi_index': data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 03e7a8eae549d..236254856c09e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Categorical, bdate_range, +from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) 
from pandas.io.pytables import _tables @@ -1541,14 +1541,17 @@ def test_column_multiindex(self): index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) df = DataFrame(np.arange(12).reshape(3,4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: store.put('df',df) - tm.assert_frame_equal(store['df'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df'],expected,check_index_type=True,check_column_type=True) store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) @@ -1562,11 +1565,14 @@ def test_column_multiindex(self): # non_index_axes name df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) - + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + with ensure_clean_store(self.path) as store: store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) def test_store_multiindex(self):