From 5162e0ffb14d74ff3510ee2cc6447bb737a5363a Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 12:25:02 +0200 Subject: [PATCH 01/13] Make RangeIndex default index WIP on making RangeIndex (#9977) the default index --- pandas/core/api.py | 2 +- pandas/core/index.py | 421 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 420 insertions(+), 3 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index fde9bc77c4bd9..103fe740cfa36 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, RangeIndex, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/index.py b/pandas/core/index.py index 8b650fea9b440..9bdf6dcbb0052 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,6 +7,7 @@ from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat import numpy as np +from math import ceil, floor from sys import getsizeof import pandas.tslib as tslib @@ -21,7 +22,7 @@ from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) + is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, is_int64_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -111,9 +112,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if fastpath: return cls._simple_new(data, name) + if isinstance(data, int) and isinstance(dtype, int): + if copy == False: + copy = None + range_constructor = True + elif isinstance(copy, int): + range_constructor = True + + if range_constructor: + return RangeIndex(data, dtype, copy, name) + from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, Index, ABCSeries)): - if issubclass(data.dtype.type, np.datetime64): + if (isinstance(data, RangeIndex) and + (dtype is None or is_int64_dtype(dtype))): + # copy passed-in RangeIndex + return data.copy(name=name) + elif issubclass(data.dtype.type, np.datetime64): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: @@ -3299,6 +3314,408 @@ def _wrap_joined_index(self, joined, other): Int64Index._add_logical_methods() +class RangeIndex(Int64Index): + + """ + Immutable Index implementing an monotonic range. RangeIndex is a + memory-saving special case of `Int64Index` limited to representing + monotonic ranges. + + Parameters + ---------- + start : int (default: 0) + stop : int (default: 0) + step : int (default: 1) + name : object, optional + Name to be stored in the index + """ + + _typ = 'rangeindex' + _engine_type = _index.Int64Engine + _attributes = ['name', 'start', 'stop', 'step'] + + def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): + if fastpath: + return cls._simple_new(start, stop, step, name=name) + + # RangeIndex() constructor + if start is None and stop is None and step is None: + return cls._simple_new(0, 0, 1, name=name) + + # sort the arguments depending on which are provided + if step is None: + step = 1 + if stop is None: + stop = start + start = 0 + + # check validity of inputs + start = cls._ensure_int(start) + stop = cls._ensure_int(stop) + step = cls._ensure_int(step) + if step == 0: + raise ValueError("Step must not be zero") + + return cls._simple_new(start, stop, step, name) + + @classmethod + def _simple_new(cls, start, stop, step, name=None): + result = object.__new__(cls) + result._start = start + result._stop = stop + result._step = step + result.name = name + result.is_unique = True + return result + + @classmethod + def _ensure_int(cls, value): + try: + int_value = int(value) + if int_value != value: + raise Exception + except Exception: + raise TypeError("Need to pass integral values") + return int_value + + @cache_readonly + def _data(self): + return np.arange(self.start, self.stop, self.step, dtype=np.int64) + + @cache_readonly + def _int64index(self): + return Int64Index(self._data, name=self.name, fastpath=True) + + @property + def dtype(self): + return np.dtype(np.int64) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def step(self): + return self._step + + #@cache_readonly(allow_setting=True) + #def is_unique(self): + # """ return if the index has unique values """ + # return True + + #@property + #def has_duplicates(self): + # return not self.is_unique + + def tolist(self): + return list(range(self.start, self.stop, self.step)) + + def _shallow_copy(self, values=None, **kwargs): + """ create a new Index, don't copy the data, use the same object attributes + with passed in attributes taking precedence """ + if values is None: + return RangeIndex(self.start, self.stop, self.step, + name=self.name, fastpath=True) + else: + name = kwargs.get('name', self.name) + return self._int64index._shallow_copy(values, **kwargs) + + def copy(self, names=None, name=None, dtype=None, deep=False): + """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. + + Parameters + ---------- + name : string, optional + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + """ + if dtype is not None and not is_int64_dtype(dtype): + return super(RangeIndex, self).copy(names, name, dtype, deep) + + if name is None: + name = self.name + return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) + + #def argsort(self, *args, **kwargs): + # """ + # return an ndarray indexer of the underlying data + + # See also + # -------- + # numpy.ndarray.argsort + # """ + # return self._data.argsort(*args, **kwargs) + + def __repr__(self): + attrs = [('start', default_pprint(self.start)), + ('stop', default_pprint(self.stop)), + ('step', default_pprint(self.step)), + ('name', default_pprint(self.name))] + + prepr = u(", ").join([u("%s=%s") % (k, v) + for k, v in attrs]) + res = u("%s(%s)") % (self.__class__.__name__, prepr) + + if not compat.PY3: + # needs to be str in Python 2 + encoding = get_option('display.encoding') + res = res.encode(encoding) + return res + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + if self.start != 0 or self.step != 1: + start = u('%s, ') % default_pprint(self.start) + else: + start = u('') + stop = default_pprint(self.stop) + step = u('') if self.step == 1 else u(', %s') % default_pprint(self.step) + if self.name is None: + name = u('') + else: + name = u(', name=%s') % default_pprint(self.name) + + res = u("%s(%s%s%s%s)") % (self.__class__.__name__, + start, stop, step, name) + return res + + #def equals(self, other): + # """ + # Determines if two Index objects contain the same elements. + # """ + # if self.is_(other): + # return True + + # elif isinstance(other, RangeIndex): + # return (self.start == other.start and + # self.stop == other.stop and + # self.step == other.step) + + # try: + # return array_equivalent(_values_from_object(self), + # _values_from_object(other)) + # except TypeError: + # # e.g. fails in numpy 1.6 with DatetimeIndex #1681 + # return False + + #def __reduce__(self): + # d = self._get_attributes_dict() + # return _new_Index, (self.__class__, d), None + + #def view(self, cls=None): + # if cls is None or is_int64_dtype(cls): + # return self + # else: + # result = self._shallow_copy() + # if isinstance(result, Index): + # result._id = self._id + # return result + + #def intersection(self, other): + # """ + # Form the intersection of two Index objects. Sortedness of the result is + # not guaranteed + + # Parameters + # ---------- + # other : Index or array-like + + # Returns + # ------- + # intersection : Index + # """ + # if not isinstance(other, RangeIndex): + # return super(RangeIndex, self).intersection(other) + + # # check whether intervals intersect + # # deals with in- and decreasing ranges + # int_low = max(min(self.start, self.stop+1), + # min(other.start, other.stop+1)) + # int_high = min(max(self.stop, self.start+1), + # max(other.stop, other.start+1)) + # if int_high <= int_low: + # return RangeIndex() + + # ### Method hint: linear Diophantine equation + # # solve intersection + # # perf: for identical step sizes, could use cheaper alternative + # gcd, s, t = self._extended_gcd(self.step, other.step) + + # # check whether element sets intersect + # if (self.start - other.start) % gcd: + # return RangeIndex() + + # # calculate parameters for the RangeIndex describing the intersection + # # disregarding the lower bounds + # tmp_start = self.start + (other.start-self.start)*self.step//gcd*s + # new_step = self.step * other.step // gcd + # new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + + # # adjust index to limiting interval + # new_index._start = new_index._min_fitting_element(int_low) + # return new_index + + #def _min_fitting_element(self, lower_limit): + # """Returns the value of the smallest element greater than the limit""" + # round = ceil if self.step > 0 else floor + # no_steps = round( (float(lower_limit)-self.start) / self.step ) + # return self.start + self.step * no_steps + + #def _max_fitting_element(self, upper_limit): + # """Returns the value of the largest element smaller than the limit""" + # round = floor if self.step > 0 else ceil + # no_steps = round( (float(upper_limit)-self.start) / self.step ) + # return self.start + self.step * no_steps + + #def _extended_gcd(self, a, b): + # """ + # Extended Euclidean algorithms to solve Bezout's identity: + # a*x + b*y = gcd(x, y) + # Finds one particular solution for x, y: s, t + # Returns: gcd, s, t + # """ + # s, old_s = 0, 1 + # t, old_t = 1, 0 + # r, old_r = b, a + # while r: + # quotient = old_r // r + # old_r, r = r, old_r - quotient * r + # old_s, s = s, old_s - quotient * s + # old_t, t = t, old_t - quotient * t + # return old_r, old_s, old_t + + #def union(self, other): + # """ + # Form the union of two Index objects and sorts if possible + + # Parameters + # ---------- + # other : Index or array-like + + # Returns + # ------- + # union : Index + # """ + # # note: could return a RangeIndex in some circumstances + # return self._int64index.union(other) + + #def join(self, other, how='left', level=None, return_indexers=False): + # """ + # *this is an internal non-public method* + + # Compute join_index and indexers to conform data + # structures to the new index. + + # Parameters + # ---------- + # other : Index + # how : {'left', 'right', 'inner', 'outer'} + # level : int or level name, default None + # return_indexers : boolean, default False + + # Returns + # ------- + # join_index, (left_indexer, right_indexer) + # """ + # if how == 'outer' and self is not other: + # # note: could return RangeIndex in more circumstances + # return self._int64index.join(other, how, level, return_indexers) + + # return super(RangeIndex, self).join(other, how, level, return_indexers) + + #def _mul(self, other): + # "__mul__() implementation" + # try: + # int_input = other == int(other) + # if int_input: + # other = int(other) + # except Exception: + # int_input = False + + # if int_input == True and other != 0: + # return RangeIndex(self.start*other, self.stop*other, self.step*other, + # fastpath=True) + # else: + # return super(RangeIndex, self).__mul__(other) + + #def __len__(self): + # """ + # return the length of the RangeIndex + # """ + # return (self.stop-self.start) // self.step + + #@property + #def size(self): + # return len(self) + + #def __getitem__(self, key): + # """ + # Conserve RangeIndex type for scalar and slice keys. + # """ + # super_getitem = super(RangeIndex, self).__getitem__ + + # if np.isscalar(key): + # n = int(key) + # if n != key: + # return super_getitem(key) + # if n < 0: + # n = len(self) + key + # if n < 0 or n > len(self)-1: + # raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) + # return self.start + n * self.step + + # if isinstance(key, slice): + # # complete missing slice information + # n_start = 0 if key.start is None else key.start + # n_stop = len(self)+1 if key.stop is None else key.stop + # n_step = 1 if key.step is None else key.step + + # # delegate non-integer slices + # if (n_start != int(n_start) and + # n_stop != int(n_stop) and + # n_step != int(n_step)): + # return super_getitem(key) + + # # deal with index wrap-around + # n_start = len(self)+n_start if n_start < 0 else n_start + # n_stop = len(self)+n_stop if n_stop < 0 else n_stop + + + # # convert indexes to values + # start = self.start + self.step * n_start + # stop = self.start + self.step * n_stop + 1 + # step = self.step * n_step + + # stop = min(stop, self.stop) + # return RangeIndex(start, stop, step, self.name, fastpath=True) + + # # fall back to Int64Index + # return super_getitem(key) + +RangeIndex._add_numeric_methods() +#RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul +RangeIndex._add_logical_methods() + + class Float64Index(NumericIndex): """ From 685618be568ba60d3eaa79b368a8796dd01c3c05 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 12:28:27 +0200 Subject: [PATCH 02/13] set RangeIndex as default index --- pandas/core/common.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 3d23aeff942dc..e84aaecfb410c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2142,9 +2142,8 @@ def is_bool_indexer(key): def _default_index(n): - from pandas.core.index import Int64Index - values = np.arange(n, dtype=np.int64) - result = Int64Index(values,name=None) + from pandas.core.index import RangeIndex + result = RangeIndex(0, int(n), name=None) result.is_unique = True return result @@ -2498,6 +2497,11 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, (np.datetime64, np.timedelta64))) +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or From aff3e5b131af98e5adc1eee473ae616c457c7b6f Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 12:29:29 +0200 Subject: [PATCH 03/13] fix: pandas.io.packers: encode() and decode() for RangeIndex --- pandas/io/packers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 75ca44fd1ef3e..33946a29a9dee 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,7 +49,7 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, DatetimeIndex, Float64Index, NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -257,7 +257,14 @@ def encode(obj): tobj = type(obj) if isinstance(obj, Index): - if isinstance(obj, PeriodIndex): + if isinstance(obj, RangeIndex): + return {'typ': 'range_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'start': getattr(obj, 'start', None), + 'stop': getattr(obj, 'stop', None), + 'step': getattr(obj, 'step', None)} + elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), @@ -447,6 +454,8 @@ def decode(obj): data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) + elif typ == 'range_index': + return globals()[obj['klass']](obj['start'], obj['stop'], obj['step'], name=obj['name']) elif typ == 'multi_index': data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) From 8a1d446951059e135d2fd1a923d6ac7b75fe01bf Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 14:40:37 +0200 Subject: [PATCH 04/13] argument pass-through & fix: reindex --- pandas/core/index.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 9bdf6dcbb0052..2bc7d7d208df6 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1750,7 +1750,9 @@ def reindex(self, target, method=None, level=None, limit=None): # GH7774: preserve dtype/tz if target is empty and not an Index. target = _ensure_has_len(target) # target may be an iterator - if not isinstance(target, Index) and len(target) == 0: + if isinstance(self, RangeIndex) and len(target) == 0: + target = self._simple_new(0, 0, 1, name=self.name) + elif not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop('freq', None) # don't preserve freq target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) @@ -3334,7 +3336,7 @@ class RangeIndex(Int64Index): _engine_type = _index.Int64Engine _attributes = ['name', 'start', 'stop', 'step'] - def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): + def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False, **kwargs): if fastpath: return cls._simple_new(start, stop, step, name=name) @@ -3349,14 +3351,25 @@ def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): stop = start start = 0 - # check validity of inputs - start = cls._ensure_int(start) - stop = cls._ensure_int(stop) - step = cls._ensure_int(step) - if step == 0: - raise ValueError("Step must not be zero") - - return cls._simple_new(start, stop, step, name) + try: + # check validity of inputs + start = cls._ensure_int(start) + stop = cls._ensure_int(stop) + step = cls._ensure_int(step) + if step == 0: + raise ValueError("Step must not be zero") + #assert len(kwargs) == 0 + return cls._simple_new(start, stop, step, name) + except TypeError: + # pass all invalid inputs to Int64Index to handle + if step is None: + step = False + return super(RangeIndex, cls).__new__(data=start, + dtype=stop, + copy=step, + name=name, + fastpath=fastpath, + **kwargs) @classmethod def _simple_new(cls, start, stop, step, name=None): @@ -3372,7 +3385,8 @@ def _simple_new(cls, start, stop, step, name=None): def _ensure_int(cls, value): try: int_value = int(value) - if int_value != value: + # don't allow casting 1-element arrays to int! + if int_value != value or hasattr(value, '__len__'): raise Exception except Exception: raise TypeError("Need to pass integral values") From d6a1389a385780e9d49d6780f656b96df895e8ef Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 15:23:21 +0200 Subject: [PATCH 05/13] fix: temporary fix for argument pass-through --- pandas/core/index.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 2bc7d7d208df6..5c5004acfc819 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3344,27 +3344,32 @@ def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False, ** if start is None and stop is None and step is None: return cls._simple_new(0, 0, 1, name=name) + new_start, new_stop, new_step = None, None, None # sort the arguments depending on which are provided if step is None: - step = 1 + new_step = 1 if stop is None: - stop = start - start = 0 + new_stop = start + new_start = 0 try: # check validity of inputs - start = cls._ensure_int(start) - stop = cls._ensure_int(stop) - step = cls._ensure_int(step) - if step == 0: + new_start = start if new_start is None else new_start + new_stop = stop if new_stop is None else new_stop + new_step = step if new_step is None else new_step + new_start = cls._ensure_int(new_start) + new_stop = cls._ensure_int(new_stop) + new_step = cls._ensure_int(new_step) + if new_step == 0: raise ValueError("Step must not be zero") #assert len(kwargs) == 0 - return cls._simple_new(start, stop, step, name) + return cls._simple_new(new_start, new_stop, new_step, name) except TypeError: # pass all invalid inputs to Int64Index to handle if step is None: step = False - return super(RangeIndex, cls).__new__(data=start, + return super(RangeIndex, cls).__new__(cls.__bases__[0], + data=start, dtype=stop, copy=step, name=name, From b2d11bcf52b560d889db0c7113566a598164acad Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 16:44:28 +0200 Subject: [PATCH 06/13] use _default_index() in pandas.core.frame.extract_index() --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 272c401c18761..148bd8be620b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4826,7 +4826,7 @@ def extract_index(data): % (lengths[0], len(index))) raise ValueError(msg) else: - index = Index(np.arange(lengths[0])) + index = _default_index(lengths[0]) return _ensure_index(index) From b3332cb4f37182e6464744bd8ad08824c5276914 Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 16:47:00 +0200 Subject: [PATCH 07/13] fix: pandas.core.index.Index._is(), restore RangeIndex.union() --- pandas/core/index.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 5c5004acfc819..97f825e03b1d4 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -245,7 +245,7 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr(other, '_id', Ellipsis) + return self._id is getattr(other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): """Initializes or resets ``_id`` attribute with new object""" @@ -3622,20 +3622,20 @@ def __unicode__(self): # old_t, t = t, old_t - quotient * t # return old_r, old_s, old_t - #def union(self, other): - # """ - # Form the union of two Index objects and sorts if possible + def union(self, other): + """ + Form the union of two Index objects and sorts if possible - # Parameters - # ---------- - # other : Index or array-like + Parameters + ---------- + other : Index or array-like - # Returns - # ------- - # union : Index - # """ - # # note: could return a RangeIndex in some circumstances - # return self._int64index.union(other) + Returns + ------- + union : Index + """ + # note: could return a RangeIndex in some circumstances + return self._int64index.union(other) #def join(self, other, how='left', level=None, return_indexers=False): # """ From 703fc4b346877612e0b0b059b0e290fd80773ed3 Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 30 Apr 2015 12:46:33 +0200 Subject: [PATCH 08/13] fix: add RangeIndex to ABCIndexClass, use _default_index() in _get_names_from_index() --- pandas/core/common.py | 1 + pandas/core/frame.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index e84aaecfb410c..ab5f2f221ad66 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -86,6 +86,7 @@ def _check(cls, inst): ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", + "rangeindex", "float64index", "multiindex", "datetimeindex", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 148bd8be620b8..aa1bcd7dc182a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5043,11 +5043,11 @@ def convert(arr): def _get_names_from_index(data): - index = lrange(len(data)) has_some_name = any([getattr(s, 'name', None) is not None for s in data]) if not has_some_name: - return index + return _default_index(len(data)) + index = lrange(len(data)) count = 0 for i, s in enumerate(data): n = getattr(s, 'name', None) From 7dbb964e75043564768cacc69dee40dcc7a41a99 Mon Sep 17 00:00:00 2001 From: ARF Date: Thu, 30 Apr 2015 18:25:40 +0200 Subject: [PATCH 09/13] fix: pytables tests, MultiIndex.get_level_values() --- pandas/core/index.py | 2 +- pandas/io/tests/test_pytables.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 97f825e03b1d4..7451f815c9018 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -4534,7 +4534,7 @@ def get_level_values(self, level): unique = self.levels[num] # .values labels = self.labels[num] filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) - values = unique._simple_new(filled, self.names[num], + values = type(unique)(filled, self.names[num], freq=getattr(unique, 'freq', None), tz=getattr(unique, 'tz', None)) return values diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 03e7a8eae549d..236254856c09e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Categorical, bdate_range, +from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) from pandas.io.pytables import _tables @@ -1541,14 +1541,17 @@ def test_column_multiindex(self): index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) df = DataFrame(np.arange(12).reshape(3,4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: store.put('df',df) - tm.assert_frame_equal(store['df'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df'],expected,check_index_type=True,check_column_type=True) store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) @@ -1562,11 +1565,14 @@ def test_column_multiindex(self): # non_index_axes name df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) - + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + with ensure_clean_store(self.path) as store: store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) def test_store_multiindex(self): From f5a1d4977d9e89aa072491edfa1a03850e35d1c6 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 1 May 2015 12:07:08 +0200 Subject: [PATCH 10/13] fix: MultiIndex.get_level_values() --- pandas/core/index.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 7451f815c9018..71011f959ae24 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -4534,9 +4534,13 @@ def get_level_values(self, level): unique = self.levels[num] # .values labels = self.labels[num] filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) - values = type(unique)(filled, self.names[num], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + if isinstance(unique, RangeIndex): + _simple_new = Int64Index._simple_new + else: + _simple_new = unique._simple_new + values = _simple_new(filled, self.names[num], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, From 1a142748db820eeea5040f819ada0b3e0969410b Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 1 May 2015 16:04:34 +0200 Subject: [PATCH 11/13] fix: RangeIndex._shallow_copy() --- pandas/core/index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 71011f959ae24..fcc14e28e14db 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3440,8 +3440,8 @@ def _shallow_copy(self, values=None, **kwargs): return RangeIndex(self.start, self.stop, self.step, name=self.name, fastpath=True) else: - name = kwargs.get('name', self.name) - return self._int64index._shallow_copy(values, **kwargs) + name = kwargs.pop('name', self.name) + return self._int64index._shallow_copy(values, name=name, **kwargs) def copy(self, names=None, name=None, dtype=None, deep=False): """ From 449bdbc13be788b5718c4c4e796a8b2d66e39035 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 1 May 2015 22:16:45 +0200 Subject: [PATCH 12/13] reactivate non-problematic methods --- pandas/core/index.py | 382 ++++++++++++++++++++++--------------------- 1 file changed, 194 insertions(+), 188 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index fcc14e28e14db..05b05ef39833e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3421,14 +3421,15 @@ def stop(self): def step(self): return self._step - #@cache_readonly(allow_setting=True) - #def is_unique(self): - # """ return if the index has unique values """ - # return True + # TODO: make read-only + @cache_readonly(allow_setting=True) + def is_unique(self): + """ return if the index has unique values """ + return True - #@property - #def has_duplicates(self): - # return not self.is_unique + @property + def has_duplicates(self): + return not self.is_unique def tolist(self): return list(range(self.start, self.stop, self.step)) @@ -3441,6 +3442,7 @@ def _shallow_copy(self, values=None, **kwargs): name=self.name, fastpath=True) else: name = kwargs.pop('name', self.name) + # TODO: check a test exists which checks for name preservation return self._int64index._shallow_copy(values, name=name, **kwargs) def copy(self, names=None, name=None, dtype=None, deep=False): @@ -3469,15 +3471,16 @@ def copy(self, names=None, name=None, dtype=None, deep=False): name = self.name return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) - #def argsort(self, *args, **kwargs): - # """ - # return an ndarray indexer of the underlying data + # TODO: return arange instead of sorting + def argsort(self, *args, **kwargs): + """ + return an ndarray indexer of the underlying data - # See also - # -------- - # numpy.ndarray.argsort - # """ - # return self._data.argsort(*args, **kwargs) + See also + -------- + numpy.ndarray.argsort + """ + return self._data.argsort(*args, **kwargs) def __repr__(self): attrs = [('start', default_pprint(self.start)), @@ -3517,29 +3520,32 @@ def __unicode__(self): start, stop, step, name) return res - #def equals(self, other): - # """ - # Determines if two Index objects contain the same elements. - # """ - # if self.is_(other): - # return True - - # elif isinstance(other, RangeIndex): - # return (self.start == other.start and - # self.stop == other.stop and - # self.step == other.step) - - # try: - # return array_equivalent(_values_from_object(self), - # _values_from_object(other)) - # except TypeError: - # # e.g. fails in numpy 1.6 with DatetimeIndex #1681 - # return False - - #def __reduce__(self): - # d = self._get_attributes_dict() - # return _new_Index, (self.__class__, d), None + # TODO: re-arrange case checking & delegation + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + elif isinstance(other, RangeIndex): + return (self.start == other.start and + self.stop == other.stop and + self.step == other.step) + + try: + return array_equivalent(_values_from_object(self), + _values_from_object(other)) + except TypeError: + # e.g. fails in numpy 1.6 with DatetimeIndex #1681 + return False + + def __reduce__(self): + d = self._get_attributes_dict() + return _new_Index, (self.__class__, d), None + + # @jreback: Is this ok? Not sure what a "view" of a non-materialised array + # should mean. #def view(self, cls=None): # if cls is None or is_int64_dtype(cls): # return self @@ -3549,78 +3555,78 @@ def __unicode__(self): # result._id = self._id # return result - #def intersection(self, other): - # """ - # Form the intersection of two Index objects. Sortedness of the result is - # not guaranteed - - # Parameters - # ---------- - # other : Index or array-like - - # Returns - # ------- - # intersection : Index - # """ - # if not isinstance(other, RangeIndex): - # return super(RangeIndex, self).intersection(other) - - # # check whether intervals intersect - # # deals with in- and decreasing ranges - # int_low = max(min(self.start, self.stop+1), - # min(other.start, other.stop+1)) - # int_high = min(max(self.stop, self.start+1), - # max(other.stop, other.start+1)) - # if int_high <= int_low: - # return RangeIndex() - - # ### Method hint: linear Diophantine equation - # # solve intersection - # # perf: for identical step sizes, could use cheaper alternative - # gcd, s, t = self._extended_gcd(self.step, other.step) + def intersection(self, other): + """ + Form the intersection of two Index objects. Sortedness of the result is + not guaranteed + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + intersection : Index + """ + if not isinstance(other, RangeIndex): + return super(RangeIndex, self).intersection(other) + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(min(self.start, self.stop+1), + min(other.start, other.stop+1)) + int_high = min(max(self.stop, self.start+1), + max(other.stop, other.start+1)) + if int_high <= int_low: + return RangeIndex() + + ### Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use cheaper alternative + gcd, s, t = self._extended_gcd(self.step, other.step) - # # check whether element sets intersect - # if (self.start - other.start) % gcd: - # return RangeIndex() + # check whether element sets intersect + if (self.start - other.start) % gcd: + return RangeIndex() - # # calculate parameters for the RangeIndex describing the intersection - # # disregarding the lower bounds - # tmp_start = self.start + (other.start-self.start)*self.step//gcd*s - # new_step = self.step * other.step // gcd - # new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) - - # # adjust index to limiting interval - # new_index._start = new_index._min_fitting_element(int_low) - # return new_index - - #def _min_fitting_element(self, lower_limit): - # """Returns the value of the smallest element greater than the limit""" - # round = ceil if self.step > 0 else floor - # no_steps = round( (float(lower_limit)-self.start) / self.step ) - # return self.start + self.step * no_steps - - #def _max_fitting_element(self, upper_limit): - # """Returns the value of the largest element smaller than the limit""" - # round = floor if self.step > 0 else ceil - # no_steps = round( (float(upper_limit)-self.start) / self.step ) - # return self.start + self.step * no_steps - - #def _extended_gcd(self, a, b): - # """ - # Extended Euclidean algorithms to solve Bezout's identity: - # a*x + b*y = gcd(x, y) - # Finds one particular solution for x, y: s, t - # Returns: gcd, s, t - # """ - # s, old_s = 0, 1 - # t, old_t = 1, 0 - # r, old_r = b, a - # while r: - # quotient = old_r // r - # old_r, r = r, old_r - quotient * r - # old_s, s = s, old_s - quotient * s - # old_t, t = t, old_t - quotient * t - # return old_r, old_s, old_t + # calculate parameters for the RangeIndex describing the intersection + # disregarding the lower bounds + tmp_start = self.start + (other.start-self.start)*self.step//gcd*s + new_step = self.step * other.step // gcd + new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + + # adjust index to limiting interval + new_index._start = new_index._min_fitting_element(int_low) + return new_index + + def _min_fitting_element(self, lower_limit): + """Returns the value of the smallest element greater than the limit""" + round = ceil if self.step > 0 else floor + no_steps = round( (float(lower_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def _max_fitting_element(self, upper_limit): + """Returns the value of the largest element smaller than the limit""" + round = floor if self.step > 0 else ceil + no_steps = round( (float(upper_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def _extended_gcd(self, a, b): + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t def union(self, other): """ @@ -3637,86 +3643,86 @@ def union(self, other): # note: could return a RangeIndex in some circumstances return self._int64index.union(other) - #def join(self, other, how='left', level=None, return_indexers=False): - # """ - # *this is an internal non-public method* - - # Compute join_index and indexers to conform data - # structures to the new index. - - # Parameters - # ---------- - # other : Index - # how : {'left', 'right', 'inner', 'outer'} - # level : int or level name, default None - # return_indexers : boolean, default False - - # Returns - # ------- - # join_index, (left_indexer, right_indexer) - # """ - # if how == 'outer' and self is not other: - # # note: could return RangeIndex in more circumstances - # return self._int64index.join(other, how, level, return_indexers) - - # return super(RangeIndex, self).join(other, how, level, return_indexers) - - #def _mul(self, other): - # "__mul__() implementation" - # try: - # int_input = other == int(other) - # if int_input: - # other = int(other) - # except Exception: - # int_input = False - - # if int_input == True and other != 0: - # return RangeIndex(self.start*other, self.stop*other, self.step*other, - # fastpath=True) - # else: - # return super(RangeIndex, self).__mul__(other) - - #def __len__(self): - # """ - # return the length of the RangeIndex - # """ - # return (self.stop-self.start) // self.step - - #@property - #def size(self): - # return len(self) - - #def __getitem__(self, key): - # """ - # Conserve RangeIndex type for scalar and slice keys. - # """ - # super_getitem = super(RangeIndex, self).__getitem__ - - # if np.isscalar(key): - # n = int(key) - # if n != key: - # return super_getitem(key) - # if n < 0: - # n = len(self) + key - # if n < 0 or n > len(self)-1: - # raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) - # return self.start + n * self.step - - # if isinstance(key, slice): - # # complete missing slice information - # n_start = 0 if key.start is None else key.start - # n_stop = len(self)+1 if key.stop is None else key.stop - # n_step = 1 if key.step is None else key.step - - # # delegate non-integer slices - # if (n_start != int(n_start) and - # n_stop != int(n_stop) and - # n_step != int(n_step)): - # return super_getitem(key) - - # # deal with index wrap-around - # n_start = len(self)+n_start if n_start < 0 else n_start - # n_stop = len(self)+n_stop if n_stop < 0 else n_stop + def join(self, other, how='left', level=None, return_indexers=False): + """ + *this is an internal non-public method* + + Compute join_index and indexers to conform data + structures to the new index. + + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + if how == 'outer' and self is not other: + # note: could return RangeIndex in more circumstances + return self._int64index.join(other, how, level, return_indexers) + + return super(RangeIndex, self).join(other, how, level, return_indexers) + + def _mul(self, other): + "__mul__() implementation" + try: + int_input = other == int(other) + if int_input: + other = int(other) + except Exception: + int_input = False + + if int_input == True and other != 0: + return RangeIndex(self.start*other, self.stop*other, self.step*other, + fastpath=True) + else: + return super(RangeIndex, self).__mul__(other) + + def __len__(self): + """ + return the length of the RangeIndex + """ + return (self.stop-self.start) // self.step + + @property + def size(self): + return len(self) + + def __getitem__(self, key): + """ + Conserve RangeIndex type for scalar and slice keys. + """ + super_getitem = super(RangeIndex, self).__getitem__ + + if np.isscalar(key): + n = int(key) + if n != key: + return super_getitem(key) + if n < 0: + n = len(self) + key + if n < 0 or n > len(self)-1: + raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) + return self.start + n * self.step + + if isinstance(key, slice): + # complete missing slice information + n_start = 0 if key.start is None else key.start + n_stop = len(self) if key.stop is None else key.stop + n_step = 1 if key.step is None else key.step + + # delegate non-integer slices + if (n_start != int(n_start) and + n_stop != int(n_stop) and + n_step != int(n_step)): + return super_getitem(key) + + # deal with index wrap-around + n_start = len(self)+n_start if n_start < 0 else n_start + n_stop = len(self)+n_stop if n_stop < 0 else n_stop # # convert indexes to values @@ -3731,7 +3737,7 @@ def union(self, other): # return super_getitem(key) RangeIndex._add_numeric_methods() -#RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul +RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul RangeIndex._add_logical_methods() From 11111bf3af81466f8ef4d689cc93f85dabf92c12 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 2 May 2015 00:05:43 +0200 Subject: [PATCH 13/13] fix inconsistent commenting-out --- pandas/core/index.py | 62 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 05b05ef39833e..4b63322bb516c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3692,37 +3692,37 @@ def __len__(self): def size(self): return len(self) - def __getitem__(self, key): - """ - Conserve RangeIndex type for scalar and slice keys. - """ - super_getitem = super(RangeIndex, self).__getitem__ - - if np.isscalar(key): - n = int(key) - if n != key: - return super_getitem(key) - if n < 0: - n = len(self) + key - if n < 0 or n > len(self)-1: - raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) - return self.start + n * self.step - - if isinstance(key, slice): - # complete missing slice information - n_start = 0 if key.start is None else key.start - n_stop = len(self) if key.stop is None else key.stop - n_step = 1 if key.step is None else key.step - - # delegate non-integer slices - if (n_start != int(n_start) and - n_stop != int(n_stop) and - n_step != int(n_step)): - return super_getitem(key) - - # deal with index wrap-around - n_start = len(self)+n_start if n_start < 0 else n_start - n_stop = len(self)+n_stop if n_stop < 0 else n_stop + #def __getitem__(self, key): + # """ + # Conserve RangeIndex type for scalar and slice keys. + # """ + # super_getitem = super(RangeIndex, self).__getitem__ + + # if np.isscalar(key): + # n = int(key) + # if n != key: + # return super_getitem(key) + # if n < 0: + # n = len(self) + key + # if n < 0 or n > len(self)-1: + # raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) + # return self.start + n * self.step + + # if isinstance(key, slice): + # # complete missing slice information + # n_start = 0 if key.start is None else key.start + # n_stop = len(self) if key.stop is None else key.stop + # n_step = 1 if key.step is None else key.step + + # # delegate non-integer slices + # if (n_start != int(n_start) and + # n_stop != int(n_stop) and + # n_step != int(n_step)): + # return super_getitem(key) + + # # deal with index wrap-around + # n_start = len(self)+n_start if n_start < 0 else n_start + # n_stop = len(self)+n_stop if n_stop < 0 else n_stop # # convert indexes to values