From 9c37b1fa1766b8965ab5c9e2b2c9dd6586767dea Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 24 Apr 2015 13:11:37 +0200 Subject: [PATCH 1/6] Introduction of RangeIndex `RangeIndex(1, 10, 2)` is a memory saving alternative to `Index(np.arange(1, 10,2))`: c.f. #939. This re-implementation is compatible with the current `Index()` api and is a drop-in replacement for `Int64Index()`. It automatically converts to Int64Index() when required by operations. At present only for a minimum number of operations the type is conserved (e.g. slicing, inner-, left- and right-joins). Most other operations trigger creation of an equivalent Int64Index (or at least an equivalent numpy array) and fall back to its implementation. This PR also extends the functionality of the `Index()` constructor to allow creation of `RangeIndexes()` with ``` Index(20) Index(2, 20) Index(0, 20, 2) ``` in analogy to ``` range(20) range(2, 20) range(0, 20, 2) ``` restore Index() fastpath precedence Various fixes suggested by @jreback and @shoyer Cache a private Int64Index object the first time it or its values are required. Restore Index(5) as error. Restore its test. Allow Index(0, 5) and Index(0, 5, 1). Make RangeIndex immutable. See start, stop, step properties. In test_constructor(): check class, attributes (possibly including dtype). In test_copy(): check that copy is not identical (but equal) to the existing. In test_duplicates(): Assert is_unique and has_duplicates return correct values. fix slicing fix view Set RangeIndex as default index * enh: set RangeIndex as default index * fix: pandas.io.packers: encode() and decode() for RangeIndex * enh: array argument pass-through * fix: reindex * fix: use _default_index() in pandas.core.frame.extract_index() * fix: pandas.core.index.Index._is() * fix: add RangeIndex to ABCIndexClass * fix: use _default_index() in _get_names_from_index() * fix: pytables tests * fix: MultiIndex.get_level_values() * fix: RangeIndex._shallow_copy() * fix: null-size RangeIndex equals() comparison * enh: make RangeIndex.is_unique immutable enh: various performance optimizations * optimize argsort() * optimize tolist() * comment clean-up --- pandas/core/api.py | 2 +- pandas/core/common.py | 11 +- pandas/core/frame.py | 6 +- pandas/core/index.py | 467 ++++++++++++++++++++++++- pandas/io/packers.py | 13 +- pandas/io/tests/test_pytables.py | 16 +- pandas/tests/test_index.py | 580 ++++++++++++++++++++++++++++++- 7 files changed, 1068 insertions(+), 27 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index e2ac57e37cba6..81368a91ee218 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, RangeIndex, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/common.py b/pandas/core/common.py index b80b7eecaeb11..70c11f0b8c323 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -84,6 +84,8 @@ def _check(cls, inst): ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index", )) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", + ("rangeindex", )) ABCFloat64Index = 
create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index", )) ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", @@ -97,7 +99,8 @@ def _check(cls, inst): ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex", )) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", - ("index", "int64index", "float64index", + ("index", "int64index", "rangeindex", + "float64index", "multiindex", "datetimeindex", "timedeltaindex", "periodindex", "categoricalindex")) @@ -1796,10 +1799,8 @@ def is_bool_indexer(key): def _default_index(n): - from pandas.core.index import Int64Index - values = np.arange(n, dtype=np.int64) - result = Int64Index(values, name=None) - result.is_unique = True + from pandas.core.index import RangeIndex + result = RangeIndex(0, int(n), name=None) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 273166db12142..7f53e08b7c38b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5325,7 +5325,7 @@ def extract_index(data): (lengths[0], len(index))) raise ValueError(msg) else: - index = Index(np.arange(lengths[0])) + index = _default_index(lengths[0]) return _ensure_index(index) @@ -5538,11 +5538,11 @@ def convert(arr): def _get_names_from_index(data): - index = lrange(len(data)) has_some_name = any([getattr(s, 'name', None) is not None for s in data]) if not has_some_name: - return index + return _default_index(len(data)) + index = lrange(len(data)) count = 0 for i, s in enumerate(data): n = getattr(s, 'name', None) diff --git a/pandas/core/index.py b/pandas/core/index.py index 3832d0c69ed0e..bd36bae68bb38 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3,6 +3,8 @@ import warnings import operator from functools import partial +from math import ceil, floor + from sys import getsizeof import numpy as np @@ -12,7 +14,7 @@ import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array -from pandas.compat import range, zip, lrange, lzip, u, map +from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat from pandas.core import algorithms from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate @@ -24,9 +26,10 @@ from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, is_datetimetz, ABCSeries, ABCCategorical, ABCPeriodIndex, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, - _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) + ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, + is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, is_int64_dtype) from pandas.core.strings import StringAccessorMixin + from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -127,12 +130,30 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if fastpath: return cls._simple_new(data, name) + if isinstance(data, int) and isinstance(dtype, int): + if copy == False: + copy = None + range_constructor = True + elif isinstance(copy, int): + range_constructor = True + + if range_constructor: + return RangeIndex(data, dtype, copy, name) + + # no class inference! 
+ if fastpath: + return cls._simple_new(data, name) + if is_categorical_dtype(data) or is_categorical_dtype(dtype): return CategoricalIndex(data, copy=copy, name=name, **kwargs) if isinstance(data, (np.ndarray, Index, ABCSeries)): - if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): + if (isinstance(data, RangeIndex) and + (dtype is None or is_int64_dtype(dtype))): + # copy passed-in RangeIndex + return data.copy(name=name) + elif issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: @@ -327,7 +348,7 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr(other, '_id', Ellipsis) + return self._id is getattr(other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): """Initializes or resets ``_id`` attribute with new object""" @@ -2158,7 +2179,9 @@ def reindex(self, target, method=None, level=None, limit=None, # GH7774: preserve dtype/tz if target is empty and not an Index. target = _ensure_has_len(target) # target may be an iterator - if not isinstance(target, Index) and len(target) == 0: + if isinstance(self, RangeIndex) and len(target) == 0: + target = self._simple_new(0, 0, 1, name=self.name) + elif not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop('freq', None) # don't preserve freq target = self._simple_new(None, dtype=self.dtype, **attrs) @@ -3828,6 +3851,426 @@ def _wrap_joined_index(self, joined, other): Int64Index._add_logical_methods() +class RangeIndex(Int64Index): + + """ + Immutable Index implementing an monotonic range. RangeIndex is a + memory-saving special case of `Int64Index` limited to representing + monotonic ranges. 
+ + Parameters + ---------- + start : int (default: 0) + stop : int (default: 0) + step : int (default: 1) + name : object, optional + Name to be stored in the index + """ + + _typ = 'rangeindex' + _engine_type = _index.Int64Engine + _attributes = ['name', 'start', 'stop', 'step'] + + def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False, **kwargs): + if fastpath: + return cls._simple_new(start, stop, step, name=name) + + # cheap check for array input + if len(kwargs) > 0: + return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) + + # RangeIndex() constructor + if start is None and stop is None and step is None: + return cls._simple_new(0, 0, 1, name=name) + + new_start, new_stop, new_step = None, None, None + # sort the arguments depending on which are provided + if step is None: + new_step = 1 + if stop is None: + new_stop = start + new_start = 0 + + try: + # check validity of inputs + new_start = start if new_start is None else new_start + new_stop = stop if new_stop is None else new_stop + new_step = step if new_step is None else new_step + new_start = cls._ensure_int(new_start) + new_stop = cls._ensure_int(new_stop) + new_step = cls._ensure_int(new_step) + if new_step == 0: + raise ValueError("Step must not be zero") + return cls._simple_new(new_start, new_stop, new_step, name) + except TypeError: + # pass all invalid inputs to Int64Index to handle + return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) + + @classmethod + def _simple_new(cls, start, stop, step, name=None): + result = object.__new__(cls) + result._start = start + result._stop = stop + result._step = step + result.name = name + return result + + @classmethod + def _data_passthrough(cls, data, dtype, copy, name, fastpath, **kwargs): + kwargs.setdefault('data', data) + kwargs.setdefault('dtype', dtype) + if copy is not None: + kwargs.setdefault('copy', copy) + kwargs.setdefault('name', name) + kwargs.setdefault('fastpath', fastpath) + return Int64Index(**kwargs) + + @classmethod + def _ensure_int(cls, value): + try: + int_value = int(value) + # don't allow casting 1-element arrays to int! + if int_value != value or hasattr(value, '__len__'): + raise Exception + except Exception: + raise TypeError("Need to pass integral values") + return int_value + + @cache_readonly + def _data(self): + return np.arange(self.start, self.stop, self.step, dtype=np.int64) + + @cache_readonly + def _int64index(self): + return Int64Index(self._data, name=self.name, fastpath=True) + + @property + def dtype(self): + return np.dtype(np.int64) + + @property + def start(self): + return self._start + + @property + def stop(self): + return self._stop + + @property + def step(self): + return self._step + + @property + def is_unique(self): + """ return if the index has unique values """ + return True + + @property + def has_duplicates(self): + return False + + def tolist(self): + return lrange(self.start, self.stop, self.step) + + def _shallow_copy(self, values=None, **kwargs): + """ create a new Index, don't copy the data, use the same object attributes + with passed in attributes taking precedence """ + if values is None: + return RangeIndex(self.start, self.stop, self.step, + name=self.name, fastpath=True) + else: + kwargs.setdefault('name', self.name) + return self._int64index._shallow_copy(values, **kwargs) + + def copy(self, names=None, name=None, dtype=None, deep=False): + """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. 
+ + Parameters + ---------- + name : string, optional + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + """ + if dtype is not None and not is_int64_dtype(dtype): + return super(RangeIndex, self).copy(names, name, dtype, deep) + + if name is None: + name = self.name + return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) + + def argsort(self, *args, **kwargs): + """ + return an ndarray indexer of the underlying data + + See also + -------- + numpy.ndarray.argsort + """ + if self.step > 0: + return np.arange(len(self)) + else: + return np.arange(len(self)-1, -1, -1) + + def __repr__(self): + attrs = [('start', default_pprint(self.start)), + ('stop', default_pprint(self.stop)), + ('step', default_pprint(self.step)), + ('name', default_pprint(self.name))] + + prepr = u(", ").join([u("%s=%s") % (k, v) + for k, v in attrs]) + res = u("%s(%s)") % (self.__class__.__name__, prepr) + + if not compat.PY3: + # needs to be str in Python 2 + encoding = get_option('display.encoding') + res = res.encode(encoding) + return res + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + if self.start != 0 or self.step != 1: + start = u('%s, ') % default_pprint(self.start) + else: + start = u('') + stop = default_pprint(self.stop) + step = u('') if self.step == 1 else u(', %s') % default_pprint(self.step) + if self.name is None: + name = u('') + else: + name = u(', name=%s') % default_pprint(self.name) + + res = u("%s(%s%s%s%s)") % (self.__class__.__name__, + start, stop, step, name) + return res + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if isinstance(other, RangeIndex): + return (len(self) == len(other) == 0 + or (self.start == other.start and + self.stop == other.stop and + self.step == other.step) + ) + + return super(RangeIndex, self).equals(other) + + def __reduce__(self): + d = self._get_attributes_dict() + return _new_Index, (self.__class__, d), None + + def view(self, cls=None): + if cls is None or hasattr(cls,'_typ') or is_int64_dtype(cls): + result = self._shallow_copy() + else: + result = self._data.view(cls) + if isinstance(result, Index): + result._id = self._id + return result + + def intersection(self, other): + """ + Form the intersection of two Index objects. 
Sortedness of the result is + not guaranteed + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + intersection : Index + """ + if not isinstance(other, RangeIndex): + return super(RangeIndex, self).intersection(other) + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(min(self.start, self.stop+1), + min(other.start, other.stop+1)) + int_high = min(max(self.stop, self.start+1), + max(other.stop, other.start+1)) + if int_high <= int_low: + return RangeIndex() + + ### Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use cheaper alternative + gcd, s, t = self._extended_gcd(self.step, other.step) + + # check whether element sets intersect + if (self.start - other.start) % gcd: + return RangeIndex() + + # calculate parameters for the RangeIndex describing the intersection + # disregarding the lower bounds + tmp_start = self.start + (other.start-self.start)*self.step//gcd*s + new_step = self.step * other.step // gcd + new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + + # adjust index to limiting interval + new_index._start = new_index._min_fitting_element(int_low) + return new_index + + def _min_fitting_element(self, lower_limit): + """Returns the value of the smallest element greater than the limit""" + round = ceil if self.step > 0 else floor + no_steps = round( (float(lower_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def _max_fitting_element(self, upper_limit): + """Returns the value of the largest element smaller than the limit""" + round = floor if self.step > 0 else ceil + no_steps = round( (float(upper_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def _extended_gcd(self, a, b): + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + union : Index + """ + # note: could return a RangeIndex in some circumstances + return self._int64index.union(other) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + *this is an internal non-public method* + + Compute join_index and indexers to conform data + structures to the new index. 
+ + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + if how == 'outer' and self is not other: + # note: could return RangeIndex in more circumstances + return self._int64index.join(other, how, level, return_indexers) + + return super(RangeIndex, self).join(other, how, level, return_indexers) + + def _mul(self, other): + "__mul__() implementation" + try: + int_input = other == int(other) + if int_input: + other = int(other) + except Exception: + int_input = False + + if int_input == True and other != 0: + return RangeIndex(self.start*other, self.stop*other, self.step*other, + fastpath=True) + else: + return super(RangeIndex, self).__mul__(other) + + def __len__(self): + """ + return the length of the RangeIndex + """ + return max(0, (self.stop-self.start) // self.step) + + @property + def size(self): + return len(self) + + def __getitem__(self, key): + """ + Conserve RangeIndex type for scalar and slice keys. + """ + super_getitem = super(RangeIndex, self).__getitem__ + + if np.isscalar(key): + n = int(key) + if n != key: + return super_getitem(key) + if n < 0: + n = len(self) + key + if n < 0 or n > len(self)-1: + raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) + return self.start + n * self.step + + if isinstance(key, slice): + + # complete missing slice information + n_start = 0 if key.start is None else key.start + n_stop = len(self)+1 if key.stop is None else key.stop + n_step = 1 if key.step is None else key.step + + # delegate non-integer slices + if (n_start != int(n_start) and + n_stop != int(n_stop) and + n_step != int(n_step)): + return super_getitem(key) + + # deal with index wrap-around + n_start = len(self)+n_start if n_start < 0 else n_start + n_stop = len(self)+n_stop if n_stop < 0 else n_stop + + + # convert indexes to values + start = self.start + self.step * start + stop = self.start + self.step * stop + step = self.step * step + + return RangeIndex(start, stop, step, self.name, fastpath=True) + + # fall back to Int64Index + return super_getitem(key) + +RangeIndex._add_numeric_methods() +RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul +RangeIndex._add_logical_methods() + + class Float64Index(NumericIndex): """ @@ -4658,10 +5101,14 @@ def get_level_values(self, level): num = self._get_level_number(level) unique = self.levels[num] # .values labels = self.labels[num] - filled = com.take_1d(unique._values, labels, fill_value=unique._na_value) - values = unique._simple_new(filled, self.names[num], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) + if isinstance(unique, RangeIndex): + _simple_new = Int64Index._simple_new + else: + _simple_new = unique._simple_new + values = _simple_new(filled, self.names[num], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, diff --git a/pandas/io/packers.py b/pandas/io/packers.py index d5c02736a1cf5..57d5bd44a681d 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,7 +49,7 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + Index, MultiIndex, Int64Index, 
RangeIndex, PeriodIndex, DatetimeIndex, Float64Index, NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -273,7 +273,14 @@ def encode(obj): tobj = type(obj) if isinstance(obj, Index): - if isinstance(obj, PeriodIndex): + if isinstance(obj, RangeIndex): + return {'typ': 'range_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'start': getattr(obj, 'start', None), + 'stop': getattr(obj, 'stop', None), + 'step': getattr(obj, 'step', None)} + elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), @@ -464,6 +471,8 @@ def decode(obj): data = unconvert(obj['data'], dtype, obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) + elif typ == 'range_index': + return globals()[obj['klass']](obj['start'], obj['stop'], obj['step'], name=obj['name']) elif typ == 'multi_index': dtype = dtype_for(obj['dtype']) data = unconvert(obj['data'], dtype, diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index c13afb34dfb84..a17d94d1f3ebf 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Categorical, bdate_range, +from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) from pandas.compat import is_platform_windows, PY3, PY35 @@ -1621,14 +1621,17 @@ def test_column_multiindex(self): index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) df = DataFrame(np.arange(12).reshape(3,4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: store.put('df',df) - tm.assert_frame_equal(store['df'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df'],expected,check_index_type=True,check_column_type=True) store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) @@ -1642,11 +1645,14 @@ def test_column_multiindex(self): # non_index_axes name df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) - + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + with ensure_clean_store(self.path) as store: store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) def test_store_multiindex(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index d0c2d2bd15b4e..27a6f73012039 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,7 +12,7 @@ import numpy as np from pandas import (period_range, date_range, Categorical, Series, - Index, Float64Index, Int64Index, MultiIndex, + Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, 
TimedeltaIndex, PeriodIndex) from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, @@ -3432,6 +3432,584 @@ def test_ufunc_coercions(self): exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') tm.assert_index_equal(result, exp) +class TestRangeIndex(Numeric, tm.TestCase): + _holder = RangeIndex + + def setUp(self): + self.indices = dict(index = RangeIndex(0, 20, 2)) + self.setup_indices() + + def create_index(self): + return RangeIndex(5) + + def test_too_many_names(self): + def testit(): + self.index.names = ["roger", "harold"] + assertRaisesRegexp(ValueError, "^Length", testit) + + def test_constructor(self): + index = RangeIndex(5) + expected = np.arange(5, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 0) + self.assertEqual(index.stop, 5) + self.assertEqual(index.step, 1) + self.assertEqual(index.name, None) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex(1, 5) + expected = np.arange(1, 5, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 1) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex(1, 5, 2) + expected = np.arange(1, 5, 2, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.step, 2) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex() + expected = np.empty(0, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 0) + self.assertEqual(index.stop, 0) + self.assertEqual(index.step, 1) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex(name='Foo') + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.name, 'Foo') + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = RangeIndex(1, 5) + self.assertEqual(index.values.dtype, np.int64) + self.assertTrue(index.equals(arr)) + + # non-int raise Exception + self.assertRaises(TypeError, RangeIndex, '1', '10', '1') + self.assertRaises(TypeError, RangeIndex, 1.1, 10.2, 1.3) + + def test_copy(self): + i = RangeIndex(5, name='Foo') + i_copy = i.copy() + self.assertTrue(i_copy is not i) + self.assertTrue(i_copy.identical(i)) + self.assertEqual(i_copy.start, 0) + self.assertEqual(i_copy.stop, 5) + self.assertEqual(i_copy.step, 1) + self.assertEqual(i_copy.name, 'Foo') + + def test_view(self): + super(TestRangeIndex, self).test_view() + + i = RangeIndex(name='Foo') + i_view = i.view() + self.assertEqual(i_view.name, 'Foo') + + i_view = i.view('i8') + tm.assert_index_equal(i, i_view) + + i_view = i.view(RangeIndex) + tm.assert_index_equal(i, i_view) + + def test_index_constructor(self): + arr = Index(0, 5) + tm.assert_isinstance(arr, RangeIndex) + + def test_dtype(self): + self.assertEqual(self.index.dtype, np.int64) + + def test_is_monotonic(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_monotonic_increasing) + self.assertFalse(self.index.is_monotonic_decreasing) + + index = RangeIndex(4, 0, -1) + self.assertFalse(index.is_monotonic) + self.assertTrue(index.is_monotonic_decreasing) + + index = RangeIndex(1, 2) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + def test_equals(self): + same_values = Index(self.index, dtype=object) + self.assertTrue(self.index.equals(same_values)) + self.assertTrue(same_values.equals(self.index)) + + def 
test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_identical(self): + i = Index(self.index.copy()) + self.assertTrue(i.identical(self.index)) + + same_values_different_type = Index(i, dtype=object) + self.assertFalse(i.identical(same_values_different_type)) + + i = self.index.copy(dtype=object) + i = i.rename('foo') + same_values = Index(i, dtype=object) + self.assertTrue(same_values.identical(self.index.copy(dtype=object))) + + self.assertFalse(i.identical(self.index)) + self.assertTrue(Index(same_values, name='foo', dtype=object + ).identical(i)) + + self.assertFalse( + self.index.copy(dtype=object) + .identical(self.index.copy(dtype='int64'))) + + def test_get_indexer(self): + target = RangeIndex(10) + indexer = self.index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_pad(self): + target = RangeIndex(10) + indexer = self.index.get_indexer(target, method='pad') + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_backfill(self): + target = RangeIndex(10) + indexer = self.index.get_indexer(target, method='backfill') + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + self.assert_numpy_array_equal(indexer, expected) + + def test_join_outer(self): + ### join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assertTrue(res.equals(noidx_res)) + + eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) + elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, -1, -1, -1, -1, -1, -1, -1], + dtype=np.int64) + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + dtype=np.int64) + + tm.assert_isinstance(res, Int64Index) + self.assertFalse(isinstance(res, RangeIndex)) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### join with RangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assertTrue(res.equals(noidx_res)) + + tm.assert_isinstance(res, Int64Index) + self.assertFalse(isinstance(res, RangeIndex)) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_inner(self): + ### Join with non-RangeIndex + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Int64Index([16, 18]) + elidx = np.array([8, 9]) + eridx = np.array([9, 7]) + + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### Join two RangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + 
self.assert_numpy_array_equal(ridx, eridx) + + + def test_join_left(self): + ### Join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + eres = self.index + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], + dtype=np.int64) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### Join withRangeIndex + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self): + ### Join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], + dtype=np.int64) + + tm.assert_isinstance(other, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + ### Join withRangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + + tm.assert_isinstance(other, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + def test_join_non_int_index(self): + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = self.index.join(other, how='outer') + outer2 = other.join(self.index, how='outer') + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, + 16, 18], dtype=object) + self.assertTrue(outer.equals(outer2)) + self.assertTrue(outer.equals(expected)) + + inner = self.index.join(other, how='inner') + inner2 = other.join(self.index, how='inner') + expected = Index([6, 8, 10], dtype=object) + self.assertTrue(inner.equals(inner2)) + self.assertTrue(inner.equals(expected)) + + left = self.index.join(other, how='left') + self.assertTrue(left.equals(self.index)) + + left2 = other.join(self.index, how='left') + self.assertTrue(left2.equals(other)) + + right = self.index.join(other, how='right') + self.assertTrue(right.equals(other)) + + right2 = other.join(self.index, how='right') + self.assertTrue(right2.equals(self.index)) + + def test_join_non_unique(self): + other = Index([4, 4, 3, 3]) + + res, lidx, ridx = self.index.join(other, return_indexers=True) + + eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) + elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int64) + eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.int64) + + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_self(self): + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = self.index.join(self.index, how=kind) + self.assertIs(self.index, joined) + + def test_intersection(self): + ### intersect with Int64Index + other = Index(np.arange(1, 6)) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + result = other.intersection(self.index) + expected = np.sort(np.asarray(np.intersect1d(self.index.values, + other.values))) + self.assert_numpy_array_equal(result, expected) + + ### intersect with 
increasing RangeIndex + other = Index(1, 6) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + ### intersect with decreasing RangeIndex + other = Index(5, 0, -1) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + def test_intersect_str_dates(self): + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(['aa'], dtype=object) + res = i2.intersection(i1) + + self.assertEqual(len(res), 0) + + def test_union_noncomparable(self): + from datetime import datetime, timedelta + # corner case, non-Int64Index + now = datetime.now() + other = Index([now + timedelta(i) for i in range(4)], dtype=object) + result = self.index.union(other) + expected = np.concatenate((self.index, other)) + self.assert_numpy_array_equal(result, expected) + + result = other.union(self.index) + expected = np.concatenate((other, self.index)) + self.assert_numpy_array_equal(result, expected) + + def test_cant_or_shouldnt_cast(self): + # can't + self.assertRaises(TypeError, RangeIndex, 'foo', 'bar', 'baz') + + # shouldn't + self.assertRaises(TypeError, RangeIndex, '0', '1', '2') + + def test_view_Index(self): + self.index.view(Index) + + def test_prevent_casting(self): + result = self.index.astype('O') + self.assertEqual(result.dtype, np.object_) + + def test_take_preserve_name(self): + index = RangeIndex(1, 5, name='foo') + taken = index.take([3, 0, 1]) + self.assertEqual(index.name, taken.name) + + def test_int_name_format(self): + from pandas import Series, DataFrame + index = Index(0, 3, name=0) + s = Series(lrange(3), index) + df = DataFrame(lrange(3), index=index) + repr(s) + repr(df) + + def test_print_unicode_columns(self): + df = pd.DataFrame( + {u("\u05d0"): [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) + repr(df.columns) # should not raise UnicodeDecodeError + + def test_repr_roundtrip(self): + tm.assert_index_equal(eval(repr(self.index)), self.index) + + def test_unicode_string_with_unicode(self): + idx = Index(0, 1000) + + if compat.PY3: + str(idx) + else: + compat.text_type(idx) + + def test_bytestring_with_unicode(self): + idx = Index(0, 1000) + if compat.PY3: + bytes(idx) + else: + str(idx) + + def test_slice_keep_name(self): + idx = RangeIndex(1, 2, name='asdf') + self.assertEqual(idx.name, idx[1:].name) + + def test_numeric_compat(self): + idx = RangeIndex(5) + didx = Index(np.arange(5,dtype='int64')**2) + + # note: special cases of the following could return RangeIndex + # see _mul() example + + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx * idx + tm.assert_index_equal(result, didx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5,dtype='int64') + tm.assert_index_equal(result, Index(np.arange(5,dtype='int64')*5)) + + result = idx * np.arange(5,dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='float64')+0.1) + tm.assert_index_equal(result, + Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) + + # invalid + self.assertRaises(TypeError, lambda : idx * date_range('20130101',periods=5)) 
+ self.assertRaises(ValueError, lambda : idx * self._holder(3)) + self.assertRaises(ValueError, lambda : idx * np.array([1,2])) + + def test_explicit_conversions(self): + + # GH 8608 + # add/sub are overriden explicity for Float/Int Index + idx = RangeIndex(5) + + # float conversions + arr = np.arange(5,dtype='int64')*3.2 + expected = Float64Index(arr) + fidx = idx * 3.2 + tm.assert_index_equal(fidx,expected) + fidx = 3.2 * idx + tm.assert_index_equal(fidx,expected) + + # interops with numpy arrays + expected = Float64Index(arr) + a = np.zeros(5,dtype='float64') + result = fidx - a + tm.assert_index_equal(result,expected) + + expected = Float64Index(-arr) + a = np.zeros(5,dtype='float64') + result = a - fidx + tm.assert_index_equal(result,expected) + + def test_duplicates(self): + for ind in self.indices: + if not len(ind): + continue + idx = self.indices[ind] + self.assertTrue(idx.is_unique) + self.assertFalse(idx.has_duplicates) + + def test_ufunc_compat(self): + idx = RangeIndex(5) + result = np.sin(idx) + expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) + tm.assert_index_equal(result, expected) + + def test_extended_gcd(self): + result = self.index._extended_gcd(6, 10) + self.assertEqual(result[0], result[1]*6 + result[2]*10) + self.assertEqual(2, result[0]) + + result = self.index._extended_gcd(10, 6) + self.assertEqual(2, result[1]*10 + result[2]*6) + self.assertEqual(2, result[0]) + + def test_min_fitting_element(self): + result = RangeIndex(0, 20, 2)._min_fitting_element(1) + self.assertEqual(2, result) + + result = RangeIndex(1, 6)._min_fitting_element(1) + self.assertEqual(1, result) + + result = RangeIndex(18, -2, -2)._min_fitting_element(1) + self.assertEqual(2, result) + + result = RangeIndex(5, 0, -1)._min_fitting_element(1) + self.assertEqual(1, result) + + def test_max_fitting_element(self): + result = RangeIndex(0, 20, 2)._max_fitting_element(17) + self.assertEqual(16, result) + + result = RangeIndex(1, 6)._max_fitting_element(4) + self.assertEqual(4, result) + + result = RangeIndex(18, -2, -2)._max_fitting_element(17) + self.assertEqual(16, result) + + result = RangeIndex(5, 0, -1)._max_fitting_element(4) + self.assertEqual(4, result) + + def test_pickle_compat_construction(self): + # RangeIndex() is a valid constructor + pass + + def test_slice_specialised(self): + # scalar indexing + res = self.index[1] + expected = 2 + self.assertEqual(res, expected) + + res = self.index[-1] + expected = 18 + self.assertEqual(res, expected) + + ### slicing + # slice value completion + index = self.index[:] + expected = self.index + self.assert_numpy_array_equal(index, expected) + + # positive slice values + index = self.index[7:10:2] + expected = np.array([14, 18]) + self.assert_numpy_array_equal(index, expected) + + # negative slice values + index = self.index[-1:-5:-2] + expected = np.array([18, 14]) + self.assert_numpy_array_equal(index, expected) + + # stop overshoot + index = self.index[2:100:4] + expected = np.array([4, 12]) + self.assert_numpy_array_equal(index, expected) + + def test_len_specialised(self): + # TODO: How to test that len is specialised rather than calling + # the parent classes __len__() (which is slow)? + pass + + def test_size_specialised(self): + # TODO: How to test that size is specialised rather than calling + # the parent classes size property (which is slow)? 
+ pass class DatetimeLike(Base): From 1419e8e8b8e0a953792468a3cdee09816972755e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 3 Dec 2015 06:55:19 -0500 Subject: [PATCH 2/6] test fixes, enhancements, and code review --- doc/source/whatsnew/v0.18.0.txt | 34 + pandas/core/api.py | 3 +- pandas/core/common.py | 8 +- pandas/core/dtypes.py | 5 +- pandas/core/index.py | 766 ++++++++++++++--------- pandas/core/series.py | 6 +- pandas/io/packers.py | 15 +- pandas/io/tests/test_json/test_pandas.py | 2 +- pandas/io/tests/test_packers.py | 1 + pandas/io/tests/test_pytables.py | 48 +- pandas/src/reduce.pyx | 8 +- pandas/tests/frame/test_repr_info.py | 5 +- pandas/tests/test_index.py | 609 +++++++++++------- pandas/tests/test_indexing.py | 86 +-- pandas/tests/test_series.py | 6 +- pandas/tests/test_strings.py | 6 +- pandas/tests/test_testing.py | 40 +- pandas/tseries/tests/test_base.py | 6 +- pandas/util/testing.py | 80 ++- 19 files changed, 1092 insertions(+), 642 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index c1f14ce6703a0..05a9d3ac0c861 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -19,6 +19,7 @@ Highlights include: - Window functions are now methods on ``.groupby`` like objects, see :ref:`here `. - ``pd.test()`` top-level nose test runner is available (:issue:`4327`) +- Adding support for a ``RangeIndex`` as a specialized form of the ``Int64Index`` for memory savings, see :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -102,6 +103,39 @@ And multiple aggregations r.agg({'A' : ['mean','std'], 'B' : ['mean','std']}) +.. _whatsnew_0180.enhancements.rangeindex: + +Range Index +^^^^^^^^^^^ + +A ``RangeIndex`` has been added to the ``Int64Index`` sub-classes to support a memory saving alternative for common use cases. This has a similar implementation to the python ``range`` object (``xrange`` in python 2), in that it only stores the start, stop, and step values for the index. It will transparently interact with the user API, converting to ``Int64Index`` if needed. + +This will now be the default constructed index for ``NDFrame`` objects, rather than previous an ``Int64Index``. (:issue:`939`) + +Previous Behavior: + +.. code-block:: python + + In [3]: s = Series(range(1000)) + + In [4]: s.index + Out[4]: + Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + ... + 990, 991, 992, 993, 994, 995, 996, 997, 998, 999], dtype='int64', length=1000) + + In [6]: s.index.nbytes + Out[6]: 8000 + + +New Behavior: + +.. ipython:: python + + s = Series(range(1000)) + s.index + s.index.nbytes + .. 
_whatsnew_0180.enhancements.other: Other enhancements diff --git a/pandas/core/api.py b/pandas/core/api.py index 81368a91ee218..0c463d1a201b9 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,8 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, CategoricalIndex, Int64Index, RangeIndex, Float64Index, MultiIndex +from pandas.core.index import (Index, CategoricalIndex, Int64Index, + RangeIndex, Float64Index, MultiIndex) from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/common.py b/pandas/core/common.py index 70c11f0b8c323..7fae09c83120f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1800,8 +1800,7 @@ def is_bool_indexer(key): def _default_index(n): from pandas.core.index import RangeIndex - result = RangeIndex(0, int(n), name=None) - return result + return RangeIndex(0, n, name=None) def ensure_float(arr): @@ -2200,11 +2199,6 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, (np.datetime64, np.timedelta64))) -def is_int64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.int64) - - def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/dtypes.py b/pandas/core/dtypes.py index 1e358694de63e..e6adbc8500117 100644 --- a/pandas/core/dtypes.py +++ b/pandas/core/dtypes.py @@ -214,5 +214,6 @@ def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, DatetimeTZDtype) and self.unit == other.unit \ - and self.tz == other.tz + return isinstance(other, DatetimeTZDtype) and \ + self.unit == other.unit and \ + str(self.tz) == str(other.tz) diff --git a/pandas/core/index.py b/pandas/core/index.py index bd36bae68bb38..0965472e9834b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -14,20 +14,24 @@ import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array -from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map +from pandas.compat import range, zip, lrange, lzip, u, map from pandas import compat from pandas.core import algorithms -from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, PandasDelegate +from pandas.core.base import (PandasObject, FrozenList, FrozenNDArray, + IndexOpsMixin, PandasDelegate) import pandas.core.base as base from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate, deprecate_kwarg) import pandas.core.common as com from pandas.core.missing import _clean_reindex_fill_method -from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, - is_datetimetz, ABCSeries, ABCCategorical, ABCPeriodIndex, - _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, - ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, is_int64_dtype) +from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, + is_object_dtype, is_datetimetz, ABCSeries, + ABCCategorical, ABCPeriodIndex, + _values_from_object, is_float, is_integer, + is_iterator, is_categorical_dtype, + _ensure_object, _ensure_int64, is_bool_indexer, + is_list_like, is_bool_dtype, is_null_slice, + is_integer_dtype, is_int64_dtype) from 
pandas.core.strings import StringAccessorMixin from pandas.core.config import get_option @@ -126,40 +130,33 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if name is None and hasattr(data, 'name'): name = data.name - # no class inference! if fastpath: return cls._simple_new(data, name) - if isinstance(data, int) and isinstance(dtype, int): - if copy == False: - copy = None - range_constructor = True - elif isinstance(copy, int): - range_constructor = True - - if range_constructor: - return RangeIndex(data, dtype, copy, name) - - # no class inference! - if fastpath: - return cls._simple_new(data, name) + # range + if isinstance(data, RangeIndex): + return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) + elif isinstance(data, range): + return RangeIndex.from_range(data, copy=copy, dtype=dtype, + name=name) + # categorical if is_categorical_dtype(data) or is_categorical_dtype(dtype): return CategoricalIndex(data, copy=copy, name=name, **kwargs) - if isinstance(data, (np.ndarray, Index, ABCSeries)): + # index-like + elif isinstance(data, (np.ndarray, Index, ABCSeries)): + + if issubclass(data.dtype.type, + np.datetime64) or is_datetimetz(data): - if (isinstance(data, RangeIndex) and - (dtype is None or is_int64_dtype(dtype))): - # copy passed-in RangeIndex - return data.copy(name=name) - elif issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: return Index(result.to_pydatetime(), dtype=_o_dtype) else: return result + elif issubclass(data.dtype.type, np.timedelta64): from pandas.tseries.tdi import TimedeltaIndex result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) @@ -348,7 +345,8 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr(other, '_id', Ellipsis) and self._id is not None + return self._id is getattr( + other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): """Initializes or resets ``_id`` attribute with new object""" @@ -476,14 +474,14 @@ def _coerce_scalar_to_index(self, item): """ return Index([item], dtype=self.dtype, **self._get_attributes_dict()) - def copy(self, names=None, name=None, dtype=None, deep=False): - """ + _index_shared_docs['copy'] = """ Make a copy of this object. Name and dtype sets those attributes on the new object. Parameters ---------- name : string, optional + deep : boolean, default False dtype : numpy dtype or pandas type Returns @@ -495,6 +493,10 @@ def copy(self, names=None, name=None, dtype=None, deep=False): In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
""" + + @Appender(_index_shared_docs['copy']) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + names = kwargs.get('names') if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") if deep: @@ -1081,9 +1083,9 @@ def _invalid_indexer(self, form, key): """ consistent invalid indexer message """ raise TypeError("cannot do {form} indexing on {klass} with these " "indexers [{key}] of {kind}".format(form=form, - klass=type(self), - key=key, - kind=type(key))) + klass=type(self), + key=key, + kind=type(key))) def get_duplicates(self): from collections import defaultdict @@ -1097,6 +1099,10 @@ def get_duplicates(self): def _cleanup(self): self._engine.clear_mapping() + @cache_readonly + def _constructor(self): + return type(self) + @cache_readonly def _engine(self): # property, for now, slow to look up @@ -1660,7 +1666,7 @@ def union(self, other): def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None - return self.__class__(data=result, name=name) + return self.__class__(result, name=name) def intersection(self, other): """ @@ -2179,9 +2185,8 @@ def reindex(self, target, method=None, level=None, limit=None, # GH7774: preserve dtype/tz if target is empty and not an Index. target = _ensure_has_len(target) # target may be an iterator - if isinstance(self, RangeIndex) and len(target) == 0: - target = self._simple_new(0, 0, 1, name=self.name) - elif not isinstance(target, Index) and len(target) == 0: + + if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop('freq', None) # don't preserve freq target = self._simple_new(None, dtype=self.dtype, **attrs) @@ -2244,9 +2249,9 @@ def _reindex_non_unique(self, target): missing = com._ensure_platform_int(missing) missing_labels = target.take(missing) - missing_indexer = com._ensure_int64(l[~check]) + missing_indexer = _ensure_int64(l[~check]) cur_labels = self.take(indexer[check])._values - cur_indexer = com._ensure_int64(l[check]) + cur_indexer = _ensure_int64(l[check]) new_labels = np.empty(tuple([len(indexer)]), dtype=object) new_labels[cur_indexer] = cur_labels @@ -2465,7 +2470,7 @@ def _get_leaf_sorter(labels): return np.empty(0, dtype='int64') if len(labels) == 1: - lab = com._ensure_int64(labels[0]) + lab = _ensure_int64(labels[0]) sorter, _ = groupsort_indexer(lab, 1 + lab.max()) return sorter @@ -2476,8 +2481,8 @@ def _get_leaf_sorter(labels): tic |= lab[:-1] != lab[1:] starts = np.hstack(([True], tic, [True])).nonzero()[0] - lab = com._ensure_int64(labels[-1]) - return lib.get_level_sorter(lab, com._ensure_int64(starts)) + lab = _ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, _ensure_int64(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): raise TypeError('Join on level between two MultiIndex objects ' @@ -2509,7 +2514,7 @@ def _get_leaf_sorter(labels): join_index = left[left_indexer] else: - left_lev_indexer = com._ensure_int64(left_lev_indexer) + left_lev_indexer = _ensure_int64(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) @@ -2979,6 +2984,7 @@ def invalid_op(self, other=None): invalid_op.__name__ = name return invalid_op + cls.__pow__ = cls.__rpow__ = _make_invalid_op('__pow__') cls.__mul__ = cls.__rmul__ = _make_invalid_op('__mul__') cls.__floordiv__ = cls.__rfloordiv__ = _make_invalid_op('__floordiv__') cls.__truediv__ = cls.__rtruediv__ = _make_invalid_op('__truediv__') @@ -2993,40 +2999,82 @@ def 
_maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. freq) depending on op """ return attrs + def _validate_for_numeric_unaryop(self, op, opstr): + """ validate if we can perform a numeric unary operation """ + + if not self._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op " + "{opstr} for type: {typ}".format( + opstr=opstr, + typ=type(self)) + ) + + def _validate_for_numeric_binop(self, other, op, opstr): + """ + return valid other, evaluate or raise TypeError + if we are not of the appropriate type + + internal method called by ops + """ + from pandas.tseries.offsets import DateOffset + + # if we are an inheritor of numeric, + # but not actually numeric (e.g. DatetimeIndex/PeriodInde) + if not self._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op {opstr} " + "for type: {typ}".format( + opstr=opstr, + typ=type(self)) + ) + + if isinstance(other, Index): + if not other._is_numeric_dtype: + raise TypeError("cannot evaluate a numeric op " + "{opstr} with type: {typ}".format( + opstr=type(self), + typ=type(other)) + ) + elif isinstance(other, np.ndarray) and not other.ndim: + other = other.item() + + if isinstance(other, (Index, ABCSeries, np.ndarray)): + if len(self) != len(other): + raise ValueError("cannot evaluate a numeric op with " + "unequal lengths") + other = _values_from_object(other) + if other.dtype.kind not in ['f', 'i']: + raise TypeError("cannot evaluate a numeric op " + "with a non-numeric dtype") + elif isinstance(other, (DateOffset, np.timedelta64, + Timedelta, datetime.timedelta)): + # higher up to handle + pass + elif isinstance(other, (Timestamp, np.datetime64)): + # higher up to handle + pass + else: + if not (is_float(other) or is_integer(other)): + raise TypeError("can only perform ops with scalar values") + + return other + @classmethod - def _add_numeric_methods(cls): + def _add_numeric_methods_binary(cls): """ add in numeric methods """ def _make_evaluate_binop(op, opstr, reversed=False): def _evaluate_numeric_binop(self, other): - import pandas.tseries.offsets as offsets - - # if we are an inheritor of numeric, but not actually numeric (e.g. 
DatetimeIndex/PeriodInde) - if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op {opstr} for type: {typ}".format(opstr=opstr, - typ=type(self))) - - if isinstance(other, Index): - if not other._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op {opstr} with type: {typ}".format(opstr=type(self), - typ=type(other))) - elif isinstance(other, np.ndarray) and not other.ndim: - other = other.item() - if isinstance(other, (Index, ABCSeries, np.ndarray)): - if len(self) != len(other): - raise ValueError("cannot evaluate a numeric op with unequal lengths") - other = _values_from_object(other) - if other.dtype.kind not in ['f','i']: - raise TypeError("cannot evaluate a numeric op with a non-numeric dtype") - elif isinstance(other, (offsets.DateOffset, np.timedelta64, Timedelta, datetime.timedelta)): + from pandas.tseries.offsets import DateOffset + other = self._validate_for_numeric_binop(other, op, opstr) + + # handle time-based others + if isinstance(other, (DateOffset, np.timedelta64, + Timedelta, datetime.timedelta)): return self._evaluate_with_timedelta_like(other, op, opstr) elif isinstance(other, (Timestamp, np.datetime64)): return self._evaluate_with_datetime_like(other, op, opstr) - else: - if not (is_float(other) or is_integer(other)): - raise TypeError("can only perform ops with scalar values") # if we are a reversed non-communative op values = self.values @@ -3039,28 +3087,18 @@ def _evaluate_numeric_binop(self, other): return _evaluate_numeric_binop - def _make_evaluate_unary(op, opstr): - - def _evaluate_numeric_unary(self): - - # if we are an inheritor of numeric, but not actually numeric (e.g. DatetimeIndex/PeriodInde) - if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op {opstr} for type: {typ}".format(opstr=opstr, - typ=type(self))) - attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) - return Index(op(self.values), **attrs) - - return _evaluate_numeric_unary - cls.__add__ = cls.__radd__ = _make_evaluate_binop( operator.add, '__add__') - cls.__sub__ = _make_evaluate_binop(operator.sub, '__sub__') + cls.__sub__ = _make_evaluate_binop( + operator.sub, '__sub__') cls.__rsub__ = _make_evaluate_binop( operator.sub, '__sub__', reversed=True) cls.__mul__ = cls.__rmul__ = _make_evaluate_binop( operator.mul, '__mul__') - cls.__mod__ = _make_evaluate_binop(operator.mod, '__mod__') + cls.__pow__ = cls.__rpow__ = _make_evaluate_binop( + operator.pow, '__pow__') + cls.__mod__ = _make_evaluate_binop( + operator.mod, '__mod__') cls.__floordiv__ = _make_evaluate_binop( operator.floordiv, '__floordiv__') cls.__rfloordiv__ = _make_evaluate_binop( @@ -3074,11 +3112,32 @@ def _evaluate_numeric_unary(self): operator.div, '__div__') cls.__rdiv__ = _make_evaluate_binop( operator.div, '__div__', reversed=True) + + @classmethod + def _add_numeric_methods_unary(cls): + """ add in numeric unary methods """ + + def _make_evaluate_unary(op, opstr): + + def _evaluate_numeric_unary(self): + + self._validate_for_numeric_unaryop(op, opstr) + attrs = self._get_attributes_dict() + attrs = self._maybe_update_attributes(attrs) + return Index(op(self.values), **attrs) + + return _evaluate_numeric_unary + cls.__neg__ = _make_evaluate_unary(lambda x: -x, '__neg__') cls.__pos__ = _make_evaluate_unary(lambda x: x, '__pos__') cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__') cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__') + @classmethod + def _add_numeric_methods(cls): + cls._add_numeric_methods_unary() + 
cls._add_numeric_methods_binary() + @classmethod def _add_logical_methods(cls): """ add in logical methods """ @@ -3854,8 +3913,8 @@ def _wrap_joined_index(self, joined, other): class RangeIndex(Int64Index): """ - Immutable Index implementing an monotonic range. RangeIndex is a - memory-saving special case of `Int64Index` limited to representing + Immutable Index implementing a monotonic range. RangeIndex is a + memory-saving special case of Int64Index limited to representing monotonic ranges. Parameters @@ -3865,100 +3924,186 @@ class RangeIndex(Int64Index): step : int (default: 1) name : object, optional Name to be stored in the index + copy : bool, default False + Make a copy of input if its a RangeIndex + """ _typ = 'rangeindex' _engine_type = _index.Int64Engine - _attributes = ['name', 'start', 'stop', 'step'] - def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False, **kwargs): + def __new__(cls, start=None, stop=None, step=None, name=None, dtype=None, + fastpath=False, copy=False, **kwargs): + if fastpath: return cls._simple_new(start, stop, step, name=name) - # cheap check for array input - if len(kwargs) > 0: - return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) + cls._validate_dtype(dtype) - # RangeIndex() constructor - if start is None and stop is None and step is None: - return cls._simple_new(0, 0, 1, name=name) + # RangeIndex + if isinstance(start, RangeIndex): + if not copy: + return start + if name is None: + name = getattr(start, 'name', None) + start, stop, step = start._start, start._stop, start._step - new_start, new_stop, new_step = None, None, None - # sort the arguments depending on which are provided - if step is None: - new_step = 1 + # validate the arguments + def _ensure_int(value, field): + try: + new_value = int(value) + except: + new_value = value + + if not is_integer(new_value) or new_value != value: + raise TypeError("RangeIndex(...) must be called with integers," + " {value} was passed for {field}".format( + value=type(value).__name__, + field=field) + ) + + return new_value + + if start is None: + start = 0 + else: + start = _ensure_int(start, 'start') if stop is None: - new_stop = start - new_start = 0 + stop = start + start = 0 + else: + stop = _ensure_int(stop, 'stop') + if step is None: + step = 1 + elif step == 0: + raise ValueError("Step must not be zero") + else: + step = _ensure_int(step, 'step') - try: - # check validity of inputs - new_start = start if new_start is None else new_start - new_stop = stop if new_stop is None else new_stop - new_step = step if new_step is None else new_step - new_start = cls._ensure_int(new_start) - new_stop = cls._ensure_int(new_stop) - new_step = cls._ensure_int(new_step) - if new_step == 0: - raise ValueError("Step must not be zero") - return cls._simple_new(new_start, new_stop, new_step, name) - except TypeError: - # pass all invalid inputs to Int64Index to handle - return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) + return cls._simple_new(start, stop, step, name) @classmethod - def _simple_new(cls, start, stop, step, name=None): + def from_range(cls, data, name=None, dtype=None, **kwargs): + """ create RangeIndex from a range (py3), or xrange (py2) object """ + if not isinstance(data, range): + raise TypeError( + '{0}(...) 
must be called with object coercible to a ' + 'range, {1} was passed'.format(cls.__name__, repr(data))) + + # seems we only have indexing ops to infer + # rather than direct accessors + if len(data) > 1: + step = data[1] - data[0] + stop = data[-1] + step + start = data[0] + elif len(data): + start = data[0] + stop = data[0] + 1 + step = 1 + else: + start = stop = 0 + step = 1 + return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs) + + @classmethod + def _simple_new(cls, start, stop=None, step=None, name=None, + dtype=None, **kwargs): result = object.__new__(cls) + + # handle passed None, non-integers + if start is None or not is_integer(start): + try: + return RangeIndex(start, stop, step, name=name, **kwargs) + except TypeError: + return Index(start, stop, step, name=name, **kwargs) + result._start = start - result._stop = stop - result._step = step + result._stop = stop or 0 + result._step = step or 1 result.name = name + for k, v in compat.iteritems(kwargs): + setattr(result, k, v) + + result._reset_identity() return result - @classmethod - def _data_passthrough(cls, data, dtype, copy, name, fastpath, **kwargs): - kwargs.setdefault('data', data) - kwargs.setdefault('dtype', dtype) - if copy is not None: - kwargs.setdefault('copy', copy) - kwargs.setdefault('name', name) - kwargs.setdefault('fastpath', fastpath) - return Int64Index(**kwargs) + @staticmethod + def _validate_dtype(dtype): + """ require dtype to be None or int64 """ + if not (dtype is None or is_int64_dtype(dtype)): + raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') - @classmethod - def _ensure_int(cls, value): - try: - int_value = int(value) - # don't allow casting 1-element arrays to int! - if int_value != value or hasattr(value, '__len__'): - raise Exception - except Exception: - raise TypeError("Need to pass integral values") - return int_value + @cache_readonly + def _constructor(self): + """ return the class to use for construction """ + return Int64Index @cache_readonly def _data(self): - return np.arange(self.start, self.stop, self.step, dtype=np.int64) + return np.arange(self._start, self._stop, self._step, dtype=np.int64) @cache_readonly def _int64index(self): return Int64Index(self._data, name=self.name, fastpath=True) - @property - def dtype(self): - return np.dtype(np.int64) + def _get_data_as_items(self): + """ return a list of tuples of start, stop, step """ + return [('start', self._start), + ('stop', self._stop), + ('step', self._step)] - @property - def start(self): - return self._start + def __reduce__(self): + d = self._get_attributes_dict() + d.update(dict(self._get_data_as_items())) + return _new_Index, (self.__class__, d), None - @property - def stop(self): - return self._stop + def _format_attrs(self): + """ + Return a list of tuples of the (attr, formatted_value) + """ + attrs = self._get_data_as_items() + if self.name is not None: + attrs.append(('name', default_pprint(self.name))) + return attrs + + def _format_data(self): + # we are formatting thru the attributes + return None + + @cache_readonly + def nbytes(self): + """ return the number of bytes in the underlying data """ + return sum([getsizeof(getattr(self, v)) for v in + ['_start', '_stop', '_step']]) + + def memory_usage(self, deep=False): + """ + Memory usage of my values + + Parameters + ---------- + deep : bool + Introspect the data deeply, interrogate + `object` dtypes for system-level memory consumption + + Returns + ------- + bytes used + + Notes + ----- + Memory usage does not include memory 
consumed by elements that + are not components of the array if deep=False + + See Also + -------- + numpy.ndarray.nbytes + """ + return self.nbytes @property - def step(self): - return self._step + def dtype(self): + return np.dtype(np.int64) @property def is_unique(self): @@ -3970,43 +4115,25 @@ def has_duplicates(self): return False def tolist(self): - return lrange(self.start, self.stop, self.step) + return lrange(self._start, self._stop, self._step) def _shallow_copy(self, values=None, **kwargs): """ create a new Index, don't copy the data, use the same object attributes with passed in attributes taking precedence """ if values is None: - return RangeIndex(self.start, self.stop, self.step, - name=self.name, fastpath=True) + return RangeIndex(name=self.name, fastpath=True, + **dict(self._get_data_as_items())) else: kwargs.setdefault('name', self.name) return self._int64index._shallow_copy(values, **kwargs) - def copy(self, names=None, name=None, dtype=None, deep=False): - """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. - - Parameters - ---------- - name : string, optional - dtype : numpy dtype or pandas type - - Returns - ------- - copy : Index - - Notes - ----- - In most cases, there should be no functional difference from using - ``deep``, but if ``deep`` is passed it will attempt to deepcopy. - """ - if dtype is not None and not is_int64_dtype(dtype): - return super(RangeIndex, self).copy(names, name, dtype, deep) - + @Appender(_index_shared_docs['copy']) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + self._validate_dtype(dtype) if name is None: name = self.name - return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) + return RangeIndex(name=name, fastpath=True, + **dict(self._get_data_as_items())) def argsort(self, *args, **kwargs): """ @@ -4016,48 +4143,10 @@ def argsort(self, *args, **kwargs): -------- numpy.ndarray.argsort """ - if self.step > 0: + if self._step > 0: return np.arange(len(self)) else: - return np.arange(len(self)-1, -1, -1) - - def __repr__(self): - attrs = [('start', default_pprint(self.start)), - ('stop', default_pprint(self.stop)), - ('step', default_pprint(self.step)), - ('name', default_pprint(self.name))] - - prepr = u(", ").join([u("%s=%s") % (k, v) - for k, v in attrs]) - res = u("%s(%s)") % (self.__class__.__name__, prepr) - - if not compat.PY3: - # needs to be str in Python 2 - encoding = get_option('display.encoding') - res = res.encode(encoding) - return res - - def __unicode__(self): - """ - Return a string representation for this object. - - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. 
- """ - if self.start != 0 or self.step != 1: - start = u('%s, ') % default_pprint(self.start) - else: - start = u('') - stop = default_pprint(self.stop) - step = u('') if self.step == 1 else u(', %s') % default_pprint(self.step) - if self.name is None: - name = u('') - else: - name = u(', name=%s') % default_pprint(self.name) - - res = u("%s(%s%s%s%s)") % (self.__class__.__name__, - start, stop, step, name) - return res + return np.arange(len(self) - 1, -1, -1) def equals(self, other): """ @@ -4065,26 +4154,13 @@ def equals(self, other): """ if isinstance(other, RangeIndex): return (len(self) == len(other) == 0 - or (self.start == other.start and - self.stop == other.stop and - self.step == other.step) + or (self._start == other._start and + self._stop == other._stop and + self._step == other._step) ) return super(RangeIndex, self).equals(other) - def __reduce__(self): - d = self._get_attributes_dict() - return _new_Index, (self.__class__, d), None - - def view(self, cls=None): - if cls is None or hasattr(cls,'_typ') or is_int64_dtype(cls): - result = self._shallow_copy() - else: - result = self._data.view(cls) - if isinstance(result, Index): - result._id = self._id - return result - def intersection(self, other): """ Form the intersection of two Index objects. Sortedness of the result is @@ -4103,26 +4179,28 @@ def intersection(self, other): # check whether intervals intersect # deals with in- and decreasing ranges - int_low = max(min(self.start, self.stop+1), - min(other.start, other.stop+1)) - int_high = min(max(self.stop, self.start+1), - max(other.stop, other.start+1)) + int_low = max(min(self._start, self._stop + 1), + min(other._start, other._stop + 1)) + int_high = min(max(self._stop, self._start + 1), + max(other._stop, other._start + 1)) if int_high <= int_low: return RangeIndex() - ### Method hint: linear Diophantine equation + # Method hint: linear Diophantine equation # solve intersection problem - # performance hint: for identical step sizes, could use cheaper alternative - gcd, s, t = self._extended_gcd(self.step, other.step) + # performance hint: for identical step sizes, could use + # cheaper alternative + gcd, s, t = self._extended_gcd(self._step, other._step) # check whether element sets intersect - if (self.start - other.start) % gcd: + if (self._start - other._start) % gcd: return RangeIndex() - # calculate parameters for the RangeIndex describing the intersection - # disregarding the lower bounds - tmp_start = self.start + (other.start-self.start)*self.step//gcd*s - new_step = self.step * other.step // gcd + # calculate parameters for the RangeIndex describing the + # intersection disregarding the lower bounds + tmp_start = self._start + (other._start - self._start) * \ + self._step // gcd * s + new_step = self._step * other._step // gcd new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) # adjust index to limiting interval @@ -4131,15 +4209,15 @@ def intersection(self, other): def _min_fitting_element(self, lower_limit): """Returns the value of the smallest element greater than the limit""" - round = ceil if self.step > 0 else floor - no_steps = round( (float(lower_limit)-self.start) / self.step ) - return self.start + self.step * no_steps + round = ceil if self._step > 0 else floor + no_steps = round((float(lower_limit) - self._start) / self._step) + return self._start + self._step * no_steps def _max_fitting_element(self, upper_limit): """Returns the value of the largest element smaller than the limit""" - round = floor if self.step > 0 else ceil 
- no_steps = round( (float(upper_limit)-self.start) / self.step ) - return self.start + self.step * no_steps + round = floor if self._step > 0 else ceil + no_steps = round((float(upper_limit) - self._start) / self._step) + return self._start + self._step * no_steps def _extended_gcd(self, a, b): """ @@ -4197,26 +4275,11 @@ def join(self, other, how='left', level=None, return_indexers=False): return super(RangeIndex, self).join(other, how, level, return_indexers) - def _mul(self, other): - "__mul__() implementation" - try: - int_input = other == int(other) - if int_input: - other = int(other) - except Exception: - int_input = False - - if int_input == True and other != 0: - return RangeIndex(self.start*other, self.stop*other, self.step*other, - fastpath=True) - else: - return super(RangeIndex, self).__mul__(other) - def __len__(self): """ return the length of the RangeIndex """ - return max(0, (self.stop-self.start) // self.step) + return max(0, -(-(self._stop - self._start) // self._step)) @property def size(self): @@ -4234,40 +4297,171 @@ def __getitem__(self, key): return super_getitem(key) if n < 0: n = len(self) + key - if n < 0 or n > len(self)-1: - raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) - return self.start + n * self.step + if n < 0 or n > len(self) - 1: + raise IndexError("index {key} is out of bounds for axis 0 " + "with size {size}".format(key=key, + size=len(self))) + return self._start + n * self._step if isinstance(key, slice): + # This is basically PySlice_GetIndicesEx, but delegation to our + # super routines if we don't have integers + + l = len(self) + # complete missing slice information - n_start = 0 if key.start is None else key.start - n_stop = len(self)+1 if key.stop is None else key.stop - n_step = 1 if key.step is None else key.step + step = 1 if key.step is None else key.step + if key.start is None: + start = l - 1 if step < 0 else 0 + else: + start = key.start - # delegate non-integer slices - if (n_start != int(n_start) and - n_stop != int(n_stop) and - n_step != int(n_step)): - return super_getitem(key) + if start < 0: + start += l + if start < 0: + start = -1 if step < 0 else 0 + if start >= l: + start = l - 1 if step < 0 else l - # deal with index wrap-around - n_start = len(self)+n_start if n_start < 0 else n_start - n_stop = len(self)+n_stop if n_stop < 0 else n_stop + if key.stop is None: + stop = -1 if step < 0 else l + else: + stop = key.stop + if stop < 0: + stop += l + if stop < 0: + stop = -1 + if stop > l: + stop = l + + # delegate non-integer slices + if (start != int(start) and + stop != int(stop) and + step != int(step)): + return super_getitem(key) # convert indexes to values - start = self.start + self.step * start - stop = self.start + self.step * stop - step = self.step * step + start = self._start + self._step * start + stop = self._start + self._step * stop + step = self._step * step return RangeIndex(start, stop, step, self.name, fastpath=True) # fall back to Int64Index return super_getitem(key) + @classmethod + def _add_numeric_methods_binary(cls): + """ add in numeric methods, specialized to RangeIndex """ + + def _make_evaluate_binop(op, opstr, reversed=False, step=False): + """ + Parameters + ---------- + op : callable that accepts 2 parms + perform the binary op + opstr : string + string name of ops + reversed : boolean, default False + if this is a reversed op, e.g. 
radd + step : callable, optional, default False + op to apply to the step param if not None + if False, use the existing step + """ + + def _evaluate_numeric_binop(self, other): + + other = self._validate_for_numeric_binop(other, op, opstr) + attrs = self._get_attributes_dict() + attrs = self._maybe_update_attributes(attrs) + + if reversed: + self, other = other, self + + try: + # apply if we have an override + if step: + rstep = step(self._step, other) + + # we don't have a representable op + # so return a base index + if not is_integer(rstep): + raise ValueError + + else: + rstep = self._step + + rstart = op(self._start, other) + rstop = op(self._stop, other) + + result = RangeIndex(rstart, + rstop, + rstep, + **attrs) + + # for compat with numpy / Int64Index + # even if we can represent as a RangeIndex, return + # as a Float64Index if we have float-like descriptors + if not all([is_integer(x) for x in + [rstart, rstop, rstep]]): + result = result.astype('float64') + + return result + + except (ValueError, TypeError, AttributeError): + pass + + # convert to Int64Index ops + if isinstance(self, RangeIndex): + self = self.values + if isinstance(other, RangeIndex): + other = other.values + + return Index(op(self, other), **attrs) + + return _evaluate_numeric_binop + + cls.__add__ = cls.__radd__ = _make_evaluate_binop( + operator.add, '__add__') + cls.__sub__ = _make_evaluate_binop(operator.sub, '__sub__') + cls.__rsub__ = _make_evaluate_binop( + operator.sub, '__sub__', reversed=True) + cls.__mul__ = cls.__rmul__ = _make_evaluate_binop( + operator.mul, + '__mul__', + step=operator.mul) + cls.__floordiv__ = _make_evaluate_binop( + operator.floordiv, + '__floordiv__', + step=operator.floordiv) + cls.__rfloordiv__ = _make_evaluate_binop( + operator.floordiv, + '__floordiv__', + reversed=True, + step=operator.floordiv) + cls.__truediv__ = _make_evaluate_binop( + operator.truediv, + '__truediv__', + step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop( + operator.truediv, + '__truediv__', + reversed=True, + step=operator.truediv) + if not compat.PY3: + cls.__div__ = _make_evaluate_binop( + operator.div, + '__div__', + step=operator.div) + cls.__rdiv__ = _make_evaluate_binop( + operator.div, + '__div__', + reversed=True, + step=operator.div) + RangeIndex._add_numeric_methods() -RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul RangeIndex._add_logical_methods() @@ -5101,11 +5295,9 @@ def get_level_values(self, level): num = self._get_level_number(level) unique = self.levels[num] # .values labels = self.labels[num] - filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) - if isinstance(unique, RangeIndex): - _simple_new = Int64Index._simple_new - else: - _simple_new = unique._simple_new + filled = com.take_1d(unique.values, labels, + fill_value=unique._na_value) + _simple_new = unique._simple_new values = _simple_new(filled, self.names[num], freq=getattr(unique, 'freq', None), tz=getattr(unique, 'tz', None)) @@ -6187,7 +6379,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # a partial date slicer on a DatetimeIndex generates a slice # note that the stop ALREADY includes the stopped point (if # it was a string sliced) - return convert_indexer(start.start,stop.stop,step) + return convert_indexer(start.start, stop.stop, step) elif level > 0 or self.lexsort_depth == 0 or step is not None: # need to have like semantics here to right diff --git a/pandas/core/series.py b/pandas/core/series.py index ed5b9093681f1..73e645039506f 100644 
--- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -110,7 +110,7 @@ class Series(base.IndexOpsMixin, strings.StringAccessorMixin, generic.NDFrame,): index : array-like or Index (1d) Values must be unique and hashable, same length as data. Index object (or other iterable of same length as data) Will default to - np.arange(len(data)) if not provided. If both a dict and index + RangeIndex(len(data)) if not provided. If both a dict and index sequence are used, the index will override the keys found in the dict. dtype : numpy.dtype or None @@ -920,7 +920,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): resetted : DataFrame, or Series if drop == True """ if drop: - new_index = np.arange(len(self)) + new_index = _default_index(len(self)) if level is not None and isinstance(self.index, MultiIndex): if not isinstance(level, (tuple, list)): level = [level] @@ -1706,7 +1706,7 @@ def _try_kind_sort(arr): bad = isnull(arr) good = ~bad - idx = np.arange(len(self)) + idx = _default_index(len(self)) argsorted = _try_kind_sort(arr[good]) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 57d5bd44a681d..0ba1254659540 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,8 +49,8 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, DatetimeIndex, Float64Index, - NaT + Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, + DatetimeIndex, Float64Index, NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex @@ -277,9 +277,9 @@ def encode(obj): return {'typ': 'range_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), - 'start': getattr(obj, 'start', None), - 'stop': getattr(obj, 'stop', None), - 'step': getattr(obj, 'step', None)} + 'start': getattr(obj, '_start', None), + 'stop': getattr(obj, '_stop', None), + 'step': getattr(obj, '_step', None)} elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, @@ -472,7 +472,10 @@ def decode(obj): obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) elif typ == 'range_index': - return globals()[obj['klass']](obj['start'], obj['stop'], obj['step'], name=obj['name']) + return globals()[obj['klass']](obj['start'], + obj['stop'], + obj['step'], + name=obj['name']) elif typ == 'multi_index': dtype = dtype_for(obj['dtype']) data = unconvert(obj['data'], dtype, diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 5f41a803538e6..1690667ef743b 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -729,7 +729,7 @@ def test_misc_example(self): DataFrame\\.index values are different \\(100\\.0 %\\) \\[left\\]: Index\\(\\[u?'a', u?'b'\\], dtype='object'\\) -\\[right\\]: Int64Index\\(\\[0, 1\\], dtype='int64'\\)""" +\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)""" with tm.assertRaisesRegexp(AssertionError, error_msg): assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 61b24c858b60d..bdbcb9c0d0d3e 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -253,6 +253,7 @@ def setUp(self): 'string': tm.makeStringIndex(100), 'date': tm.makeDateIndex(100), 'int': tm.makeIntIndex(100), + 'rng': 
tm.makeRangeIndex(100), 'float': tm.makeFloatIndex(100), 'empty': Index([]), 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])), diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index a17d94d1f3ebf..38f5150516551 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,8 +10,10 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, - date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) +from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, + RangeIndex, Categorical, bdate_range, + date_range, timedelta_range, Index, DatetimeIndex, + isnull) from pandas.compat import is_platform_windows, PY3, PY35 from pandas.io.pytables import _tables, TableIterator @@ -1619,40 +1621,51 @@ def test_column_multiindex(self): # GH 4710 # recreate multi-indexes properly - index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) - df = DataFrame(np.arange(12).reshape(3,4), columns=index) + index = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), + ('B', 'a'), ('B', 'b')], + names=['first', 'second']) + df = DataFrame(np.arange(12).reshape(3, 4), columns=index) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: - store.put('df',df) - tm.assert_frame_equal(store['df'],expected,check_index_type=True,check_column_type=True) + store.put('df', df) + tm.assert_frame_equal(store['df'], expected, + check_index_type=True, + check_column_type=True) - store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) + store.put('df1', df, format='table') + tm.assert_frame_equal(store['df1'], expected, + check_index_type=True, + check_column_type=True) - self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) - self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) + self.assertRaises(ValueError, store.put, 'df2', df, + format='table', data_columns=['A']) + self.assertRaises(ValueError, store.put, 'df3', df, + format='table', data_columns=True) # appending multi-column on existing table (see GH 6167) with ensure_clean_store(self.path) as store: store.append('df2', df) store.append('df2', df) - tm.assert_frame_equal(store['df2'], concat((df,df))) + tm.assert_frame_equal(store['df2'], concat((df, df))) # non_index_axes name - df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) + df = DataFrame(np.arange(12).reshape(3, 4), + columns=Index(list('ABCD'), name='foo')) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) - + with ensure_clean_store(self.path) as store: - store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) + store.put('df1', df, format='table') + tm.assert_frame_equal(store['df1'], expected, + check_index_type=True, + check_column_type=True) def test_store_multiindex(self): @@ -2484,11 +2497,6 @@ def test_backwards_compat_without_term_object(self): expected = wp.loc[:, [Timestamp('20000102'), Timestamp('20000103')]] assert_panel_equal(result, expected) - with assert_produces_warning(expected_warning=FutureWarning, - check_stacklevel=False): - result = store.select('wp', [('minor_axis', '=', ['A', 
'B'])]) - expected = wp.loc[:, :, ['A', 'B']] - assert_panel_equal(result, expected) def test_same_name_scoping(self): diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index be6e11ce70c76..892fee77eb177 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -179,8 +179,8 @@ cdef class SeriesBinGrouper: if not values.flags.c_contiguous: values = values.copy('C') self.arr = values - self.typ = type(series) - self.ityp = type(series.index) + self.typ = series._constructor + self.ityp = series.index._constructor self.index = series.index.values self.name = getattr(series,'name',None) @@ -306,8 +306,8 @@ cdef class SeriesGrouper: if not values.flags.c_contiguous: values = values.copy('C') self.arr = values - self.typ = type(series) - self.ityp = type(series.index) + self.typ = series._constructor + self.ityp = series.index._constructor self.index = series.index.values self.name = getattr(series,'name',None) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index a458445081be5..c5c005beeb69e 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -344,10 +344,13 @@ def test_info_memory_usage(self): data[i] = np.random.randint(2, size=n).astype(dtype) df = DataFrame(data) df.columns = dtypes + # Ensure df size is as expected + # (cols * rows * bytes) + index size df_size = df.memory_usage().sum() - exp_size = (len(dtypes) + 1) * n * 8 # (cols + index) * rows * bytes + exp_size = len(dtypes) * n * 8 + df.index.nbytes self.assertEqual(df_size, exp_size) + # Ensure number of cols in memory_usage is the same as df size_df = np.size(df.columns.values) + 1 # index=True; default self.assertEqual(size_df, np.size(df.memory_usage())) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 27a6f73012039..fc6617a319424 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2,7 +2,9 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta, time -from pandas.compat import range, lrange, lzip, u, zip, PY3 +from pandas import compat +from pandas.compat import (long, is_platform_windows, range, + lrange, lzip, u, zip, PY3) import operator import re import nose @@ -12,19 +14,18 @@ import numpy as np from pandas import (period_range, date_range, Categorical, Series, - Index, Float64Index, Int64Index, RangeIndex, MultiIndex, - CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) -from pandas.core.index import InvalidIndexError, NumericIndex + DataFrame, Index, Float64Index, Int64Index, RangeIndex, + MultiIndex, CategoricalIndex, DatetimeIndex, + TimedeltaIndex, PeriodIndex) +from pandas.core.index import InvalidIndexError from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) -from pandas import compat -from pandas.compat import long, is_platform_windows + import pandas.util.testing as tm import pandas.core.config as cf from pandas.tseries.index import _to_m8 -import pandas.tseries.offsets as offsets import pandas as pd from pandas.lib import Timestamp @@ -90,33 +91,34 @@ def test_numeric_compat(self): idx = self.create_index() tm.assertRaisesRegexp(TypeError, "cannot perform __mul__", - lambda : idx * 1) + lambda: idx * 1) tm.assertRaisesRegexp(TypeError, "cannot perform __mul__", - lambda : 1 * idx) + lambda: 1 * idx) - div_err = "cannot perform __truediv__" if compat.PY3 else "cannot perform __div__" + div_err = "cannot perform __truediv__" if PY3 \ + else "cannot perform __div__" 
tm.assertRaisesRegexp(TypeError, div_err, - lambda : idx / 1) + lambda: idx / 1) tm.assertRaisesRegexp(TypeError, div_err, - lambda : 1 / idx) + lambda: 1 / idx) tm.assertRaisesRegexp(TypeError, "cannot perform __floordiv__", - lambda : idx // 1) + lambda: idx // 1) tm.assertRaisesRegexp(TypeError, "cannot perform __floordiv__", - lambda : 1 // idx) + lambda: 1 // idx) def test_logical_compat(self): idx = self.create_index() tm.assertRaisesRegexp(TypeError, 'cannot perform all', - lambda : idx.all()) + lambda: idx.all()) tm.assertRaisesRegexp(TypeError, 'cannot perform any', - lambda : idx.any()) + lambda: idx.any()) def test_boolean_context_compat(self): @@ -467,6 +469,10 @@ def test_delete_base(self): if not len(idx): continue + if isinstance(idx, RangeIndex): + # tested in class + continue + expected = idx[1:] result = idx.delete(0) self.assertTrue(result.equals(expected)) @@ -673,18 +679,19 @@ class TestIndex(Base, tm.TestCase): def setUp(self): self.indices = dict( - unicodeIndex = tm.makeUnicodeIndex(100), - strIndex = tm.makeStringIndex(100), - dateIndex = tm.makeDateIndex(100), - periodIndex = tm.makePeriodIndex(100), - tdIndex = tm.makeTimedeltaIndex(100), - intIndex = tm.makeIntIndex(100), - floatIndex = tm.makeFloatIndex(100), - boolIndex = Index([True,False]), - catIndex = tm.makeCategoricalIndex(100), - empty = Index([]), - tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], - [1, 2, 3])) + unicodeIndex=tm.makeUnicodeIndex(100), + strIndex=tm.makeStringIndex(100), + dateIndex=tm.makeDateIndex(100), + periodIndex=tm.makePeriodIndex(100), + tdIndex=tm.makeTimedeltaIndex(100), + intIndex=tm.makeIntIndex(100), + rangeIndex=tm.makeIntIndex(100), + floatIndex=tm.makeFloatIndex(100), + boolIndex=Index([True, False]), + catIndex=tm.makeCategoricalIndex(100), + empty=Index([]), + tuples=MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) ) self.setup_indices() @@ -1065,7 +1072,6 @@ def test_empty_fancy(self): # be tested separately. 
for idx in [self.strIndex, self.intIndex, self.floatIndex]: empty_idx = idx.__class__([]) - values = idx.values self.assertTrue(idx[[]].identical(empty_idx)) self.assertTrue(idx[empty_iarr].identical(empty_idx)) @@ -2382,18 +2388,18 @@ def test_repr_roundtrip(self): ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) str(ci) - tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + tm.assert_index_equal(eval(repr(ci)), ci, exact=True) # formatting - if compat.PY3: + if PY3: str(ci) else: compat.text_type(ci) # long format # this is not reprable - ci = CategoricalIndex(np.random.randint(0,5,size=100)) - if compat.PY3: + ci = CategoricalIndex(np.random.randint(0, 5, size=100)) + if PY3: str(ci) else: compat.text_type(ci) @@ -2636,7 +2642,8 @@ def test_fillna_categorical(self): self.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - with tm.assertRaisesRegexp(ValueError, 'fill value must be in categories'): + with tm.assertRaisesRegexp(ValueError, + 'fill value must be in categories'): idx.fillna(2.0) @@ -2644,42 +2651,56 @@ class Numeric(Base): def test_numeric_compat(self): - idx = self._holder(np.arange(5,dtype='int64')) - didx = self._holder(np.arange(5,dtype='int64')**2 - ) + idx = self.create_index() + didx = idx * idx + result = idx * 1 tm.assert_index_equal(result, idx) result = 1 * idx tm.assert_index_equal(result, idx) - result = idx * idx - tm.assert_index_equal(result, didx) + # in general not true for RangeIndex + if not isinstance(idx, RangeIndex): + result = idx * idx + tm.assert_index_equal(result, idx ** 2) + # truediv under PY3 result = idx / 1 - tm.assert_index_equal(result, idx) + expected = idx + if PY3: + expected = expected.astype('float64') + tm.assert_index_equal(result, expected) + + result = idx / 2 + if PY3: + expected = expected.astype('float64') + expected = Index(idx.values / 2) + tm.assert_index_equal(result, expected) result = idx // 1 tm.assert_index_equal(result, idx) - result = idx * np.array(5,dtype='int64') - tm.assert_index_equal(result, self._holder(np.arange(5,dtype='int64')*5)) + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, idx * 5) - result = idx * np.arange(5,dtype='int64') + result = idx * np.arange(5, dtype='int64') tm.assert_index_equal(result, didx) - result = idx * Series(np.arange(5,dtype='int64')) + result = idx * Series(np.arange(5, dtype='int64')) tm.assert_index_equal(result, didx) - result = idx * Series(np.arange(5,dtype='float64')+0.1) - tm.assert_index_equal(result, - Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) + result = idx * Series(np.arange(5, dtype='float64') + 0.1) + expected = Float64Index(np.arange(5, dtype='float64') * ( + np.arange(5, dtype='float64') + 0.1)) + tm.assert_index_equal(result, expected) # invalid - self.assertRaises(TypeError, lambda : idx * date_range('20130101',periods=5)) - self.assertRaises(ValueError, lambda : idx * self._holder(np.arange(3))) - self.assertRaises(ValueError, lambda : idx * np.array([1,2])) - + self.assertRaises(TypeError, lambda: idx * date_range('20130101', + periods=5) + ) + self.assertRaises(ValueError, lambda: idx * idx[0:3]) + self.assertRaises(ValueError, lambda: idx * np.array([1, 2])) def test_explicit_conversions(self): @@ -2942,11 +2963,11 @@ def test_fillna_float64(self): self.assert_index_equal(idx.fillna(0.1), exp) # downcast - exp = Int64Index([1, 2, 3], name='x') + exp = Float64Index([1.0, 2.0, 3.0], name='x') self.assert_index_equal(idx.fillna(2), exp) 
# object - exp = Index([1, 'obj', 3], name='x') + exp = Index([1.0, 'obj', 3.0], name='x') self.assert_index_equal(idx.fillna('obj'), exp) @@ -3358,7 +3379,6 @@ def test_take_preserve_name(self): self.assertEqual(index.name, taken.name) def test_int_name_format(self): - from pandas import Series, DataFrame index = Index(['a', 'b', 'c'], name=0) s = Series(lrange(3), index) df = DataFrame(lrange(3), index=index) @@ -3382,14 +3402,14 @@ def test_repr_roundtrip(self): def test_unicode_string_with_unicode(self): idx = Index(lrange(1000)) - if compat.PY3: + if PY3: str(idx) else: compat.text_type(idx) def test_bytestring_with_unicode(self): idx = Index(lrange(1000)) - if compat.PY3: + if PY3: bytes(idx) else: str(idx) @@ -3399,44 +3419,46 @@ def test_slice_keep_name(self): self.assertEqual(idx.name, idx[1:].name) def test_ufunc_coercions(self): - idx = pd.Int64Index([1, 2, 3, 4, 5], name='x') + idx = Int64Index([1, 2, 3, 4, 5], name='x') result = np.sqrt(idx) tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') + exp = Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') tm.assert_index_equal(result, exp) result = np.divide(idx, 2.) tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') tm.assert_index_equal(result, exp) # _evaluate_numeric_binop result = idx + 2. tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index([3., 4., 5., 6., 7.], name='x') + exp = Float64Index([3., 4., 5., 6., 7.], name='x') tm.assert_index_equal(result, exp) result = idx - 2. tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index([-1., 0., 1., 2., 3.], name='x') + exp = Float64Index([-1., 0., 1., 2., 3.], name='x') tm.assert_index_equal(result, exp) result = idx * 1. tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index([1., 2., 3., 4., 5.], name='x') + exp = Float64Index([1., 2., 3., 4., 5.], name='x') tm.assert_index_equal(result, exp) result = idx / 2. 
tm.assertIsInstance(result, Float64Index) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') tm.assert_index_equal(result, exp) + class TestRangeIndex(Numeric, tm.TestCase): _holder = RangeIndex + _compat_props = ['shape', 'ndim', 'size', 'itemsize'] def setUp(self): - self.indices = dict(index = RangeIndex(0, 20, 2)) + self.indices = dict(index=RangeIndex(0, 20, 2, name='foo')) self.setup_indices() def create_index(self): @@ -3450,37 +3472,149 @@ def testit(): def test_constructor(self): index = RangeIndex(5) expected = np.arange(5, dtype=np.int64) - tm.assert_isinstance(index, RangeIndex) - self.assertEqual(index.start, 0) - self.assertEqual(index.stop, 5) - self.assertEqual(index.step, 1) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._start, 0) + self.assertEqual(index._stop, 5) + self.assertEqual(index._step, 1) self.assertEqual(index.name, None) - self.assert_numpy_array_equal(index, expected) + tm.assert_index_equal(Index(expected), index) index = RangeIndex(1, 5) expected = np.arange(1, 5, dtype=np.int64) - tm.assert_isinstance(index, RangeIndex) - self.assertEqual(index.start, 1) - self.assert_numpy_array_equal(index, expected) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._start, 1) + tm.assert_index_equal(Index(expected), index) index = RangeIndex(1, 5, 2) expected = np.arange(1, 5, 2, dtype=np.int64) - tm.assert_isinstance(index, RangeIndex) - self.assertEqual(index.step, 2) - self.assert_numpy_array_equal(index, expected) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._step, 2) + tm.assert_index_equal(Index(expected), index) index = RangeIndex() expected = np.empty(0, dtype=np.int64) - tm.assert_isinstance(index, RangeIndex) - self.assertEqual(index.start, 0) - self.assertEqual(index.stop, 0) - self.assertEqual(index.step, 1) - self.assert_numpy_array_equal(index, expected) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._start, 0) + self.assertEqual(index._stop, 0) + self.assertEqual(index._step, 1) + tm.assert_index_equal(Index(expected), index) index = RangeIndex(name='Foo') - tm.assert_isinstance(index, RangeIndex) + self.assertIsInstance(index, RangeIndex) self.assertEqual(index.name, 'Foo') + # we don't allow on a bare Index + self.assertRaises(TypeError, lambda: Index(0, 1000)) + + # invalid args + for i in [Index(['a', 'b']), + Series(['a', 'b']), + np.array(['a', 'b']), + [], + 'foo', + datetime(2000, 1, 1, 0, 0), + np.arange(0, 10)]: + self.assertRaises(TypeError, lambda: RangeIndex(i)) + + def test_constructor_same(self): + + # pass thru w and w/o copy + index = RangeIndex(1, 5, 2) + result = RangeIndex(index, copy=False) + self.assertTrue(result.identical(index)) + + result = RangeIndex(index, copy=True) + self.assertTrue(result.equals(index)) + + result = RangeIndex(index) + self.assertTrue(result.equals(index)) + + self.assertRaises(TypeError, + lambda: RangeIndex(index, dtype='float64')) + + def test_constructor_range(self): + + self.assertRaises(TypeError, lambda: RangeIndex(range(1, 5, 2))) + + result = RangeIndex.from_range(range(1, 5, 2)) + expected = RangeIndex(1, 5, 2) + self.assertTrue(result.equals(expected)) + + result = RangeIndex.from_range(range(5, 6)) + expected = RangeIndex(5, 6, 1) + self.assertTrue(result.equals(expected)) + + # an invalid range + result = RangeIndex.from_range(range(5, 1)) + expected = RangeIndex(0, 0, 1) + self.assertTrue(result.equals(expected)) + + result = 
RangeIndex.from_range(range(5)) + expected = RangeIndex(0, 5, 1) + self.assertTrue(result.equals(expected)) + + result = Index(range(1, 5, 2)) + expected = RangeIndex(1, 5, 2) + self.assertTrue(result.equals(expected)) + + self.assertRaises(TypeError, + lambda: Index(range(1, 5, 2), dtype='float64')) + + def test_numeric_compat2(self): + # validate that we are handling the RangeIndex overrides to numeric ops + # and returning RangeIndex where possible + + idx = RangeIndex(0, 10, 2) + + result = idx * 2 + expected = RangeIndex(0, 20, 4) + self.assertTrue(result.equals(expected)) + + result = idx + 2 + expected = RangeIndex(2, 12, 2) + self.assertTrue(result.equals(expected)) + + result = idx - 2 + expected = RangeIndex(-2, 8, 2) + self.assertTrue(result.equals(expected)) + + # truediv under PY3 + result = idx / 2 + if PY3: + expected = RangeIndex(0, 5, 1) + else: + expected = RangeIndex(0, 5, 1).astype('float64') + self.assertTrue(result.equals(expected)) + + result = idx / 4 + expected = RangeIndex(0, 10, 2).values / 4 + self.assertTrue(result.equals(expected)) + + result = idx // 1 + tm.assert_index_equal(result, idx, exact=True) + + # __mul__ + result = idx * idx + expected = Index(idx.values * idx.values) + tm.assert_index_equal(result, expected, exact=True) + + # __pow__ + idx = RangeIndex(0, 1000, 2) + result = idx ** 2 + expected = idx._int64index ** 2 + tm.assert_index_equal(Index(result.values), expected, exact=True) + + # __floordiv__ + idx = RangeIndex(0, 1000, 2) + result = idx // 2 + expected = RangeIndex(0, 500, 1) + tm.assert_index_equal(result, expected, exact=True) + + idx = RangeIndex(0, 1000, 1) + result = idx // 2 + expected = idx._int64index // 2 + tm.assert_index_equal(result, expected, exact=True) + def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) index = RangeIndex(1, 5) @@ -3491,16 +3625,66 @@ def test_constructor_corner(self): self.assertRaises(TypeError, RangeIndex, '1', '10', '1') self.assertRaises(TypeError, RangeIndex, 1.1, 10.2, 1.3) + # invalid passed type + self.assertRaises(TypeError, + lambda: RangeIndex(1, 5, dtype='float64')) + def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() self.assertTrue(i_copy is not i) self.assertTrue(i_copy.identical(i)) - self.assertEqual(i_copy.start, 0) - self.assertEqual(i_copy.stop, 5) - self.assertEqual(i_copy.step, 1) + self.assertEqual(i_copy._start, 0) + self.assertEqual(i_copy._stop, 5) + self.assertEqual(i_copy._step, 1) self.assertEqual(i_copy.name, 'Foo') + def test_repr(self): + i = RangeIndex(5, name='Foo') + result = repr(i) + if PY3: + expected = "RangeIndex(start=0, stop=5, step=1, name='Foo')" + else: + expected = "RangeIndex(start=0, stop=5, step=1, name=u'Foo')" + self.assertEqual(result, expected) + + result = eval(result) + self.assertTrue(result.equals(i)) + + i = RangeIndex(5, 0, -1) + result = repr(i) + expected = "RangeIndex(start=5, stop=0, step=-1)" + self.assertEqual(result, expected) + + result = eval(result) + self.assertTrue(result.equals(i)) + + def test_insert(self): + + idx = RangeIndex(5, name='Foo') + result = idx[1:4] + + # test 0th element + self.assertTrue(idx[0:4].equals( + result.insert(0, idx[0]))) + + def test_delete(self): + + idx = RangeIndex(5, name='Foo') + expected = idx[1:].astype(int) + result = idx.delete(0) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, expected.name) + + expected = idx[:-1].astype(int) + result = idx.delete(-1) + self.assertTrue(result.equals(expected)) + self.assertEqual(result.name, 
expected.name) + + with tm.assertRaises((IndexError, ValueError)): + # either depending on numpy version + result = idx.delete(len(idx)) + def test_view(self): super(TestRangeIndex, self).test_view() @@ -3509,15 +3693,11 @@ def test_view(self): self.assertEqual(i_view.name, 'Foo') i_view = i.view('i8') - tm.assert_index_equal(i, i_view) + tm.assert_numpy_array_equal(i, i_view) i_view = i.view(RangeIndex) tm.assert_index_equal(i, i_view) - def test_index_constructor(self): - arr = Index(0, 5) - tm.assert_isinstance(arr, RangeIndex) - def test_dtype(self): self.assertEqual(self.index.dtype, np.int64) @@ -3536,6 +3716,9 @@ def test_is_monotonic(self): self.assertTrue(index.is_monotonic_decreasing) def test_equals(self): + + if isinstance(self.index, RangeIndex): + raise nose.SkipTest("RangeIndex does not accept dtype=object") same_values = Index(self.index, dtype=object) self.assertTrue(self.index.equals(same_values)) self.assertTrue(same_values.equals(self.index)) @@ -3549,6 +3732,10 @@ def test_identical(self): i = Index(self.index.copy()) self.assertTrue(i.identical(self.index)) + # we don't allow object dtype for RangeIndex + if isinstance(self.index, RangeIndex): + return + same_values_different_type = Index(i, dtype=object) self.assertFalse(i.identical(same_values_different_type)) @@ -3584,7 +3771,7 @@ def test_get_indexer_backfill(self): self.assert_numpy_array_equal(indexer, expected) def test_join_outer(self): - ### join with Int64Index + # join with Int64Index other = Int64Index(np.arange(25, 14, -1)) res, lidx, ridx = self.index.join(other, how='outer', @@ -3592,19 +3779,20 @@ def test_join_outer(self): noidx_res = self.index.join(other, how='outer') self.assertTrue(res.equals(noidx_res)) - eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) - elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, -1, -1, -1, -1, -1, -1, -1], - dtype=np.int64) - eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], - dtype=np.int64) + eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25]) + elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, + -1, -1, -1, -1, -1, -1, -1], dtype=np.int64) + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, + 5, 4, 3, 2, 1, 0], dtype=np.int64) - tm.assert_isinstance(res, Int64Index) + self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) - ### join with RangeIndex + # join with RangeIndex other = RangeIndex(25, 14, -1) res, lidx, ridx = self.index.join(other, how='outer', @@ -3612,14 +3800,14 @@ def test_join_outer(self): noidx_res = self.index.join(other, how='outer') self.assertTrue(res.equals(noidx_res)) - tm.assert_isinstance(res, Int64Index) + self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) def test_join_inner(self): - ### Join with non-RangeIndex + # Join with non-RangeIndex other = Int64Index(np.arange(25, 14, -1)) res, lidx, ridx = self.index.join(other, how='inner', @@ -3635,25 +3823,24 @@ def test_join_inner(self): elidx = np.array([8, 9]) eridx = np.array([9, 7]) - tm.assert_isinstance(res, Int64Index) + self.assertIsInstance(res, Int64Index) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) 
self.assert_numpy_array_equal(ridx, eridx) - ### Join two RangeIndex + # Join two RangeIndex other = RangeIndex(25, 14, -1) res, lidx, ridx = self.index.join(other, how='inner', return_indexers=True) - tm.assert_isinstance(res, RangeIndex) + self.assertIsInstance(res, RangeIndex) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) self.assert_numpy_array_equal(ridx, eridx) - def test_join_left(self): - ### Join with Int64Index + # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) res, lidx, ridx = self.index.join(other, how='left', @@ -3662,24 +3849,24 @@ def test_join_left(self): eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.int64) - tm.assert_isinstance(res, RangeIndex) + self.assertIsInstance(res, RangeIndex) self.assertTrue(res.equals(eres)) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) - ### Join withRangeIndex + # Join withRangeIndex other = Int64Index(np.arange(25, 14, -1)) res, lidx, ridx = self.index.join(other, how='left', return_indexers=True) - tm.assert_isinstance(res, RangeIndex) + self.assertIsInstance(res, RangeIndex) self.assertTrue(res.equals(eres)) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): - ### Join with Int64Index + # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) res, lidx, ridx = self.index.join(other, how='right', @@ -3688,19 +3875,19 @@ def test_join_right(self): elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], dtype=np.int64) - tm.assert_isinstance(other, Int64Index) + self.assertIsInstance(other, Int64Index) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) - ### Join withRangeIndex + # Join withRangeIndex other = RangeIndex(25, 14, -1) res, lidx, ridx = self.index.join(other, how='right', return_indexers=True) eres = other - tm.assert_isinstance(other, RangeIndex) + self.assertIsInstance(other, RangeIndex) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) self.assertIsNone(ridx) @@ -3740,7 +3927,8 @@ def test_join_non_unique(self): eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int64) - eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.int64) + eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], + dtype=np.int64) self.assertTrue(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) @@ -3753,7 +3941,7 @@ def test_join_self(self): self.assertIs(self.index, joined) def test_intersection(self): - ### intersect with Int64Index + # intersect with Int64Index other = Index(np.arange(1, 6)) result = self.index.intersection(other) expected = np.sort(np.intersect1d(self.index.values, other.values)) @@ -3764,14 +3952,14 @@ def test_intersection(self): other.values))) self.assert_numpy_array_equal(result, expected) - ### intersect with increasing RangeIndex - other = Index(1, 6) + # intersect with increasing RangeIndex + other = RangeIndex(1, 6) result = self.index.intersection(other) expected = np.sort(np.intersect1d(self.index.values, other.values)) self.assert_numpy_array_equal(result, expected) - ### intersect with decreasing RangeIndex - other = Index(5, 0, -1) + # intersect with decreasing RangeIndex + other = RangeIndex(5, 0, -1) result = self.index.intersection(other) expected = np.sort(np.intersect1d(self.index.values, other.values)) self.assert_numpy_array_equal(result, expected) @@ -3798,6 +3986,16 @@ def 
test_union_noncomparable(self): expected = np.concatenate((other, self.index)) self.assert_numpy_array_equal(result, expected) + def test_nbytes(self): + + # memory savings vs int index + i = RangeIndex(0, 1000) + self.assertTrue(i.nbytes < i.astype(int).nbytes / 10) + + # constant memory usage + i2 = RangeIndex(0, 10) + self.assertEqual(i.nbytes, i2.nbytes) + def test_cant_or_shouldnt_cast(self): # can't self.assertRaises(TypeError, RangeIndex, 'foo', 'bar', 'baz') @@ -3817,14 +4015,6 @@ def test_take_preserve_name(self): taken = index.take([3, 0, 1]) self.assertEqual(index.name, taken.name) - def test_int_name_format(self): - from pandas import Series, DataFrame - index = Index(0, 3, name=0) - s = Series(lrange(3), index) - df = DataFrame(lrange(3), index=index) - repr(s) - repr(df) - def test_print_unicode_columns(self): df = pd.DataFrame( {u("\u05d0"): [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) @@ -3833,65 +4023,10 @@ def test_print_unicode_columns(self): def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(self.index)), self.index) - def test_unicode_string_with_unicode(self): - idx = Index(0, 1000) - - if compat.PY3: - str(idx) - else: - compat.text_type(idx) - - def test_bytestring_with_unicode(self): - idx = Index(0, 1000) - if compat.PY3: - bytes(idx) - else: - str(idx) - def test_slice_keep_name(self): idx = RangeIndex(1, 2, name='asdf') self.assertEqual(idx.name, idx[1:].name) - def test_numeric_compat(self): - idx = RangeIndex(5) - didx = Index(np.arange(5,dtype='int64')**2) - - # note: special cases of the following could return RangeIndex - # see _mul() example - - result = idx * 1 - tm.assert_index_equal(result, idx) - - result = 1 * idx - tm.assert_index_equal(result, idx) - - result = idx * idx - tm.assert_index_equal(result, didx) - - result = idx / 1 - tm.assert_index_equal(result, idx) - - result = idx // 1 - tm.assert_index_equal(result, idx) - - result = idx * np.array(5,dtype='int64') - tm.assert_index_equal(result, Index(np.arange(5,dtype='int64')*5)) - - result = idx * np.arange(5,dtype='int64') - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5,dtype='int64')) - tm.assert_index_equal(result, didx) - - result = idx * Series(np.arange(5,dtype='float64')+0.1) - tm.assert_index_equal(result, - Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) - - # invalid - self.assertRaises(TypeError, lambda : idx * date_range('20130101',periods=5)) - self.assertRaises(ValueError, lambda : idx * self._holder(3)) - self.assertRaises(ValueError, lambda : idx * np.array([1,2])) - def test_explicit_conversions(self): # GH 8608 @@ -3899,23 +4034,23 @@ def test_explicit_conversions(self): idx = RangeIndex(5) # float conversions - arr = np.arange(5,dtype='int64')*3.2 + arr = np.arange(5, dtype='int64') * 3.2 expected = Float64Index(arr) fidx = idx * 3.2 - tm.assert_index_equal(fidx,expected) + tm.assert_index_equal(fidx, expected) fidx = 3.2 * idx - tm.assert_index_equal(fidx,expected) + tm.assert_index_equal(fidx, expected) # interops with numpy arrays expected = Float64Index(arr) - a = np.zeros(5,dtype='float64') + a = np.zeros(5, dtype='float64') result = fidx - a - tm.assert_index_equal(result,expected) + tm.assert_index_equal(result, expected) expected = Float64Index(-arr) - a = np.zeros(5,dtype='float64') + a = np.zeros(5, dtype='float64') result = a - fidx - tm.assert_index_equal(result,expected) + tm.assert_index_equal(result, expected) def test_duplicates(self): for ind in self.indices: @@ -3928,16 +4063,16 @@ 
def test_duplicates(self): def test_ufunc_compat(self): idx = RangeIndex(5) result = np.sin(idx) - expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) + expected = Float64Index(np.sin(np.arange(5, dtype='int64'))) tm.assert_index_equal(result, expected) def test_extended_gcd(self): result = self.index._extended_gcd(6, 10) - self.assertEqual(result[0], result[1]*6 + result[2]*10) + self.assertEqual(result[0], result[1] * 6 + result[2] * 10) self.assertEqual(2, result[0]) result = self.index._extended_gcd(10, 6) - self.assertEqual(2, result[1]*10 + result[2]*6) + self.assertEqual(2, result[1] * 10 + result[2] * 6) self.assertEqual(2, result[0]) def test_min_fitting_element(self): @@ -3971,6 +4106,7 @@ def test_pickle_compat_construction(self): pass def test_slice_specialised(self): + # scalar indexing res = self.index[1] expected = 2 @@ -3980,7 +4116,7 @@ def test_slice_specialised(self): expected = 18 self.assertEqual(res, expected) - ### slicing + # slicing # slice value completion index = self.index[:] expected = self.index @@ -4001,15 +4137,50 @@ def test_slice_specialised(self): expected = np.array([4, 12]) self.assert_numpy_array_equal(index, expected) + # reverse + index = self.index[::-1] + expected = self.index.values[::-1] + self.assert_numpy_array_equal(index, expected) + + index = self.index[-8::-1] + expected = np.array([4, 2, 0]) + self.assert_numpy_array_equal(index, expected) + + index = self.index[-40::-1] + expected = np.array([]) + self.assert_numpy_array_equal(index, expected) + + index = self.index[40::-1] + expected = self.index.values[40::-1] + self.assert_numpy_array_equal(index, expected) + + index = self.index[10::-1] + expected = self.index.values[::-1] + self.assert_numpy_array_equal(index, expected) + def test_len_specialised(self): - # TODO: How to test that len is specialised rather than calling - # the parent classes __len__() (which is slow)? - pass - def test_size_specialised(self): - # TODO: How to test that size is specialised rather than calling - # the parent classes size property (which is slow)? 
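The `_extended_gcd` helper exercised by `test_extended_gcd` above is the standard iterative extended Euclidean algorithm, which the range-intersection logic relies on to find a common element of two arithmetic progressions. A minimal standalone sketch of the property the test asserts (the function name and signature here are illustrative, not the pandas-internal API):

```python
def extended_gcd(a, b):
    """Return (g, s, t) with g == gcd(a, b) == s * a + t * b."""
    old_r, r = a, b
    old_s, s = 1, 0
    old_t, t = 0, 1
    while r != 0:
        q = old_r // r
        old_r, r = r, old_r - q * r
        old_s, s = s, old_s - q * s
        old_t, t = t, old_t - q * t
    return old_r, old_s, old_t

# mirrors the expectations in test_extended_gcd
g, s, t = extended_gcd(6, 10)
assert g == 2 and g == s * 6 + t * 10
```

Two ranges can only intersect if the difference of their starts is divisible by the gcd of their steps, and the Bézout coefficients returned here let the implementation locate such a common element without materializing either range.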
- pass + # make sure that our len is the same as + # np.arange calc + + for step in np.arange(1, 6, 1): + + arr = np.arange(0, 5, step) + i = RangeIndex(0, 5, step) + self.assertEqual(len(i), len(arr)) + + i = RangeIndex(5, 0, step) + self.assertEqual(len(i), 0) + + for step in np.arange(-6, -1, 1): + + arr = np.arange(5, 0, step) + i = RangeIndex(5, 0, step) + self.assertEqual(len(i), len(arr)) + + i = RangeIndex(0, 5, step) + self.assertEqual(len(i), 0) + class DatetimeLike(Base): @@ -4700,24 +4871,25 @@ def test_numeric_compat(self): result = idx // 1 tm.assert_index_equal(result, idx) - result = idx * np.array(5,dtype='int64') - tm.assert_index_equal(result, self._holder(np.arange(5,dtype='int64')*5)) + result = idx * np.array(5, dtype='int64') + tm.assert_index_equal(result, + self._holder(np.arange(5, dtype='int64') * 5)) - result = idx * np.arange(5,dtype='int64') + result = idx * np.arange(5, dtype='int64') tm.assert_index_equal(result, didx) - result = idx * Series(np.arange(5,dtype='int64')) + result = idx * Series(np.arange(5, dtype='int64')) tm.assert_index_equal(result, didx) - result = idx * Series(np.arange(5,dtype='float64')+0.1) + result = idx * Series(np.arange(5, dtype='float64') + 0.1) tm.assert_index_equal(result, - Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) - + self._holder(np.arange(5, dtype='float64') * ( + np.arange(5, dtype='float64') + 0.1))) # invalid - self.assertRaises(TypeError, lambda : idx * idx) - self.assertRaises(ValueError, lambda : idx * self._holder(np.arange(3))) - self.assertRaises(ValueError, lambda : idx * np.array([1,2])) + self.assertRaises(TypeError, lambda: idx * idx) + self.assertRaises(ValueError, lambda: idx * self._holder(np.arange(3))) + self.assertRaises(ValueError, lambda: idx * np.array([1, 2])) def test_pickle_compat_construction(self): pass @@ -5420,8 +5592,9 @@ def test_iter(self): self.assertEqual(result, expected) def test_legacy_pickle(self): - if compat.PY3: - raise nose.SkipTest("testing for legacy pickles not support on py3") + if PY3: + raise nose.SkipTest("testing for legacy pickles not " + "support on py3") path = tm.get_data_path('multiindex_v1.pickle') obj = pd.read_pickle(path) @@ -6504,10 +6677,11 @@ def test_repr_with_unicode_data(self): def test_repr_roundtrip(self): - mi = MultiIndex.from_product([list('ab'),range(3)],names=['first','second']) + mi = MultiIndex.from_product([list('ab'), range(3)], + names=['first', 'second']) str(mi) - if compat.PY3: + if PY3: tm.assert_index_equal(eval(repr(mi)), mi, exact=True) else: result = eval(repr(mi)) @@ -6521,16 +6695,17 @@ def test_repr_roundtrip(self): tm.assert_index_equal(result, mi_u, exact=True) # formatting - if compat.PY3: + if PY3: str(mi) else: compat.text_type(mi) # long format - mi = MultiIndex.from_product([list('abcdefg'),range(10)],names=['first','second']) + mi = MultiIndex.from_product([list('abcdefg'), range(10)], + names=['first', 'second']) result = str(mi) - if compat.PY3: + if PY3: tm.assert_index_equal(eval(repr(mi)), mi, exact=True) else: result = eval(repr(mi)) @@ -6551,7 +6726,7 @@ def test_unicode_string_with_unicode(self): d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} idx = pd.DataFrame(d).set_index(["a", "b"]).index - if compat.PY3: + if PY3: str(idx) else: compat.text_type(idx) @@ -6560,7 +6735,7 @@ def test_bytestring_with_unicode(self): d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} idx = pd.DataFrame(d).set_index(["a", "b"]).index - if compat.PY3: + if PY3: bytes(idx) else: 
str(idx) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index c6d80a08ad61a..5c3e4c01a965a 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -4352,25 +4352,29 @@ def check_invalid(index, loc=None, iloc=None, ix=None, getitem=None): # related 236/4850 # trying to access with a float index - s = Series(np.arange(len(index)),index=index) + s = Series(np.arange(len(index)), index=index) if iloc is None: iloc = TypeError - self.assertRaises(iloc, lambda : s.iloc[3.5]) + self.assertRaises(iloc, lambda: s.iloc[3.5]) if loc is None: loc = TypeError - self.assertRaises(loc, lambda : s.loc[3.5]) + self.assertRaises(loc, lambda: s.loc[3.5]) if ix is None: ix = TypeError - self.assertRaises(ix, lambda : s.ix[3.5]) + self.assertRaises(ix, lambda: s.ix[3.5]) if getitem is None: getitem = TypeError - self.assertRaises(getitem, lambda : s[3.5]) + self.assertRaises(getitem, lambda: s[3.5]) - for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex ]: - check_invalid(index()) - check_invalid(Index(np.arange(5) * 2.5),loc=KeyError, ix=KeyError, getitem=KeyError) + for index in [tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeIntIndex, tm.makeRangeIndex, + tm.makeDateIndex, tm.makePeriodIndex]: + check_invalid(index()) + check_invalid(Index(np.arange(5) * 2.5), + loc=KeyError, + ix=KeyError, + getitem=KeyError) def check_index(index, error): index = index() @@ -4472,37 +4476,38 @@ def check_slicing_positional(index): ############ # IntIndex # ############ - index = tm.makeIntIndex() - s = Series(np.arange(len(index),dtype='int64')+10,index+5) + for index in [tm.makeIntIndex(), tm.makeRangeIndex()]: - # this is positional - result1 = s[2:5] - result4 = s.iloc[2:5] - assert_series_equal(result1, result4) + s = Series(np.arange(len(index), dtype='int64') + 10, index + 5) - # these are all label based - result2 = s.ix[2:5] - result3 = s.loc[2:5] - assert_series_equal(result2, result3) + # this is positional + result1 = s[2:5] + result4 = s.iloc[2:5] + assert_series_equal(result1, result4) + + # these are all label based + result2 = s.ix[2:5] + result3 = s.loc[2:5] + assert_series_equal(result2, result3) - # float slicers on an int index - expected = Series([11,12,13],index=[6,7,8]) - for method in [lambda x: x.loc, lambda x: x.ix]: - result = method(s)[6.0:8.5] - assert_series_equal(result, expected) + # float slicers on an int index + expected = Series([11, 12, 13], index=[6, 7, 8]) + for method in [lambda x: x.loc, lambda x: x.ix]: + result = method(s)[6.0:8.5] + assert_series_equal(result, expected) - result = method(s)[5.5:8.5] - assert_series_equal(result, expected) + result = method(s)[5.5:8.5] + assert_series_equal(result, expected) - result = method(s)[5.5:8.0] - assert_series_equal(result, expected) + result = method(s)[5.5:8.0] + assert_series_equal(result, expected) - # make all float slicing fail for [] with an int index - self.assertRaises(TypeError, lambda : s[6.0:8]) - self.assertRaises(TypeError, lambda : s[6.0:8.0]) - self.assertRaises(TypeError, lambda : s[6:8.0]) + # make all float slicing fail for [] with an int index + self.assertRaises(TypeError, lambda: s[6.0:8]) + self.assertRaises(TypeError, lambda: s[6.0:8.0]) + self.assertRaises(TypeError, lambda: s[6:8.0]) - check_iloc_compat(s) + check_iloc_compat(s) ############## # FloatIndex # @@ -4658,19 +4663,20 @@ def f(): self.assertRaises(FutureWarning, f) # slices - for index in [ tm.makeIntIndex, tm.makeFloatIndex, - 
tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeDateIndex, tm.makePeriodIndex ]: + for index in [tm.makeIntIndex, tm.makeRangeIndex, tm.makeFloatIndex, + tm.makeStringIndex, tm.makeUnicodeIndex, + tm.makeDateIndex, tm.makePeriodIndex]: index = index(5) - for s in [ Series(range(5),index=index), DataFrame(np.random.randn(5,2),index=index) ]: + for s in [Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index)]: # getitem - self.assertRaises(FutureWarning, lambda : + self.assertRaises(FutureWarning, lambda: s.iloc[3.0:4]) - self.assertRaises(FutureWarning, lambda : + self.assertRaises(FutureWarning, lambda: s.iloc[3.0:4.0]) - self.assertRaises(FutureWarning, lambda : + self.assertRaises(FutureWarning, lambda: s.iloc[3:4.0]) # setitem diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index d37ac530d02e8..a2b1a84e78f22 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -826,6 +826,9 @@ def test_constructor(self): def test_constructor_empty(self): empty = Series() empty2 = Series([]) + + # the are Index() and RangeIndex() which don't compare type equal + # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) empty = Series(index=lrange(10)) @@ -1226,7 +1229,7 @@ def test_constructor_dict(self): def test_constructor_dict_multiindex(self): check = lambda result, expected: tm.assert_series_equal( - result, expected, check_dtype=True, check_index_type=True, + result, expected, check_dtype=True, check_series_type=True) d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.} _d = sorted(d.items()) @@ -7418,6 +7421,7 @@ def test_reindex_nan(self): assert_series_equal(ts.reindex(i), ts.iloc[j]) ts.index = ts.index.astype('object') + # reindex coerces index.dtype to float, loc/iloc doesn't assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 0013a6579718a..269d272525ce6 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1324,7 +1324,7 @@ def test_split_no_pat_with_nonzero_n(self): s = Series(['split once', 'split once too!']) result = s.str.split(n=1) expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']}) - tm.assert_series_equal(expected, result) + tm.assert_series_equal(expected, result, check_index_type=False) def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) @@ -1393,7 +1393,7 @@ def test_split_to_dataframe_expand(self): def test_split_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) result = idx.str.split('_', expand=True) - exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])]) + exp = idx tm.assert_index_equal(result, exp) self.assertEqual(result.nlevels, 1) @@ -1446,7 +1446,7 @@ def test_rsplit_to_dataframe_expand(self): def test_rsplit_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) result = idx.str.rsplit('_', expand=True) - exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])]) + exp = idx tm.assert_index_equal(result, exp) self.assertEqual(result.nlevels, 1) diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 13c0b6a08f6e7..58c4285b8394e 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -283,9 +283,8 @@ def test_index_equal_message(self): \\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\], labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)""" idx1 = pd.Index([1, 2, 3]) - idx2 = 
pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4)]) - with assertRaisesRegexp(AssertionError, expected): - assert_index_equal(idx1, idx2) + idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), + ('B', 3), ('B', 4)]) with assertRaisesRegexp(AssertionError, expected): assert_index_equal(idx1, idx2, exact=False) @@ -471,8 +470,8 @@ def test_series_equal_message(self): expected = """Series are different Series length are different -\\[left\\]: 3, Int64Index\\(\\[0, 1, 2\\], dtype='int64'\\) -\\[right\\]: 4, Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\)""" +\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) +\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" with assertRaisesRegexp(AssertionError, expected): assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) @@ -526,12 +525,11 @@ def test_frame_equal_message(self): expected = """DataFrame are different DataFrame shape \\(number of rows\\) are different -\\[left\\]: 3, Int64Index\\(\\[0, 1, 2\\], dtype='int64'\\) -\\[right\\]: 4, Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\)""" +\\[left\\]: 3, RangeIndex\\(start=0, stop=3, step=1\\) +\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)""" with assertRaisesRegexp(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A':[1, 2, 3]}), - pd.DataFrame({'A':[1, 2, 3, 4]})) - + assert_frame_equal(pd.DataFrame({'A': [1, 2, 3]}), + pd.DataFrame({'A': [1, 2, 3, 4]})) expected = """DataFrame are different @@ -539,9 +537,8 @@ def test_frame_equal_message(self): \\[left\\]: 2, Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) \\[right\\]: 1, Index\\(\\[u?'A'\\], dtype='object'\\)""" with assertRaisesRegexp(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}), - pd.DataFrame({'A':[1, 2, 3]})) - + assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), + pd.DataFrame({'A': [1, 2, 3]})) expected = """DataFrame\\.index are different @@ -549,10 +546,10 @@ def test_frame_equal_message(self): \\[left\\]: Index\\(\\[u?'a', u?'b', u?'c'\\], dtype='object'\\) \\[right\\]: Index\\(\\[u?'a', u?'b', u?'d'\\], dtype='object'\\)""" with assertRaisesRegexp(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, - index=['a', 'b', 'c']), - pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, - index=['a', 'b', 'd'])) + assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['a', 'b', 'c']), + pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['a', 'b', 'd'])) expected = """DataFrame\\.columns are different @@ -560,11 +557,10 @@ def test_frame_equal_message(self): \\[left\\]: Index\\(\\[u?'A', u?'B'\\], dtype='object'\\) \\[right\\]: Index\\(\\[u?'A', u?'b'\\], dtype='object'\\)""" with assertRaisesRegexp(AssertionError, expected): - assert_frame_equal(pd.DataFrame({'A':[1, 2, 3], 'B':[4, 5, 6]}, - index=['a', 'b', 'c']), - pd.DataFrame({'A':[1, 2, 3], 'b':[4, 5, 6]}, - index=['a', 'b', 'c'])) - + assert_frame_equal(pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, + index=['a', 'b', 'c']), + pd.DataFrame({'A': [1, 2, 3], 'b': [4, 5, 6]}, + index=['a', 'b', 'c'])) expected = """DataFrame\\.iloc\\[:, 1\\] are different diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index bf37bd4afe1da..2a1e59154f3d1 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -731,13 +731,13 @@ def test_ops_compat(self): # multiply for offset in offsets: - self.assertRaises(TypeError, lambda : rng * offset) + 
self.assertRaises(TypeError, lambda: rng * offset) # divide - expected = Int64Index((np.arange(10)+1)*12,name='foo') + expected = Int64Index((np.arange(10) + 1) * 12, name='foo') for offset in offsets: result = rng / offset - tm.assert_index_equal(result,expected) + tm.assert_index_equal(result, expected, exact=False) # divide with nats rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 1c21863415c62..685d89fee53b5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -36,8 +36,9 @@ from pandas.computation import expressions as expr -from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, - Index, MultiIndex, Series, DataFrame, Panel, Panel4D) +from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, + TimedeltaIndex, PeriodIndex, RangeIndex, Index, MultiIndex, + Series, DataFrame, Panel, Panel4D) from pandas.util.decorators import deprecate from pandas import _testing from pandas.io.common import urlopen @@ -599,19 +600,22 @@ def assert_equal(a, b, msg=""): ... AssertionError: 5.2 was really a dead parrot: 5.2 != 1.2 """ - assert a == b, "%s: %r != %r" % (msg.format(a,b), a, b) + assert a == b, "%s: %r != %r" % (msg.format(a, b), a, b) -def assert_index_equal(left, right, exact=False, check_names=True, - check_less_precise=False, check_exact=True, obj='Index'): +def assert_index_equal(left, right, exact='equiv', check_names=True, + check_less_precise=False, check_exact=True, + obj='Index'): """Check that left and right Index are equal. Parameters ---------- left : Index right : Index - exact : bool, default False - Whether to check the Index class, dtype and inferred_type are identical. + exact : bool / string {'equiv'}, default False + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substitued for + Int64Index as well check_names : bool, default True Whether to check the names attribute. 
check_less_precise : bool, default False @@ -626,9 +630,19 @@ def assert_index_equal(left, right, exact=False, check_names=True, def _check_types(l, r, obj='Index'): if exact: - if type(l) != type(r): - msg = '{0} classes are different'.format(obj) - raise_assert_detail(obj, msg, l, r) + + if exact == 'equiv': + if type(l) != type(r): + # allow equivalence of Int64Index/RangeIndex + types = set([type(l).__name__, type(r).__name__]) + if len(types - set(['Int64Index', 'RangeIndex'])): + msg = '{0} classes are not equivalent'.format(obj) + raise_assert_detail(obj, msg, l, r) + else: + if type(l) != type(r): + msg = '{0} classes are different'.format(obj) + raise_assert_detail(obj, msg, l, r) + assert_attr_equal('dtype', l, r, obj=obj) # allow string-like to have different inferred_types @@ -642,7 +656,8 @@ def _get_ilevel_values(index, level): unique = index.levels[level] labels = index.labels[level] filled = take_1d(unique.values, labels, fill_value=unique._na_value) - values = unique._simple_new(filled, index.names[level], + values = unique._simple_new(filled, + name=index.names[level], freq=getattr(unique, 'freq', None), tz=getattr(unique, 'tz', None)) return values @@ -652,7 +667,7 @@ def _get_ilevel_values(index, level): assertIsInstance(right, Index, '[index] ') # class / dtype comparison - _check_types(left, right) + _check_types(left, right, obj=obj) # level comparison if left.nlevels != right.nlevels: @@ -876,7 +891,7 @@ def assert_numpy_array_equal(left, right, # This could be refactored to use the NDFrame.equals method def assert_series_equal(left, right, check_dtype=True, - check_index_type=True, + check_index_type='equiv', check_series_type=True, check_less_precise=False, check_names=True, @@ -892,8 +907,9 @@ def assert_series_equal(left, right, check_dtype=True, right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. - check_index_type : bool, default False - Whether to check the Index class, dtype and inferred_type are identical. + check_index_type : bool / string {'equiv'}, default False + Whether to check the Index class, dtype and inferred_type + are identical. check_series_type : bool, default False Whether to check the Series class is identical. check_less_precise : bool, default False @@ -958,8 +974,8 @@ def assert_series_equal(left, right, check_dtype=True, # This could be refactored to use the NDFrame.equals method def assert_frame_equal(left, right, check_dtype=True, - check_index_type=True, - check_column_type=True, + check_index_type='equiv', + check_column_type='equiv', check_frame_type=True, check_less_precise=False, check_names=True, @@ -976,10 +992,12 @@ def assert_frame_equal(left, right, check_dtype=True, right : DataFrame check_dtype : bool, default True Whether to check the DataFrame dtype is identical. - check_index_type : bool, default False - Whether to check the Index class, dtype and inferred_type are identical. - check_column_type : bool, default False - Whether to check the columns class, dtype and inferred_type are identical. + check_index_type : bool / string {'equiv'}, default False + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool / string {'equiv'}, default False + Whether to check the columns class, dtype and inferred_type + are identical. check_frame_type : bool, default False Whether to check the DataFrame class is identical. 
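The `exact='equiv'` mode added to `assert_index_equal` above (and picked up as the new default for `check_index_type`/`check_column_type`) is what keeps the test suite passing while the default index changes class. A small usage sketch, assuming a build with this patch series applied so that `RangeIndex` is exported at the top level:

```python
import numpy as np
import pandas as pd
import pandas.util.testing as tm

int_idx = pd.Index(np.arange(3))    # Int64Index([0, 1, 2], dtype='int64')
rng_idx = pd.RangeIndex(0, 3, 1)    # RangeIndex(start=0, stop=3, step=1)

# 'equiv' treats Int64Index and RangeIndex as interchangeable classes
tm.assert_index_equal(int_idx, rng_idx, exact='equiv')

# a strict class check still tells them apart
try:
    tm.assert_index_equal(int_idx, rng_idx, exact=True)
except AssertionError:
    print('exact=True flags the class difference')

# the frame/series helpers pick up the relaxed default, so a frame built
# with the new default RangeIndex compares equal to one carrying an
# explicit Int64Index
tm.assert_frame_equal(pd.DataFrame({'A': [1, 2, 3]}),
                      pd.DataFrame({'A': [1, 2, 3]}, index=np.arange(3)))
```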
check_less_precise : bool, default False @@ -1106,6 +1124,7 @@ def assert_copy(iter1, iter2, **eql_kwargs): def getCols(k): return string.ascii_uppercase[:k] + def getArangeMat(): return np.arange(N * K).reshape((N, K)) @@ -1118,38 +1137,50 @@ def makeStringIndex(k=10, name=None): def makeUnicodeIndex(k=10, name=None): return Index(randu_array(nchars=10, size=k)) + def makeCategoricalIndex(k=10, n=3, name=None): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x,k), name=name) + return CategoricalIndex(np.random.choice(x, k), name=name) + def makeBoolIndex(k=10, name=None): if k == 1: return Index([True], name=name) elif k == 2: - return Index([False,True], name=name) - return Index([False,True] + [False]*(k-2), name=name) + return Index([False, True], name=name) + return Index([False, True] + [False] * (k - 2), name=name) + def makeIntIndex(k=10, name=None): return Index(lrange(k), name=name) + +def makeRangeIndex(k=10, name=None): return RangeIndex(0, k, 1, name=name) + + def makeFloatIndex(k=10, name=None): values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) return Index(values * (10 ** np.random.randint(0, 9)), name=name) + def makeDateIndex(k=10, freq='B', name=None): dt = datetime(2000, 1, 1) dr = bdate_range(dt, periods=k, freq=freq, name=name) return DatetimeIndex(dr, name=name) + def makeTimedeltaIndex(k=10, freq='D', name=None): return TimedeltaIndex(start='1 day', periods=k, freq=freq, name=name) + def makePeriodIndex(k=10, name=None): dt = datetime(2000, 1, 1) dr = PeriodIndex(start=dt, periods=k, freq='B', name=name) return dr + def all_index_generator(k=10): """Generator which can be iterated over to get instances of all the various index classes. @@ -1165,6 +1196,7 @@ def all_index_generator(k=10): for make_index_func in all_make_index_funcs: yield make_index_func(k=k) + def all_timeseries_index_generator(k=10): """Generator which can be iterated over to get instances of all the classes which represent time-series. From 56cef1bc4aa5d4cde4b5d02bcb67ba10eb7320e2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 13 Jan 2016 13:25:44 -0500 Subject: [PATCH 3/6] DOC: documentation --- doc/source/advanced.rst | 25 +++++++++++++++++++++++-- doc/source/timeseries.rst | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index d976b0c8c21a5..465fbf483b1cc 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -617,10 +617,20 @@ faster than fancy indexing. timeit ser.ix[indexer] timeit ser.take(indexer) +.. _indexing.index_types: + +Index Types +----------- + +We have discussed ``MultiIndex`` in the previous sections pretty extensively. ``DatetimeIndex`` and ``PeriodIndex`` +are shown :ref:`here `. ``TimedeltaIndex`` are :ref:`here `. + +In the following sub-sections we will highlight some other index types. + .. _indexing.categoricalindex: CategoricalIndex ----------------- +~~~~~~~~~~~~~~~~ .. versionadded:: 0.16.1 @@ -702,10 +712,21 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. In [12]: pd.concat([df2, df3] TypeError: categories must match existing categories when appending +.. _indexing.rangeindex: + +Int64Index and RangeIndex +~~~~~~~~~~~~~~~~~~~~~~~~~ + +``Int64Index`` is a fundamental basic index in *pandas*. This is an immutable array implementing an ordered, sliceable set.
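To make the memory claim in this new ``Int64Index and RangeIndex`` documentation section concrete, here is a quick sketch assuming a build with this patch series applied, where the default index is a ``RangeIndex`` (exact ``nbytes`` values are platform-dependent; only the relative sizes matter):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': np.arange(10000)})
print(type(df.index).__name__)       # RangeIndex -- the new default index

r = pd.RangeIndex(0, 10000)
i = pd.Index(np.arange(10000))       # Int64Index materializes every value

# only start/stop/step are stored, so the footprint is small and does not
# grow with length -- the property test_nbytes asserts
print(r.nbytes, i.nbytes)
print(pd.RangeIndex(0, 10).nbytes == r.nbytes)   # True
```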
+Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects. + +``RangeIndex`` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects. +``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to Python :ref:`range types `. + .. _indexing.float64index: Float64Index ------------- +~~~~~~~~~~~~ .. note:: diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index b5be9cf395feb..80a4774e02e69 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1091,7 +1091,7 @@ An example of how holidays and holiday calendars are defined: Using this calendar, creating an index or doing offset arithmetic skips weekends and holidays (i.e., Memorial Day/July 4th). For example, the below defines a custom business day offset using the ``ExampleCalendar``. Like any other offset, -it can be used to create a ``DatetimeIndex`` or added to ``datetime`` +it can be used to create a ``DatetimeIndex`` or added to ``datetime`` or ``Timestamp`` objects. .. ipython:: python From c5255da43d5b2366aa69a6da680d87b24a1d8f74 Mon Sep 17 00:00:00 2001 From: Ka Wo Chen Date: Thu, 14 Jan 2016 23:33:39 -0500 Subject: [PATCH 4/6] fixed equals, added test cases, shortcut from_range if PY3 --- pandas/core/index.py | 40 +++++++++++++++++++++++--------------- pandas/tests/test_index.py | 12 +++++++----- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 0965472e9834b..9e323cc161f30 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3990,19 +3990,24 @@ def from_range(cls, data, name=None, dtype=None, **kwargs): '{0}(...) must be called with object coercible to a ' 'range, {1} was passed'.format(cls.__name__, repr(data))) + if compat.PY3: + step = data.step + stop = data.stop + start = data.start + else: # seems we only have indexing ops to infer # rather than direct accessors - if len(data) > 1: - step = data[1] - data[0] - stop = data[-1] + step - start = data[0] - elif len(data): - start = data[0] - stop = data[0] + 1 - step = 1 - else: - start = stop = 0 - step = 1 + if len(data) > 1: + step = data[1] - data[0] + stop = data[-1] + step + start = data[0] + elif len(data): + start = data[0] + stop = data[0] + 1 + step = 1 + else: + start = stop = 0 + step = 1 return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs) @classmethod @@ -4153,11 +4158,14 @@ def equals(self, other): Determines if two Index objects contain the same elements.
""" if isinstance(other, RangeIndex): - return (len(self) == len(other) == 0 - or (self._start == other._start and - self._stop == other._stop and - self._step == other._step) - ) + ls = len(self) + lo = len(other) + return (ls == lo == 0 or + ls == lo == 1 and + self._start == other._start or + ls == lo and + self._start == other._start and + self._step == other._step) return super(RangeIndex, self).equals(other) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index fc6617a319424..7721ddf90d4ab 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3717,11 +3717,13 @@ def test_is_monotonic(self): def test_equals(self): - if isinstance(self.index, RangeIndex): - raise nose.SkipTest("RangeIndex does not accept dtype=object") - same_values = Index(self.index, dtype=object) - self.assertTrue(self.index.equals(same_values)) - self.assertTrue(same_values.equals(self.index)) + equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), + (RangeIndex(0), RangeIndex(1, -1, 3)), + (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), + (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2))] + for left, right in equiv_pairs: + self.assertTrue(left.equals(right)) + self.assertTrue(right.equals(left)) def test_logical_compat(self): idx = self.create_index() From 0407502810dbd223463ea0f682c4266a8f5d3aa7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 14 Jan 2016 21:28:50 -0600 Subject: [PATCH 5/6] floordiv addtl tests --- pandas/tests/test_index.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7721ddf90d4ab..1b4578668adaf 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3464,6 +3464,26 @@ def setUp(self): def create_index(self): return RangeIndex(5) + def test_binops(self): + import operator as op + from itertools import combinations + ops = [op.add, op.sub, op.mul, op.floordiv, op.truediv, pow] + scalars = [-1, 1, 2] + idxs = [RangeIndex(0, 10, 1), + RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), + RangeIndex(5, -5, -1)] + for op in ops: + for a, b in combinations(idxs, 2): + result = op(a, b) + expected = op(Int64Index(a), Int64Index(b)) + tm.assert_index_equal(result, expected) + for idx in idxs: + for scalar in scalars: + result = op(idx, scalar) + expected = op(Int64Index(idx), scalar) + tm.assert_index_equal(result, expected) + def test_too_many_names(self): def testit(): self.index.names = ["roger", "harold"] From fab291b306ebf8b89d094379c99bd1b2a2b601b9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 15 Jan 2016 09:11:54 -0500 Subject: [PATCH 6/6] make floordiv return int64index always --- pandas/core/common.py | 5 +++++ pandas/core/index.py | 15 +++------------ pandas/tests/test_index.py | 11 ++++++----- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 7fae09c83120f..7f955002a2c68 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2199,6 +2199,11 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, (np.datetime64, np.timedelta64))) +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/index.py b/pandas/core/index.py index 9e323cc161f30..63b748ada6afa 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3995,8 +3995,8 @@ def 
from_range(cls, data, name=None, dtype=None, **kwargs): stop = data.stop start = data.start else: - # seems we only have indexing ops to infer - # rather than direct accessors + # seems we only have indexing ops to infer + # rather than direct accessors if len(data) > 1: step = data[1] - data[0] stop = data[-1] + step @@ -4395,7 +4395,7 @@ def _evaluate_numeric_binop(self, other): # we don't have a representable op # so return a base index - if not is_integer(rstep): + if not is_integer(rstep) or not rstep: raise ValueError else: @@ -4440,15 +4440,6 @@ def _evaluate_numeric_binop(self, other): operator.mul, '__mul__', step=operator.mul) - cls.__floordiv__ = _make_evaluate_binop( - operator.floordiv, - '__floordiv__', - step=operator.floordiv) - cls.__rfloordiv__ = _make_evaluate_binop( - operator.floordiv, - '__floordiv__', - reversed=True, - step=operator.floordiv) cls.__truediv__ = _make_evaluate_binop( operator.truediv, '__truediv__', diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 1b4578668adaf..4dcc390787908 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -5,6 +5,7 @@ from pandas import compat from pandas.compat import (long, is_platform_windows, range, lrange, lzip, u, zip, PY3) +from itertools import combinations import operator import re import nose @@ -3465,9 +3466,8 @@ def create_index(self): return RangeIndex(5) def test_binops(self): - import operator as op - from itertools import combinations - ops = [op.add, op.sub, op.mul, op.floordiv, op.truediv, pow] + ops = [operator.add, operator.sub, operator.mul, + operator.floordiv, operator.truediv, pow] scalars = [-1, 1, 2] idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), @@ -3611,7 +3611,8 @@ def test_numeric_compat2(self): self.assertTrue(result.equals(expected)) result = idx // 1 - tm.assert_index_equal(result, idx, exact=True) + expected = idx._int64index // 1 + tm.assert_index_equal(result, expected, exact=True) # __mul__ result = idx * idx @@ -3627,7 +3628,7 @@ def test_numeric_compat2(self): # __floordiv__ idx = RangeIndex(0, 1000, 2) result = idx // 2 - expected = RangeIndex(0, 500, 1) + expected = idx._int64index // 2 tm.assert_index_equal(result, expected, exact=True) idx = RangeIndex(0, 1000, 1)
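The final commit drops the special-cased ``__floordiv__``/``__rfloordiv__`` bindings, so floor division now always falls back to the ``Int64Index`` implementation, as the updated ``test_numeric_compat2`` expects. A short sketch of the resulting behaviour on a patched build (``_int64index`` is the private cached fallback that the tests themselves use):

```python
import pandas as pd

idx = pd.RangeIndex(0, 1000, 2)

result = idx // 2
expected = idx._int64index // 2      # private cached Int64Index equivalent

print(type(result).__name__)         # Int64Index, not RangeIndex
print(result.equals(expected))       # True
print(list(result[:5]))              # [0, 1, 2, 3, 4]
```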