From 991c6847e64459a2d74d91a6f1f1a46e4a4edc64 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 24 Apr 2015 13:11:37 +0200 Subject: [PATCH 1/7] Introduction of RangeIndex `RangeIndex(1, 10, 2)` is a memory-saving alternative to `Index(np.arange(1, 10,2))`: cf. #939. This re-implementation is compatible with the current `Index()` API and is a drop-in replacement for `Int64Index()`. It automatically converts to Int64Index() when required by operations. At present the type is conserved only for a small number of operations (e.g. slicing and inner, left and right joins). Most other operations trigger creation of an equivalent Int64Index (or at least an equivalent numpy array) and fall back to its implementation. This PR also extends the functionality of the `Index()` constructor to allow creation of `RangeIndex` objects with ``` Index(20) Index(2, 20) Index(0, 20, 2) ``` in analogy to ``` range(20) range(2, 20) range(0, 20, 2) ``` --- pandas/core/api.py | 2 +- pandas/core/common.py | 5 + pandas/core/index.py | 437 +++++++++++++++++++++++++++- pandas/tests/test_index.py | 567 ++++++++++++++++++++++++++++++++++++- 4 files changed, 1003 insertions(+), 8 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index fde9bc77c4bd9..103fe740cfa36 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, RangeIndex, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/common.py b/pandas/core/common.py index 3d23aeff942dc..878b1af078d4d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2498,6 +2498,11 @@ def is_integer_dtype(arr_or_dtype): not issubclass(tipo, 
(np.datetime64, np.timedelta64))) +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or diff --git a/pandas/core/index.py b/pandas/core/index.py index 8b650fea9b440..f5278ccc9cf6e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,6 +7,7 @@ from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map from pandas import compat import numpy as np +from math import ceil, floor from sys import getsizeof import pandas.tslib as tslib @@ -21,7 +22,7 @@ from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) + is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, is_int64_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -107,13 +108,36 @@ class Index(IndexOpsMixin, PandasObject): def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): + # RangeIndex pass-through + # Index(start, stop, ...) --> RangeIndex(start, stop, ...) + if isinstance(data, int): + if dtype is None and copy == False: + copy = None + range_constructor = True + elif isinstance(dtype, int): + range_constructor = True + if copy == False: + copy = None + elif isinstance(copy, int): + range_constructor = True + else: + range_constructor = False + + if range_constructor: + return RangeIndex(data, dtype, copy, name) + + # no class inference! 
if fastpath: return cls._simple_new(data, name) from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, Index, ABCSeries)): - if issubclass(data.dtype.type, np.datetime64): + if (isinstance(data, RangeIndex) and + (dtype is None or is_int64_dtype(dtype))): + # copy passed-in RangeIndex + return data.copy(name=name) + elif issubclass(data.dtype.type, np.datetime64): from pandas.tseries.index import DatetimeIndex result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: @@ -3299,6 +3323,415 @@ def _wrap_joined_index(self, joined, other): Int64Index._add_logical_methods() +class RangeIndex(Int64Index): + + """ + Immutable Index implementing an monotonic range. RangeIndex is a + memory-saving special case of `Int64Index` limited to representing + monotonic ranges. + + Parameters + ---------- + start : int (default: 0) + stop : int (default: 0) + step : int (default: 1) + name : object, optional + Name to be stored in the index + """ + + _typ = 'rangeindex' + _engine_type = _index.Int64Engine + _attributes = ['name', 'start', 'stop', 'step'] + + def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): + if fastpath: + return cls._simple_new(start, stop, step, name=name) + + # RangeIndex() constructor + if start is None and stop is None and step is None: + return cls._simple_new(0, 0, 1, name=name) + + # sort the arguments depending on which are provided + if step is None: + step = 1 + if stop is None: + stop = start + start = 0 + + # check validity of inputs + start = cls._ensure_int(start) + stop = cls._ensure_int(stop) + step = cls._ensure_int(step) + if step == 0: + raise ValueError("Step must not be zero") + + return cls._simple_new(start, stop, step, name) + + @classmethod + def _simple_new(cls, start, stop, step, name=None): + result = object.__new__(cls) + result._start = start + result._stop = stop + result._step = step + result.name = name + return result + + 
@classmethod + def _ensure_int(cls, value): + try: + int_value = int(value) + if int_value != value: + raise Exception + except Exception: + raise TypeError("Need to pass integral values") + return int_value + + @property + def _data(self): + return np.arange(self.start, self.stop, self.step, dtype=np.int64) + + @property + def dtype(self): + return np.dtype(np.int64) + + @property + def start(self): + return self._start + + @start.setter + def start(self, value): + self._start = self._ensure_int(value) + + @property + def stop(self): + return self._stop + + @stop.setter + def stop(self, value): + self._stop = self._ensure_int(value) + + @property + def step(self): + return self._step + + @step.setter + def step(self, value): + self._step = self._ensure_int(value) + + @cache_readonly(allow_setting=True) + def is_unique(self): + """ return if the index has unique values """ + return True + + @property + def has_duplicates(self): + return not self.is_unique + + def tolist(self): + return list(range(self.start, self.stop, self.step)) + + def _shallow_copy(self, values=None, **kwargs): + """ create a new Index, don't copy the data, use the same object attributes + with passed in attributes taking precedence """ + if values is None: + return RangeIndex(self.start, self.stop, self.step, + name=self.name, fastpath=True) + else: + name = kwargs.get('name', self.name) + return Int64Index(self.values, name=name, copy=False)._shallow_copy(values, **kwargs) + + def copy(self, names=None, name=None, dtype=None, deep=False): + """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. + + Parameters + ---------- + name : string, optional + dtype : numpy dtype or pandas type + + Returns + ------- + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
+ """ + if dtype is not None and not is_int64_dtype(dtype): + return super(RangeIndex, self).copy(names, name, dtype, deep) + + if name is None: + name = self.name + return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) + + def argsort(self, *args, **kwargs): + """ + return an ndarray indexer of the underlying data + + See also + -------- + numpy.ndarray.argsort + """ + return self._data.argsort(*args, **kwargs) + + def __repr__(self): + attrs = [('start', default_pprint(self.start)), + ('stop', default_pprint(self.stop)), + ('step', default_pprint(self.step)), + ('name', default_pprint(self.name))] + + prepr = u(", ").join([u("%s=%s") % (k, v) + for k, v in attrs]) + res = u("%s(%s)") % (self.__class__.__name__, prepr) + + if not compat.PY3: + # needs to be str in Python 2 + encoding = get_option('display.encoding') + res = res.encode(encoding) + return res + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + if self.start != 0 or self.step != 1: + start = u('%s, ') % default_pprint(self.start) + else: + start = u('') + stop = default_pprint(self.stop) + step = u('') if self.step == 1 else u(', %s') % default_pprint(self.step) + if self.name is None: + name = u('') + else: + name = u(', name=%s') % default_pprint(self.name) + + res = u("%s(%s%s%s%s)") % (self.__class__.__name__, + start, stop, step, name) + return res + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + elif isinstance(other, RangeIndex): + return (self.start == other.start and + self.stop == other.stop and + self.step == other.step) + + try: + return array_equivalent(_values_from_object(self), + _values_from_object(other)) + except TypeError: + # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 + return False + + def __reduce__(self): + d = self._get_attributes_dict() + return _new_Index, (self.__class__, d), None + + def view(self, cls=None): + if cls is None or is_int64_dtype(cls): + return self + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result + + def intersection(self, other): + """ + Form the intersection of two Index objects. Sortedness of the result is + not guaranteed + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + intersection : Index + """ + if not isinstance(other, RangeIndex): + return super(RangeIndex, self).intersection(other) + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(min(self.start, self.stop+1), + min(other.start, other.stop+1)) + int_high = min(max(self.stop, self.start+1), + max(other.stop, other.start+1)) + if int_high <= int_low: + return RangeIndex() + + ### Method hint: linear Diophantine equation + # solve intersection + # perf: for identical step sizes, could use cheaper alternative + gcd, s, t = self._extended_gcd(self.step, other.step) + + # check whether element sets intersect + if (self.start - other.start) % gcd: + return RangeIndex() + + # calculate parameters for the RangeIndex describing the intersection + # disregarding the lower bounds + tmp_start = self.start + (other.start-self.start)*self.step//gcd*s + new_step = self.step * other.step // gcd + new_index = RangeIndex(tmp_start, int_high, new_step, fastpath=True) + + # adjust index to limiting interval + new_index.start = new_index._min_fitting_element(int_low) + return new_index + + def _min_fitting_element(self, lower_limit): + """Returns the value of the smallest element greater than the limit""" + round = ceil if self.step > 0 else floor + no_steps = round( (float(lower_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def 
_max_fitting_element(self, upper_limit): + """Returns the value of the largest element smaller than the limit""" + round = floor if self.step > 0 else ceil + no_steps = round( (float(upper_limit)-self.start) / self.step ) + return self.start + self.step * no_steps + + def _extended_gcd(self, a, b): + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + union : Index + """ + # note: could return a RangeIndex in some circumstances + return Int64Index(self.values, copy=False).union(other) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + *this is an internal non-public method* + + Compute join_index and indexers to conform data + structures to the new index. 
+ + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + if how == 'outer' and self is not other: + # note: could return RangeIndex in more circumstances + return Int64Index(self.values, copy=False).join(other, how, level, return_indexers) + + return super(RangeIndex, self).join(other, how, level, return_indexers) + + def _mul(self, other): + "__mul__() implementation" + try: + int_input = other == int(other) + if int_input: + other = int(other) + except Exception: + int_input = False + + if int_input == True and other != 0: + return RangeIndex(self.start*other, self.stop*other, self.step*other, + fastpath=True) + else: + return super(RangeIndex, self).__mul__(other) + + def __len__(self): + """ + return the length of the RangeIndex + """ + return (self.stop-self.start) // self.step + + @property + def size(self): + return len(self) + + def __getitem__(self, key): + """ + Conserve RangeIndex type for scalar and slice keys. 
+ """ + super_getitem = super(RangeIndex, self).__getitem__ + + if np.isscalar(key): + n = int(key) + if n != key: + return super_getitem(key) + if n < 0: + n = len(self) + key + if n < 0 or n > len(self)-1: + raise IndexError('index %d is out of bounds for axis 0 with size %d' % (key, len(self))) + return self.start + n * self.step + + if isinstance(key, slice): + # complete missing slice information + n_start = 0 if key.start is None else key.start + n_stop = len(self)+1 if key.stop is None else key.stop + n_step = 1 if key.step is None else key.step + + # delegate non-integer slices + if (n_start != int(n_start) and + n_stop != int(n_stop) and + n_step != int(n_step)): + return super_getitem(key) + + # deal with index wrap-around + n_start = len(self)+n_start if n_start < 0 else n_start + n_stop = len(self)+n_stop if n_stop < 0 else n_stop + + + # convert indexes to values + start = self.start + self.step * n_start + stop = self.start + self.step * n_stop + 1 + step = self.step * n_step + + stop = min(stop, self.stop) + return RangeIndex(start, stop, step, self.name, fastpath=True) + + # fall back to Int64Index + return super_getitem(key) + +RangeIndex._add_numeric_methods() +RangeIndex.__mul__ = RangeIndex.__rmul__ = RangeIndex._mul +RangeIndex._add_logical_methods() + + class Float64Index(NumericIndex): """ diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 3c9dbd2e48cb6..bb0e6abf93d59 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -13,7 +13,7 @@ from numpy.testing import assert_array_equal from pandas import (period_range, date_range, Categorical, Series, - Index, Float64Index, Int64Index, MultiIndex, + Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, @@ -297,10 +297,6 @@ def test_constructor(self): # arr 
= np.array(5.) # self.assertRaises(Exception, arr.view, Index) - def test_constructor_corner(self): - # corner case - self.assertRaises(TypeError, Index, 0) - def test_constructor_from_series(self): expected = DatetimeIndex([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]) @@ -2447,6 +2443,567 @@ def test_slice_keep_name(self): idx = Int64Index([1, 2], name='asdf') self.assertEqual(idx.name, idx[1:].name) +class TestRangeIndex(Numeric, tm.TestCase): + _holder = RangeIndex + ### what does the following do? + #_multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(index = RangeIndex(0, 20, 2)) + self.setup_indices() + + def create_index(self): + return RangeIndex(5) + + def test_too_many_names(self): + def testit(): + self.index.names = ["roger", "harold"] + assertRaisesRegexp(ValueError, "^Length", testit) + + def test_constructor(self): + index = RangeIndex(5) + expected = np.arange(5, dtype=np.int64) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex(1, 5) + expected = np.arange(1, 5, dtype=np.int64) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex(1, 5, 2) + expected = np.arange(1, 5, 2, dtype=np.int64) + self.assert_numpy_array_equal(index, expected) + + index = RangeIndex() + expected = np.empty(0, dtype=np.int64) + self.assert_numpy_array_equal(index, expected) + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = RangeIndex(1, 5) + self.assertEqual(index.values.dtype, np.int64) + self.assertTrue(index.equals(arr)) + + # non-int raise Exception + self.assertRaises(TypeError, RangeIndex, '1', '10', '1') + self.assertRaises(TypeError, RangeIndex, 1.1, 10.2, 1.3) + + # iterable raise Exception + self.assertRaises(TypeError, RangeIndex, iter([-5, 0, 1, 2])) + + def test_copy(self): + i = RangeIndex(5, name='Foo') + i_copy = i.copy() + self.assertEqual(i_copy.start, 0) + self.assertEqual(i_copy.stop, 5) + self.assertEqual(i_copy.step, 1) + 
self.assertEqual(i_copy.name, 'Foo') + + def test_view(self): + super(TestRangeIndex, self).test_view() + + i = RangeIndex(name='Foo') + i_view = i.view() + self.assertEqual(i_view.name, 'Foo') + + i_view = i.view('i8') + tm.assert_index_equal(i, i_view) + + i_view = i.view(RangeIndex) + tm.assert_index_equal(i, i_view) + + def test_index_constructor(self): + arr = Index(5) + tm.assert_isinstance(arr, RangeIndex) + + def test_dtype(self): + self.assertEqual(self.index.dtype, np.int64) + + def test_is_monotonic(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_monotonic_increasing) + self.assertFalse(self.index.is_monotonic_decreasing) + + index = RangeIndex(4, 0, -1) + self.assertFalse(index.is_monotonic) + self.assertTrue(index.is_monotonic_decreasing) + + index = RangeIndex(1, 2) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + def test_equals(self): + same_values = Index(self.index, dtype=object) + self.assertTrue(self.index.equals(same_values)) + self.assertTrue(same_values.equals(self.index)) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_identical(self): + i = Index(self.index.copy()) + self.assertTrue(i.identical(self.index)) + + same_values_different_type = Index(i, dtype=object) + self.assertFalse(i.identical(same_values_different_type)) + + i = self.index.copy(dtype=object) + i = i.rename('foo') + same_values = Index(i, dtype=object) + self.assertTrue(same_values.identical(self.index.copy(dtype=object))) + + self.assertFalse(i.identical(self.index)) + self.assertTrue(Index(same_values, name='foo', dtype=object + ).identical(i)) + + self.assertFalse( + self.index.copy(dtype=object) + .identical(self.index.copy(dtype='int64'))) + + def test_get_indexer(self): + target = RangeIndex(10) + indexer = 
self.index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_pad(self): + target = RangeIndex(10) + indexer = self.index.get_indexer(target, method='pad') + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + self.assert_numpy_array_equal(indexer, expected) + + def test_get_indexer_backfill(self): + target = RangeIndex(10) + indexer = self.index.get_indexer(target, method='backfill') + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + self.assert_numpy_array_equal(indexer, expected) + + def test_join_outer(self): + ### join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assertTrue(res.equals(noidx_res)) + + eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) + elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, -1, -1, -1, -1, -1, -1, -1], + dtype=np.int64) + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + dtype=np.int64) + + tm.assert_isinstance(res, Int64Index) + self.assertFalse(isinstance(res, RangeIndex)) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### join with RangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assertTrue(res.equals(noidx_res)) + + tm.assert_isinstance(res, Int64Index) + self.assertFalse(isinstance(res, RangeIndex)) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_inner(self): + ### Join with non-RangeIndex + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, 
how='inner', + return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Int64Index([16, 18]) + elidx = np.array([8, 9]) + eridx = np.array([9, 7]) + + tm.assert_isinstance(res, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### Join two RangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + + def test_join_left(self): + ### Join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + eres = self.index + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], + dtype=np.int64) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + ### Join withRangeIndex + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + + tm.assert_isinstance(res, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assertIsNone(lidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_right(self): + ### Join with Int64Index + other = Int64Index(np.arange(25, 14, -1)) + + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], + dtype=np.int64) + + tm.assert_isinstance(other, Int64Index) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + ### Join withRangeIndex + other = RangeIndex(25, 14, -1) + + res, lidx, 
ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + + tm.assert_isinstance(other, RangeIndex) + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assertIsNone(ridx) + + def test_join_non_int_index(self): + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = self.index.join(other, how='outer') + outer2 = other.join(self.index, how='outer') + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, + 16, 18], dtype=object) + self.assertTrue(outer.equals(outer2)) + self.assertTrue(outer.equals(expected)) + + inner = self.index.join(other, how='inner') + inner2 = other.join(self.index, how='inner') + expected = Index([6, 8, 10], dtype=object) + self.assertTrue(inner.equals(inner2)) + self.assertTrue(inner.equals(expected)) + + left = self.index.join(other, how='left') + self.assertTrue(left.equals(self.index)) + + left2 = other.join(self.index, how='left') + self.assertTrue(left2.equals(other)) + + right = self.index.join(other, how='right') + self.assertTrue(right.equals(other)) + + right2 = other.join(self.index, how='right') + self.assertTrue(right2.equals(self.index)) + + def test_join_non_unique(self): + other = Index([4, 4, 3, 3]) + + res, lidx, ridx = self.index.join(other, return_indexers=True) + + eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) + elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int64) + eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.int64) + + self.assertTrue(res.equals(eres)) + self.assert_numpy_array_equal(lidx, elidx) + self.assert_numpy_array_equal(ridx, eridx) + + def test_join_self(self): + kinds = 'outer', 'inner', 'left', 'right' + for kind in kinds: + joined = self.index.join(self.index, how=kind) + self.assertIs(self.index, joined) + + def test_intersection(self): + ### intersect with Int64Index + other = Index(np.arange(1, 6)) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, 
other.values)) + self.assert_numpy_array_equal(result, expected) + + result = other.intersection(self.index) + expected = np.sort(np.asarray(np.intersect1d(self.index.values, + other.values))) + self.assert_numpy_array_equal(result, expected) + + ### intersect with increasing RangeIndex + other = Index(1, 6) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + ### intersect with decreasing RangeIndex + other = Index(5, 0, -1) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_numpy_array_equal(result, expected) + + def test_intersect_str_dates(self): + dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(['aa'], dtype=object) + res = i2.intersection(i1) + + self.assertEqual(len(res), 0) + + def test_union_noncomparable(self): + from datetime import datetime, timedelta + # corner case, non-Int64Index + now = datetime.now() + other = Index([now + timedelta(i) for i in range(4)], dtype=object) + result = self.index.union(other) + expected = np.concatenate((self.index, other)) + self.assert_numpy_array_equal(result, expected) + + result = other.union(self.index) + expected = np.concatenate((other, self.index)) + self.assert_numpy_array_equal(result, expected) + + def test_cant_or_shouldnt_cast(self): + # can't + self.assertRaises(TypeError, RangeIndex, 'foo', 'bar', 'baz') + + # shouldn't + self.assertRaises(TypeError, RangeIndex, '0', '1', '2') + + def test_view_Index(self): + self.index.view(Index) + + def test_prevent_casting(self): + result = self.index.astype('O') + self.assertEqual(result.dtype, np.object_) + + def test_take_preserve_name(self): + index = RangeIndex(1, 5, name='foo') + taken = index.take([3, 0, 1]) + self.assertEqual(index.name, taken.name) + + def test_int_name_format(self): + from pandas import Series, 
DataFrame + index = Index(3, name=0) + s = Series(lrange(3), index) + df = DataFrame(lrange(3), index=index) + repr(s) + repr(df) + + def test_print_unicode_columns(self): + df = pd.DataFrame( + {u("\u05d0"): [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) + repr(df.columns) # should not raise UnicodeDecodeError + + def test_repr_roundtrip(self): + tm.assert_index_equal(eval(repr(self.index)), self.index) + + def test_unicode_string_with_unicode(self): + idx = Index(1000) + + if compat.PY3: + str(idx) + else: + compat.text_type(idx) + + def test_bytestring_with_unicode(self): + idx = Index(1000) + if compat.PY3: + bytes(idx) + else: + str(idx) + + def test_slice_keep_name(self): + idx = RangeIndex(1, 2, name='asdf') + self.assertEqual(idx.name, idx[1:].name) + + def test_numeric_compat(self): + idx = RangeIndex(5) + didx = Index(np.arange(5,dtype='int64')**2) + + # note: special cases of the following could return RangeIndex + # see _mul() example + + result = idx * 1 + tm.assert_index_equal(result, idx) + + result = 1 * idx + tm.assert_index_equal(result, idx) + + result = idx * idx + tm.assert_index_equal(result, didx) + + result = idx / 1 + tm.assert_index_equal(result, idx) + + result = idx // 1 + tm.assert_index_equal(result, idx) + + result = idx * np.array(5,dtype='int64') + tm.assert_index_equal(result, Index(np.arange(5,dtype='int64')*5)) + + result = idx * np.arange(5,dtype='int64') + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='int64')) + tm.assert_index_equal(result, didx) + + result = idx * Series(np.arange(5,dtype='float64')+0.1) + tm.assert_index_equal(result, + Float64Index(np.arange(5,dtype='float64')*(np.arange(5,dtype='float64')+0.1))) + + # invalid + self.assertRaises(TypeError, lambda : idx * date_range('20130101',periods=5)) + self.assertRaises(ValueError, lambda : idx * self._holder(3)) + self.assertRaises(ValueError, lambda : idx * np.array([1,2])) + + def test_explicit_conversions(self): + + # GH 8608 + 
# add/sub are overriden explicity for Float/Int Index + idx = RangeIndex(5) + + # float conversions + arr = np.arange(5,dtype='int64')*3.2 + expected = Float64Index(arr) + fidx = idx * 3.2 + tm.assert_index_equal(fidx,expected) + fidx = 3.2 * idx + tm.assert_index_equal(fidx,expected) + + # interops with numpy arrays + expected = Float64Index(arr) + a = np.zeros(5,dtype='float64') + result = fidx - a + tm.assert_index_equal(result,expected) + + expected = Float64Index(-arr) + a = np.zeros(5,dtype='float64') + result = a - fidx + tm.assert_index_equal(result,expected) + + def test_duplicates(self): + # RangeIndex has no duplicates by definition + pass + + def test_ufunc_compat(self): + idx = RangeIndex(5) + result = np.sin(idx) + expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) + tm.assert_index_equal(result, expected) + + def test_extended_gcd(self): + result = self.index._extended_gcd(6, 10) + self.assertEqual(result[0], result[1]*6 + result[2]*10) + self.assertEqual(2, result[0]) + + result = self.index._extended_gcd(10, 6) + self.assertEqual(2, result[1]*10 + result[2]*6) + self.assertEqual(2, result[0]) + + def test_min_fitting_element(self): + result = RangeIndex(0, 20, 2)._min_fitting_element(1) + self.assertEqual(2, result) + + result = RangeIndex(1, 6)._min_fitting_element(1) + self.assertEqual(1, result) + + result = RangeIndex(18, -2, -2)._min_fitting_element(1) + self.assertEqual(2, result) + + result = RangeIndex(5, 0, -1)._min_fitting_element(1) + self.assertEqual(1, result) + + def test_max_fitting_element(self): + result = RangeIndex(0, 20, 2)._max_fitting_element(17) + self.assertEqual(16, result) + + result = RangeIndex(1, 6)._max_fitting_element(4) + self.assertEqual(4, result) + + result = RangeIndex(18, -2, -2)._max_fitting_element(17) + self.assertEqual(16, result) + + result = RangeIndex(5, 0, -1)._max_fitting_element(4) + self.assertEqual(4, result) + + def test_pickle_compat_construction(self): + # RangeIndex() is a valid 
constructor + pass + + def test_slice_specialised(self): + # scalar indexing + res = self.index[1] + expected = 2 + self.assertEqual(res, expected) + + res = self.index[-1] + expected = 18 + self.assertEqual(res, expected) + + ### slicing + # slice value completion + index = self.index[:] + expected = self.index + self.assert_numpy_array_equal(index, expected) + + # positive slice values + index = self.index[7:10:2] + expected = np.array([14, 18]) + self.assert_numpy_array_equal(index, expected) + + # negative slice values + index = self.index[-1:-5:-2] + expected = np.array([18, 14]) + self.assert_numpy_array_equal(index, expected) + + # stop overshoot + index = self.index[2:100:4] + expected = np.array([4, 12]) + self.assert_numpy_array_equal(index, expected) + + def test_len_specialised(self): + # TODO: How to test that len is specialised rather than calling + # the parent classes __len__() (which is slow)? + pass + + def test_size_specialised(self): + # TODO: How to test that size is specialised rather than calling + # the parent classes size property (which is slow)? + pass + class DatetimeLike(Base): def test_view(self): From 872775d8b2f0311321a7986bb97a164c284029a1 Mon Sep 17 00:00:00 2001 From: ARF Date: Fri, 24 Apr 2015 13:18:37 +0200 Subject: [PATCH 2/7] restore Index() fastpath precedence --- pandas/core/index.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index f5278ccc9cf6e..9ef5424ecc378 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -108,6 +108,10 @@ class Index(IndexOpsMixin, PandasObject): def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, tupleize_cols=True, **kwargs): + # no class inference! + if fastpath: + return cls._simple_new(data, name) + # RangeIndex pass-through # Index(start, stop, ...) --> RangeIndex(start, stop, ...) 
if isinstance(data, int): @@ -122,15 +126,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, range_constructor = True else: range_constructor = False - + if range_constructor: return RangeIndex(data, dtype, copy, name) - - # no class inference! - if fastpath: - return cls._simple_new(data, name) - from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, Index, ABCSeries)): if (isinstance(data, RangeIndex) and From 57e0865ec2f962a87b54725965fbdcb867d3afea Mon Sep 17 00:00:00 2001 From: ARF Date: Mon, 27 Apr 2015 10:26:25 +0200 Subject: [PATCH 3/7] Various fixes suggested by @jreback and @shoyer Cache a private Int64Index object the first time it or its values are required. Restore Index(5) as error. Restore its test. Allow Index(0, 5) and Index(0, 5, 1). Make RangeIndex immutable. See start, stop, step properties. In test_constructor(): check class, attributes (possibly including dtype). In test_copy(): check that copy is not identical (but equal) to the existing. In test_duplicates(): Assert is_unique and has_duplicates return correct values. --- pandas/core/index.py | 41 ++++++++++++-------------------------- pandas/tests/test_index.py | 39 ++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 9ef5424ecc378..38fdec4e1866b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -112,20 +112,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if fastpath: return cls._simple_new(data, name) - # RangeIndex pass-through - # Index(start, stop, ...) --> RangeIndex(start, stop, ...) 
- if isinstance(data, int): - if dtype is None and copy == False: + if isinstance(data, int) and isinstance(dtype, int): + if copy == False: copy = None range_constructor = True - elif isinstance(dtype, int): + elif isinstance(copy, int): range_constructor = True - if copy == False: - copy = None - elif isinstance(copy, int): - range_constructor = True - else: - range_constructor = False if range_constructor: return RangeIndex(data, dtype, copy, name) @@ -3373,6 +3365,7 @@ def _simple_new(cls, start, stop, step, name=None): result._stop = stop result._step = step result.name = name + result.is_unique = True return result @classmethod @@ -3385,10 +3378,14 @@ def _ensure_int(cls, value): raise TypeError("Need to pass integral values") return int_value - @property + @cache_readonly def _data(self): return np.arange(self.start, self.stop, self.step, dtype=np.int64) + @cache_readonly + def _int64index(self): + return Int64Index(self._data, name=self.name, fastpath=True) + @property def dtype(self): return np.dtype(np.int64) @@ -3397,26 +3394,14 @@ def dtype(self): def start(self): return self._start - @start.setter - def start(self, value): - self._start = self._ensure_int(value) - @property def stop(self): return self._stop - @stop.setter - def stop(self, value): - self._stop = self._ensure_int(value) - @property def step(self): return self._step - @step.setter - def step(self, value): - self._step = self._ensure_int(value) - @cache_readonly(allow_setting=True) def is_unique(self): """ return if the index has unique values """ @@ -3437,7 +3422,7 @@ def _shallow_copy(self, values=None, **kwargs): name=self.name, fastpath=True) else: name = kwargs.get('name', self.name) - return Int64Index(self.values, name=name, copy=False)._shallow_copy(values, **kwargs) + return self._int64index._shallow_copy(values, **kwargs) def copy(self, names=None, name=None, dtype=None, deep=False): """ @@ -3586,7 +3571,7 @@ def intersection(self, other): new_index = RangeIndex(tmp_start, 
int_high, new_step, fastpath=True) # adjust index to limiting interval - new_index.start = new_index._min_fitting_element(int_low) + new_index._start = new_index._min_fitting_element(int_low) return new_index def _min_fitting_element(self, lower_limit): @@ -3631,7 +3616,7 @@ def union(self, other): union : Index """ # note: could return a RangeIndex in some circumstances - return Int64Index(self.values, copy=False).union(other) + return self._int64index.union(other) def join(self, other, how='left', level=None, return_indexers=False): """ @@ -3653,7 +3638,7 @@ def join(self, other, how='left', level=None, return_indexers=False): """ if how == 'outer' and self is not other: # note: could return RangeIndex in more circumstances - return Int64Index(self.values, copy=False).join(other, how, level, return_indexers) + return self._int64index.join(other, how, level, return_indexers) return super(RangeIndex, self).join(other, how, level, return_indexers) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index bb0e6abf93d59..b4caca84ab3da 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -297,6 +297,10 @@ def test_constructor(self): # arr = np.array(5.) 
# self.assertRaises(Exception, arr.view, Index) + def test_constructor_corner(self): + # corner case + self.assertRaises(TypeError, Index, 0) + def test_constructor_from_series(self): expected = DatetimeIndex([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]) @@ -2463,20 +2467,37 @@ def testit(): def test_constructor(self): index = RangeIndex(5) expected = np.arange(5, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 0) + self.assertEqual(index.stop, 5) + self.assertEqual(index.step, 1) + self.assertEqual(index.name, None) self.assert_numpy_array_equal(index, expected) index = RangeIndex(1, 5) expected = np.arange(1, 5, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 1) self.assert_numpy_array_equal(index, expected) index = RangeIndex(1, 5, 2) expected = np.arange(1, 5, 2, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.step, 2) self.assert_numpy_array_equal(index, expected) index = RangeIndex() expected = np.empty(0, dtype=np.int64) + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.start, 0) + self.assertEqual(index.stop, 0) + self.assertEqual(index.step, 1) self.assert_numpy_array_equal(index, expected) + index = RangeIndex(name='Foo') + tm.assert_isinstance(index, RangeIndex) + self.assertEqual(index.name, 'Foo') + def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) index = RangeIndex(1, 5) @@ -2493,6 +2514,8 @@ def test_constructor_corner(self): def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() + self.assertTrue(i_copy is not i) + self.assertTrue(i_copy.identical(i)) self.assertEqual(i_copy.start, 0) self.assertEqual(i_copy.stop, 5) self.assertEqual(i_copy.step, 1) @@ -2512,7 +2535,7 @@ def test_view(self): tm.assert_index_equal(i, i_view) def test_index_constructor(self): - arr = Index(5) + arr = Index(0, 5) tm.assert_isinstance(arr, RangeIndex) def 
test_dtype(self): @@ -2816,7 +2839,7 @@ def test_take_preserve_name(self): def test_int_name_format(self): from pandas import Series, DataFrame - index = Index(3, name=0) + index = Index(0, 3, name=0) s = Series(lrange(3), index) df = DataFrame(lrange(3), index=index) repr(s) @@ -2831,7 +2854,7 @@ def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(self.index)), self.index) def test_unicode_string_with_unicode(self): - idx = Index(1000) + idx = Index(0, 1000) if compat.PY3: str(idx) @@ -2839,7 +2862,7 @@ def test_unicode_string_with_unicode(self): compat.text_type(idx) def test_bytestring_with_unicode(self): - idx = Index(1000) + idx = Index(0, 1000) if compat.PY3: bytes(idx) else: @@ -2915,8 +2938,12 @@ def test_explicit_conversions(self): tm.assert_index_equal(result,expected) def test_duplicates(self): - # RangeIndex has no duplicates by definition - pass + for ind in self.indices: + if not len(ind): + continue + idx = self.indices[ind] + self.assertTrue(idx.is_unique) + self.assertFalse(idx.has_duplicates) def test_ufunc_compat(self): idx = RangeIndex(5) From e8c31c51a5a433f0d017164d8e5510f06d4d9dc0 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 2 May 2015 00:03:58 +0200 Subject: [PATCH 4/7] fix slicing --- pandas/core/index.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 38fdec4e1866b..ffc9235e2120a 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3360,6 +3360,9 @@ def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): @classmethod def _simple_new(cls, start, stop, step, name=None): + # canonise empty RangeIndex objects + if (stop-start)//step <= 0: + start, stop, step = 0, 0, 1 result = object.__new__(cls) result._start = start result._stop = stop @@ -3684,28 +3687,13 @@ def __getitem__(self, key): return self.start + n * self.step if isinstance(key, slice): - # complete missing slice information - n_start = 0 if 
key.start is None else key.start - n_stop = len(self)+1 if key.stop is None else key.stop - n_step = 1 if key.step is None else key.step - - # delegate non-integer slices - if (n_start != int(n_start) and - n_stop != int(n_stop) and - n_step != int(n_step)): - return super_getitem(key) - - # deal with index wrap-around - n_start = len(self)+n_start if n_start < 0 else n_start - n_stop = len(self)+n_stop if n_stop < 0 else n_stop - + start, stop, step = key.indices(len(self)) # convert indexes to values - start = self.start + self.step * n_start - stop = self.start + self.step * n_stop + 1 - step = self.step * n_step + start = self.start + self.step * start + stop = self.start + self.step * stop + step = self.step * step - stop = min(stop, self.stop) return RangeIndex(start, stop, step, self.name, fastpath=True) # fall back to Int64Index From b4d1085c31af12c0d56720ae240328cee31e0945 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 2 May 2015 10:38:26 +0200 Subject: [PATCH 5/7] fix view --- pandas/core/index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index ffc9235e2120a..52f59fab57248 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3525,10 +3525,10 @@ def __reduce__(self): return _new_Index, (self.__class__, d), None def view(self, cls=None): - if cls is None or is_int64_dtype(cls): - return self - else: + if cls is None or hasattr(cls,'_typ') or is_int64_dtype(cls): result = self._shallow_copy() + else: + result = self._data.view(cls) if isinstance(result, Index): result._id = self._id return result From ac18a31cacc2055800d9fd61272d2d49a20e4f55 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 2 May 2015 00:05:43 +0200 Subject: [PATCH 6/7] Set RangeIndex as default index * enh: set RangeIndex as default index * fix: pandas.io.packers: encode() and decode() for RangeIndex * enh: array argument pass-through * fix: reindex * fix: use _default_index() in 
pandas.core.frame.extract_index() * fix: pandas.core.index.Index._is() * fix: add RangeIndex to ABCIndexClass * fix: use _default_index() in _get_names_from_index() * fix: pytables tests * fix: MultiIndex.get_level_values() * fix: RangeIndex._shallow_copy() * fix: null-size RangeIndex equals() comparison * enh: make RangeIndex.is_unique immutable --- pandas/core/common.py | 7 +-- pandas/core/frame.py | 6 +- pandas/core/index.py | 102 +++++++++++++++++++------------ pandas/io/packers.py | 13 +++- pandas/io/tests/test_pytables.py | 16 +++-- pandas/tests/test_index.py | 3 - 6 files changed, 90 insertions(+), 57 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 878b1af078d4d..b888b8721a24d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -86,6 +86,7 @@ def _check(cls, inst): ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", + "rangeindex", "float64index", "multiindex", "datetimeindex", @@ -2142,10 +2143,8 @@ def is_bool_indexer(key): def _default_index(n): - from pandas.core.index import Int64Index - values = np.arange(n, dtype=np.int64) - result = Int64Index(values,name=None) - result.is_unique = True + from pandas.core.index import RangeIndex + result = RangeIndex(0, int(n), name=None) return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 272c401c18761..aa1bcd7dc182a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4826,7 +4826,7 @@ def extract_index(data): % (lengths[0], len(index))) raise ValueError(msg) else: - index = Index(np.arange(lengths[0])) + index = _default_index(lengths[0]) return _ensure_index(index) @@ -5043,11 +5043,11 @@ def convert(arr): def _get_names_from_index(data): - index = lrange(len(data)) has_some_name = any([getattr(s, 'name', None) is not None for s in data]) if not has_some_name: - return index + return 
_default_index(len(data)) + index = lrange(len(data)) count = 0 for i, s in enumerate(data): n = getattr(s, 'name', None) diff --git a/pandas/core/index.py b/pandas/core/index.py index 52f59fab57248..5fa81fd9c2624 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -245,7 +245,7 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr(other, '_id', Ellipsis) + return self._id is getattr(other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): """Initializes or resets ``_id`` attribute with new object""" @@ -1750,7 +1750,9 @@ def reindex(self, target, method=None, level=None, limit=None): # GH7774: preserve dtype/tz if target is empty and not an Index. target = _ensure_has_len(target) # target may be an iterator - if not isinstance(target, Index) and len(target) == 0: + if isinstance(self, RangeIndex) and len(target) == 0: + target = self._simple_new(0, 0, 1, name=self.name) + elif not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop('freq', None) # don't preserve freq target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) @@ -3334,48 +3336,69 @@ class RangeIndex(Int64Index): _engine_type = _index.Int64Engine _attributes = ['name', 'start', 'stop', 'step'] - def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False): + def __new__(cls, start=None, stop=None, step=None, name=None, fastpath=False, **kwargs): if fastpath: return cls._simple_new(start, stop, step, name=name) + # cheap check for array input + if len(kwargs) > 0: + return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) + # RangeIndex() constructor if start is None and stop is None and step is None: return cls._simple_new(0, 0, 1, name=name) + new_start, new_stop, new_step = None, None, None # sort the arguments depending on which are provided if step is None: - step = 1 + 
new_step = 1 if stop is None: - stop = start - start = 0 - - # check validity of inputs - start = cls._ensure_int(start) - stop = cls._ensure_int(stop) - step = cls._ensure_int(step) - if step == 0: - raise ValueError("Step must not be zero") + new_stop = start + new_start = 0 - return cls._simple_new(start, stop, step, name) + try: + # check validity of inputs + new_start = start if new_start is None else new_start + new_stop = stop if new_stop is None else new_stop + new_step = step if new_step is None else new_step + new_start = cls._ensure_int(new_start) + new_stop = cls._ensure_int(new_stop) + new_step = cls._ensure_int(new_step) + if new_step == 0: + raise ValueError("Step must not be zero") + return cls._simple_new(new_start, new_stop, new_step, name) + except TypeError: + # pass all invalid inputs to Int64Index to handle + return cls._data_passthrough(start, stop, step, name, fastpath, **kwargs) @classmethod def _simple_new(cls, start, stop, step, name=None): # canonise empty RangeIndex objects - if (stop-start)//step <= 0: - start, stop, step = 0, 0, 1 + #if (stop-start)//step <= 0: + # start, stop, step = 0, 0, 1 result = object.__new__(cls) result._start = start result._stop = stop result._step = step result.name = name - result.is_unique = True return result + @classmethod + def _data_passthrough(cls, data, dtype, copy, name, fastpath, **kwargs): + kwargs.setdefault('data', data) + kwargs.setdefault('dtype', dtype) + if copy is not None: + kwargs.setdefault('copy', copy) + kwargs.setdefault('name', name) + kwargs.setdefault('fastpath', fastpath) + return Int64Index(**kwargs) + @classmethod def _ensure_int(cls, value): try: int_value = int(value) - if int_value != value: + # don't allow casting 1-element arrays to int! 
+ if int_value != value or hasattr(value, '__len__'): raise Exception except Exception: raise TypeError("Need to pass integral values") @@ -3405,14 +3428,14 @@ def stop(self): def step(self): return self._step - @cache_readonly(allow_setting=True) + @property def is_unique(self): """ return if the index has unique values """ return True @property def has_duplicates(self): - return not self.is_unique + return False def tolist(self): return list(range(self.start, self.stop, self.step)) @@ -3424,7 +3447,7 @@ def _shallow_copy(self, values=None, **kwargs): return RangeIndex(self.start, self.stop, self.step, name=self.name, fastpath=True) else: - name = kwargs.get('name', self.name) + kwargs.setdefault('name', self.name) return self._int64index._shallow_copy(values, **kwargs) def copy(self, names=None, name=None, dtype=None, deep=False): @@ -3453,6 +3476,7 @@ def copy(self, names=None, name=None, dtype=None, deep=False): name = self.name return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) + # TODO: return arange instead of sorting def argsort(self, *args, **kwargs): """ return an ndarray indexer of the underlying data @@ -3505,20 +3529,14 @@ def equals(self, other): """ Determines if two Index objects contain the same elements. """ - if self.is_(other): - return True + if isinstance(other, RangeIndex): + return (len(self) == len(other) == 0 + or (self.start == other.start and + self.stop == other.stop and + self.step == other.step) + ) - elif isinstance(other, RangeIndex): - return (self.start == other.start and - self.stop == other.stop and - self.step == other.step) - - try: - return array_equivalent(_values_from_object(self), - _values_from_object(other)) - except TypeError: - # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 - return False + return super(RangeIndex, self).equals(other) def __reduce__(self): d = self._get_attributes_dict() @@ -3559,8 +3577,8 @@ def intersection(self, other): return RangeIndex() ### Method hint: linear Diophantine equation - # solve intersection - # perf: for identical step sizes, could use cheaper alternative + # solve intersection problem + # performance hint: for identical step sizes, could use cheaper alternative gcd, s, t = self._extended_gcd(self.step, other.step) # check whether element sets intersect @@ -3664,7 +3682,7 @@ def __len__(self): """ return the length of the RangeIndex """ - return (self.stop-self.start) // self.step + return max(0, (self.stop-self.start) // self.step) @property def size(self): @@ -4503,9 +4521,13 @@ def get_level_values(self, level): unique = self.levels[num] # .values labels = self.labels[num] filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) - values = unique._simple_new(filled, self.names[num], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + if isinstance(unique, RangeIndex): + _simple_new = Int64Index._simple_new + else: + _simple_new = unique._simple_new + values = _simple_new(filled, self.names[num], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 75ca44fd1ef3e..33946a29a9dee 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,7 +49,7 @@ from pandas.compat import u, PY3 from pandas import ( Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, PeriodIndex, DatetimeIndex, Float64Index, + Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, DatetimeIndex, Float64Index, NaT ) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -257,7 +257,14 @@ def encode(obj): tobj = type(obj) if 
isinstance(obj, Index): - if isinstance(obj, PeriodIndex): + if isinstance(obj, RangeIndex): + return {'typ': 'range_index', + 'klass': obj.__class__.__name__, + 'name': getattr(obj, 'name', None), + 'start': getattr(obj, 'start', None), + 'stop': getattr(obj, 'stop', None), + 'step': getattr(obj, 'step', None)} + elif isinstance(obj, PeriodIndex): return {'typ': 'period_index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), @@ -447,6 +454,8 @@ def decode(obj): data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) return globals()[obj['klass']](data, dtype=dtype, name=obj['name']) + elif typ == 'range_index': + return globals()[obj['klass']](obj['start'], obj['stop'], obj['step'], name=obj['name']) elif typ == 'multi_index': data = unconvert(obj['data'], np.typeDict[obj['dtype']], obj.get('compress')) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 03e7a8eae549d..236254856c09e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -10,7 +10,7 @@ import pandas import pandas as pd -from pandas import (Series, DataFrame, Panel, MultiIndex, Categorical, bdate_range, +from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, TimedeltaIndex, isnull) from pandas.io.pytables import _tables @@ -1541,14 +1541,17 @@ def test_column_multiindex(self): index = MultiIndex.from_tuples([('A','a'), ('A','b'), ('B','a'), ('B','b')], names=['first','second']) df = DataFrame(np.arange(12).reshape(3,4), columns=index) + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: store.put('df',df) - tm.assert_frame_equal(store['df'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df'],expected,check_index_type=True,check_column_type=True) 
store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) self.assertRaises(ValueError, store.put, 'df2',df,format='table',data_columns=['A']) self.assertRaises(ValueError, store.put, 'df3',df,format='table',data_columns=True) @@ -1562,11 +1565,14 @@ def test_column_multiindex(self): # non_index_axes name df = DataFrame(np.arange(12).reshape(3,4), columns=Index(list('ABCD'),name='foo')) - + expected = df.copy() + if isinstance(expected.index, RangeIndex): + expected.index = Int64Index(expected.index) + with ensure_clean_store(self.path) as store: store.put('df1',df,format='table') - tm.assert_frame_equal(store['df1'],df,check_index_type=True,check_column_type=True) + tm.assert_frame_equal(store['df1'],expected,check_index_type=True,check_column_type=True) def test_store_multiindex(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index b4caca84ab3da..a3ba08bae7d8c 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2508,9 +2508,6 @@ def test_constructor_corner(self): self.assertRaises(TypeError, RangeIndex, '1', '10', '1') self.assertRaises(TypeError, RangeIndex, 1.1, 10.2, 1.3) - # iterable raise Exception - self.assertRaises(TypeError, RangeIndex, iter([-5, 0, 1, 2])) - def test_copy(self): i = RangeIndex(5, name='Foo') i_copy = i.copy() From c9ac12f6043dfc1aba8f871086b6c69b26940ba4 Mon Sep 17 00:00:00 2001 From: ARF Date: Sat, 2 May 2015 15:33:13 +0200 Subject: [PATCH 7/7] enh: various performance optimizations * optimize argsort() * optimize tolist() * comment clean-up --- pandas/core/index.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 5fa81fd9c2624..387bd2c0c98e0 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3373,9 +3373,6 @@ def __new__(cls, start=None, 
stop=None, step=None, name=None, fastpath=False, ** @classmethod def _simple_new(cls, start, stop, step, name=None): - # canonise empty RangeIndex objects - #if (stop-start)//step <= 0: - # start, stop, step = 0, 0, 1 result = object.__new__(cls) result._start = start result._stop = stop @@ -3438,7 +3435,7 @@ def has_duplicates(self): return False def tolist(self): - return list(range(self.start, self.stop, self.step)) + return lrange(self.start, self.stop, self.step) def _shallow_copy(self, values=None, **kwargs): """ create a new Index, don't copy the data, use the same object attributes @@ -3476,7 +3473,6 @@ def copy(self, names=None, name=None, dtype=None, deep=False): name = self.name return RangeIndex(self.start, self.stop, self.step, name, fastpath=True) - # TODO: return arange instead of sorting def argsort(self, *args, **kwargs): """ return an ndarray indexer of the underlying data @@ -3485,7 +3481,10 @@ def argsort(self, *args, **kwargs): -------- numpy.ndarray.argsort """ - return self._data.argsort(*args, **kwargs) + if self.step > 0: + return np.arange(len(self)) + else: + return np.arange(len(self)-1, -1, -1) def __repr__(self): attrs = [('start', default_pprint(self.start)),