Skip to content

PERF: don't call RangeIndex._data unnecessarily #26565

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ def time_min(self):
def time_min_trivial(self):
    """Benchmark body: call ``min()`` on the ``idx_inc`` index."""
    self.idx_inc.min()

def time_get_loc_inc(self):
    """Benchmark body: look up the label 900000 in the ``idx_inc`` index."""
    self.idx_inc.get_loc(900000)

def time_get_loc_dec(self):
    """Benchmark body: look up the label 100000 in the ``idx_dec`` index."""
    self.idx_dec.get_loc(100000)


class IndexAppend:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ Performance Improvements
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
- Improved performance when slicing :class:`RangeIndex` (:issue:`26565`)
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
- Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
Expand Down
32 changes: 30 additions & 2 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.indexes.numeric import Int64Index

from pandas.io.formats.printing import pprint_thing


class RangeIndex(Int64Index):
"""
Expand Down Expand Up @@ -64,6 +66,8 @@ class RangeIndex(Int64Index):
_typ = 'rangeindex'
_engine_type = libindex.Int64Engine

# check whether self._data has been accessed
_cached_data = None # type: np.ndarray
# --------------------------------------------------------------------
# Constructors

Expand Down Expand Up @@ -164,6 +168,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None,
for k, v in kwargs.items():
setattr(result, k, v)

result._range = range(result._start, result._stop, result._step)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could actually remove the _start, _stop, _step properties as well?

Copy link
Contributor Author

@topper-123 topper-123 May 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'm planning to do that in an upcoming PR.

Python 3's range supports slicing, which Python 2's xrange didn't, so this refactoring will also allow us to drop the custom slicing operations in RangeIndex.


result._reset_identity()
return result

Expand All @@ -180,9 +186,19 @@ def _constructor(self):
""" return the class to use for construction """
return Int64Index

@cache_readonly
@property
def _data(self):
return np.arange(self._start, self._stop, self._step, dtype=np.int64)
"""
An int array that for performance reasons is created only when needed.

The constructed array is saved in ``_cached_data``. This allows us to
check if the array has been created without accessing ``_data`` and
triggering the construction.
"""
if self._cached_data is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you give this a doc-string (e.g. that cached_data is actually an int array and is constructed only if necessary, for performance reasons)

self._cached_data = np.arange(self._start, self._stop, self._step,
dtype=np.int64)
return self._cached_data

@cache_readonly
def _int64index(self):
Expand Down Expand Up @@ -215,6 +231,9 @@ def _format_data(self, name=None):
# we are formatting thru the attributes
return None

def _format_with_header(self, header, na_rep='NaN', **kwargs):
    """
    Return ``header`` followed by the formatted values of the index.

    Each value of ``self._range`` is rendered with ``pprint_thing``; the
    values are taken from the range object, so ``self._data`` is not
    accessed here. ``na_rep`` and ``kwargs`` are accepted but unused in
    this implementation.
    """
    return header + list(map(pprint_thing, self._range))

# --------------------------------------------------------------------
@property
def start(self):
Expand Down Expand Up @@ -296,6 +315,15 @@ def is_monotonic_decreasing(self):
def has_duplicates(self):
return False

@Appender(_index_shared_docs['get_loc'])
def get_loc(self, key, method=None, tolerance=None):
    # Fast path: a plain integer key with no fill method and no tolerance
    # is looked up directly in ``self._range``, without touching
    # ``self._data``.
    if is_integer(key) and method is None and tolerance is None:
        try:
            return self._range.index(key)
        except ValueError as err:
            # ``range.index`` signals a missing value with ValueError;
            # re-raise as KeyError and keep the original error as the
            # explicit cause.
            raise KeyError(key) from err
    # Anything else (non-integer key, or method/tolerance given) is
    # delegated to the parent implementation.
    return super().get_loc(key, method=method, tolerance=tolerance)

def tolist(self):
    """
    Return the index values as a plain list.

    The list is built from a ``range`` over ``_start``, ``_stop`` and
    ``_step``.
    """
    return list(range(self._start, self._stop, self._step))

Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/indexes/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,42 @@ def test_view(self):
def test_dtype(self):
# The index under test must report a 64-bit integer dtype.
assert self.index.dtype == np.int64

def test_cached_data(self):
# GH 26565
# Calling RangeIndex._data caches an int64 array of the same length at
# self._cached_data. This tests whether _cached_data has been set.
# The order of the statements matters: every assertion checks that the
# operation just before it did not populate the cache.
idx = RangeIndex(0, 100, 10)

# a freshly constructed index has no cached array
assert idx._cached_data is None

# building the repr must not populate the cache
repr(idx)
assert idx._cached_data is None

# neither must str()
str(idx)
assert idx._cached_data is None

# integer label lookup directly on the index
idx.get_loc(20)
assert idx._cached_data is None

# use the same index object as the row index of a DataFrame
df = pd.DataFrame({'a': range(10)}, index=idx)

# label lookup of an existing label through .loc
df.loc[50]
assert idx._cached_data is None

# label lookup of a missing label raises KeyError and still leaves the
# cache unset
with pytest.raises(KeyError):
df.loc[51]
assert idx._cached_data is None

# label-based slicing
df.loc[10:50]
assert idx._cached_data is None

# position-based slicing
df.iloc[5:10]
assert idx._cached_data is None

# actually accessing idx._data builds the array and stores it in
# idx._cached_data
assert isinstance(idx._data, np.ndarray)
assert isinstance(idx._cached_data, np.ndarray)

def test_is_monotonic(self):
assert self.index.is_monotonic is True
assert self.index.is_monotonic_increasing is True
Expand Down