Skip to content

CLN/PERF: move RangeIndex._cached_data to RangeIndex._cache #35432

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 4 additions & 12 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datetime import timedelta
import operator
from sys import getsizeof
from typing import Any, Optional
from typing import Any
import warnings

import numpy as np
Expand Down Expand Up @@ -78,8 +78,6 @@ class RangeIndex(Int64Index):
_engine_type = libindex.Int64Engine
_range: range

# check whether self._data has been called
_cached_data: Optional[np.ndarray] = None
# --------------------------------------------------------------------
# Constructors

Expand Down Expand Up @@ -150,20 +148,14 @@ def _constructor(self):
""" return the class to use for construction """
return Int64Index

@property
@cache_readonly
def _data(self):
"""
An int array that for performance reasons is created only when needed.
The constructed array is saved in ``_cached_data``. This allows us to
check if the array has been created without accessing ``_data`` and
triggering the construction.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we ever do this check outside of tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, and the checks in the tests are to ensure this array isn't created unless it's needed.

The constructed array is saved in ``_cache``.
"""
if self._cached_data is None:
self._cached_data = np.arange(
self.start, self.stop, self.step, dtype=np.int64
)
return self._cached_data
return np.arange(self.start, self.stop, self.step, dtype=np.int64)

@cache_readonly
def _int64index(self) -> Int64Index:
Expand Down
45 changes: 25 additions & 20 deletions pandas/tests/indexes/ranges/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,53 +137,58 @@ def test_dtype(self):
index = self.create_index()
assert index.dtype == np.int64

def test_cached_data(self):
# GH 26565, GH26617
# Calling RangeIndex._data caches an int64 array of the same length at
# self._cached_data. This test checks whether _cached_data has been set
def test_cache(self):
# GH 26565, GH26617, GH35432
# This test checks that _cache stays empty until _data is actually accessed.
# Accessing RangeIndex._data creates an int64 array of the same length
# as the RangeIndex and stores it in _cache["_data"].
idx = RangeIndex(0, 100, 10)

assert idx._cached_data is None
assert idx._cache == {}

repr(idx)
assert idx._cached_data is None
assert idx._cache == {}

str(idx)
assert idx._cached_data is None
assert idx._cache == {}

idx.get_loc(20)
assert idx._cached_data is None
assert idx._cache == {}

90 in idx
assert idx._cached_data is None
90 in idx # True
assert idx._cache == {}

91 in idx
assert idx._cached_data is None
91 in idx # False
assert idx._cache == {}

idx.all()
assert idx._cached_data is None
assert idx._cache == {}

idx.any()
assert idx._cached_data is None
assert idx._cache == {}

df = pd.DataFrame({"a": range(10)}, index=idx)

df.loc[50]
assert idx._cached_data is None
assert idx._cache == {}

with pytest.raises(KeyError, match="51"):
df.loc[51]
assert idx._cached_data is None
assert idx._cache == {}

df.loc[10:50]
assert idx._cached_data is None
assert idx._cache == {}

df.iloc[5:10]
assert idx._cached_data is None
assert idx._cache == {}

# actually calling idx._data
# idx._cache should contain a _data entry after a call to idx._data
idx._data
assert isinstance(idx._data, np.ndarray)
assert isinstance(idx._cached_data, np.ndarray)
assert idx._data is idx._data # check cached value is reused
assert len(idx._cache) == 4
expected = np.arange(0, 100, 10, dtype="int64")
tm.assert_numpy_array_equal(idx._cache["_data"], expected)

def test_is_monotonic(self):
index = RangeIndex(0, 20, 2)
Expand Down