Skip to content

PERF: RangeIndex.__getitem__ with integers return RangeIndex #57770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 12, 2024
8 changes: 4 additions & 4 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,11 @@ Performance improvements
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4241,7 +4241,6 @@ def join(

return self._join_via_get_indexer(other, how, sort)

@final
def _join_empty(
self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
Expand Down
53 changes: 39 additions & 14 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default):

if values.dtype.kind == "f":
return Index(values, name=name, dtype=np.float64)
if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1:
if values.dtype.kind == "i" and values.ndim == 1:
# GH 46675 & 43885: If values is equally spaced, return a
# more memory-compact RangeIndex instead of Index with 64-bit dtype
if len(values) == 0:
return type(self)._simple_new(_empty_range, name=name)
elif len(values) == 1:
start = values[0]
new_range = range(start, start + self.step, self.step)
return type(self)._simple_new(new_range, name=name)
diff = values[1] - values[0]
if not missing.isna(diff) and diff != 0:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer))
and not remainder.any()
):
if len(values) == 2:
# Can skip is_range_indexer check
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
else:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(
maybe_range_indexer, len(maybe_range_indexer)
)
and not remainder.any()
):
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
return self._constructor._simple_new(values, name=name)

def _view(self) -> Self:
Expand Down Expand Up @@ -897,12 +910,19 @@ def symmetric_difference(
result = result.rename(result_name)
return result

def _join_empty(
    self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
    """
    Delegate to the parent ``_join_empty`` with a possibly converted ``other``.

    An integer-dtype ``other`` is first passed through ``_shallow_copy``
    (keeping its name); any other dtype is forwarded untouched.
    """
    has_integer_dtype = other.dtype.kind == "i"
    candidate = (
        self._shallow_copy(other._values, name=other.name)
        if has_integer_dtype
        else other
    )
    return super()._join_empty(candidate, how=how, sort=sort)

def _join_monotonic(
self, other: Index, how: JoinHow = "left"
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# This currently only gets called for the monotonic increasing case
if not isinstance(other, type(self)):
maybe_ri = self._shallow_copy(other._values)
maybe_ri = self._shallow_copy(other._values, name=other.name)
if not isinstance(maybe_ri, type(self)):
return super()._join_monotonic(other, how=how)
other = maybe_ri
Expand Down Expand Up @@ -1078,6 +1098,8 @@ def __getitem__(self, key):
"""
Conserve RangeIndex type for scalar and slice keys.
"""
if key is Ellipsis:
key = slice(None)
if isinstance(key, slice):
return self._getitem_slice(key)
elif is_integer(key):
Expand All @@ -1097,17 +1119,20 @@ def __getitem__(self, key):
)
elif com.is_bool_indexer(key):
if isinstance(getattr(key, "dtype", None), ExtensionDtype):
np_key = key.to_numpy(dtype=bool, na_value=False)
key = key.to_numpy(dtype=bool, na_value=False)
else:
np_key = np.asarray(key, dtype=bool)
check_array_indexer(self._range, np_key) # type: ignore[arg-type]
key = np.asarray(key, dtype=bool)
check_array_indexer(self._range, key) # type: ignore[arg-type]
# Short circuit potential _shallow_copy check
if np_key.all():
if key.all():
return self._simple_new(self._range, name=self.name)
elif not np_key.any():
elif not key.any():
return self._simple_new(_empty_range, name=self.name)
return self.take(np.flatnonzero(np_key))
return super().__getitem__(key)
key = np.flatnonzero(key)
try:
return self.take(key)
except (TypeError, ValueError):
return super().__getitem__(key)

def _getitem_slice(self, slobj: slice) -> Self:
"""
Expand Down
8 changes: 7 additions & 1 deletion pandas/tests/indexes/ranges/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,15 @@ def test_join_self(self, join_type):
[-1, -1, 0, 1],
"outer",
],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"],
],
)
@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))])
@pytest.mark.parametrize(
"right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)]
)
def test_join_preserves_rangeindex(
left, right, expected, expected_lidx, expected_ridx, how, right_type
):
Expand Down
57 changes: 57 additions & 0 deletions pandas/tests/indexes/ranges/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self):
tm.assert_index_equal(result, expected)


def test_reindex_1_value_returns_rangeindex():
ri = RangeIndex(0, 10, 2, name="foo")
result, result_indexer = ri.reindex([2])
expected = RangeIndex(2, 4, 2, name="foo")
tm.assert_index_equal(result, expected, exact=True)

expected_indexer = np.array([1], dtype=np.intp)
tm.assert_numpy_array_equal(result_indexer, expected_indexer)


def test_reindex_empty_returns_rangeindex():
    """Reindexing onto an empty target should yield an empty RangeIndex."""
    source = RangeIndex(0, 10, 2, name="foo")
    new_index, indexer = source.reindex([])

    tm.assert_index_equal(
        new_index, RangeIndex(0, 0, 2, name="foo"), exact=True
    )
    # No labels requested, so the indexer is an empty intp array.
    tm.assert_numpy_array_equal(indexer, np.array([], dtype=np.intp))


def test_append_non_rangeindex_return_rangeindex():
ri = RangeIndex(1)
result = ri.append(Index([1]))
Expand Down Expand Up @@ -653,6 +673,21 @@ def test_take_return_rangeindex():
tm.assert_index_equal(result, expected, exact=True)


@pytest.mark.parametrize(
    "rng, exp_rng",
    [
        (range(5), range(3, 4)),
        (range(0, -10, -2), range(-6, -8, -2)),
        (range(0, 10, 2), range(6, 8, 2)),
    ],
)
def test_take_1_value_returns_rangeindex(rng, exp_rng):
    """
    Taking a single position should yield a one-element RangeIndex.

    Each expected range starts at the value found at position 3 of the
    source range and uses the same step as the source.
    """
    source = RangeIndex(rng, name="foo")
    taken = source.take([3])
    tm.assert_index_equal(taken, RangeIndex(exp_rng, name="foo"), exact=True)


def test_append_one_nonempty_preserve_step():
expected = RangeIndex(0, -1, -1)
result = RangeIndex(0).append([expected])
Expand Down Expand Up @@ -695,3 +730,25 @@ def test_getitem_boolmask_wrong_length():
ri = RangeIndex(4, name="foo")
with pytest.raises(IndexError, match="Boolean index has wrong length"):
ri[[True]]


def test_getitem_integers_return_rangeindex():
result = RangeIndex(0, 10, 2, name="foo")[[0, -1]]
expected = RangeIndex(start=0, stop=16, step=8, name="foo")
tm.assert_index_equal(result, expected, exact=True)

result = RangeIndex(0, 10, 2, name="foo")[[3]]
expected = RangeIndex(start=6, stop=8, step=2, name="foo")
tm.assert_index_equal(result, expected, exact=True)


def test_getitem_empty_return_rangeindex():
result = RangeIndex(0, 10, 2, name="foo")[[]]
expected = RangeIndex(start=0, stop=0, step=1, name="foo")
tm.assert_index_equal(result, expected, exact=True)


def test_getitem_integers_return_index():
    """
    Unevenly spaced integer positions should fall back to a plain Index.

    Positions ``[0, 1, -1]`` of ``range(0, 10, 2)`` select ``[0, 2, 8]``,
    which is not equally spaced and so cannot be represented as a range.
    """
    result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]]
    expected = Index([0, 2, 8], dtype="int64", name="foo")
    # exact=True pins the concrete class: the default "equiv" mode would let
    # a RangeIndex stand in for an int64 Index, which is exactly the
    # distinction this test exists to check.
    tm.assert_index_equal(result, expected, exact=True)
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def test_loc_getitem_list_with_fail(self):

s.loc[[2]]

msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]"
msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]"
with pytest.raises(KeyError, match=re.escape(msg)):
s.loc[[3]]

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/io/pytables/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,8 @@ def test_append_to_multiple_min_itemsize(setup_path):
}
)
expected = df.iloc[[0]]
# Reading/writing RangeIndex info is not supported yet
expected.index = Index(list(range(len(expected.index))))

with ensure_clean_store(setup_path) as store:
store.append_to_multiple(
Expand Down