From aa5589ce90f65a68c73a9562738b95b629ce0e47 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:46:12 -0800 Subject: [PATCH 01/10] PERF: RangeIndex.take with 1 value return RangeIndex --- pandas/core/indexes/range.py | 8 +++++++- pandas/tests/indexes/ranges/test_range.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 09d635b53c482..76068c8ee901e 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1260,4 +1260,10 @@ def take( # type: ignore[override] if self.start != 0: taken += self.start - return self._shallow_copy(taken, name=self.name) + if len(taken) == 1: + start = taken[0] + return self._simple_new( + range(start, start + self.step, self.step), name=self.name + ) + else: + return self._shallow_copy(taken, name=self.name) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 898548d1cc4dc..2b81d4794627d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -639,6 +639,21 @@ def test_take_return_rangeindex(): tm.assert_index_equal(result, expected, exact=True) +@pytest.mark.parametrize( + "rng, exp_rng", + [ + [range(5), range(3, 4)], + [range(0, -10, -2), range(-6, -8, -2)], + [range(0, 10, 2), range(6, 8, 2)], + ], +) +def test_take_1_value_returns_rangeindex(rng, exp_rng): + ri = RangeIndex(rng, name="foo") + result = ri.take([3]) + expected = RangeIndex(exp_rng, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) From 47af1ce9abaa3e112a995d10ecb76ae48ef4bbf5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:47:17 -0800 Subject: [PATCH 02/10] add issue number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7802ef4798659..8941af2a5b463 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -258,7 +258,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) -- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) +- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) From e7e0fcba0cf6fb2f94ceb5d9b8ae8ec4b5131f39 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:08:26 -0800 Subject: [PATCH 03/10] Move to _shallow_copy, support empty join as well --- doc/source/whatsnew/v3.0.0.rst | 4 +-- pandas/core/indexes/base.py | 1 - pandas/core/indexes/range.py | 44 +++++++++++++++-------- pandas/tests/indexes/ranges/test_join.py | 8 ++++- pandas/tests/indexes/ranges/test_range.py | 20 +++++++++++ 5 files changed, 58 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8941af2a5b463..70fe03000290b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -256,8 +256,8 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) -- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) -- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) +- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c72c5fa019bd7..4e5cc986b7325 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4241,7 +4241,6 @@ def join( return self._join_via_get_indexer(other, how, sort) - @final def _join_empty( self, other: Index, how: JoinHow, sort: bool ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 76068c8ee901e..573bd6cf14bcb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1: + if values.dtype.kind == "i" and values.ndim == 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype + if len(values) == 0: + return type(self)._simple_new(_empty_range, name=self.name) + elif len(values) == 1: + start = values[0] + new_range = range(start, start + self.step, self.step) + return type(self)._simple_new(new_range, name=self.name) diff = values[1] - values[0] if not missing.isna(diff) and diff != 0: - maybe_range_indexer, remainder = np.divmod(values - values[0], diff) - if ( - lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) - and not remainder.any() - ): + if len(values) == 2: + # Can skip is_range_indexer check new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=name) + return type(self)._simple_new(new_range, name=self.name) + else: + maybe_range_indexer, remainder = np.divmod(values - values[0], diff) + if ( + lib.is_range_indexer( + maybe_range_indexer, len(maybe_range_indexer) + ) + and not remainder.any() + ): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: @@ -897,12 +910,19 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + if other.dtype.kind == "i": + other = self._shallow_copy(other._values, name=other.name) + return super()._join_empty(other, how=how, sort=sort) + def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # This currently only gets called for the monotonic increasing case if not isinstance(other, type(self)): - maybe_ri = self._shallow_copy(other._values) + maybe_ri = self._shallow_copy(other._values, name=other.name) if not isinstance(maybe_ri, type(self)): return super()._join_monotonic(other, how=how) other = maybe_ri @@ -1260,10 +1280,4 @@ def take( # type: ignore[override] if self.start != 0: taken += self.start - if len(taken) == 1: - start = taken[0] - return self._simple_new( - range(start, start + self.step, self.step), name=self.name - ) - else: - return self._shallow_copy(taken, name=self.name) + return self._shallow_copy(taken, name=self.name) diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index ca3af607c0a38..09db30b1d4c51 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -207,9 +207,15 @@ def test_join_self(self, join_type): [-1, -1, 0, 1], "outer", ], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"], ], ) -@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))]) +@pytest.mark.parametrize( + "right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)] +) def test_join_preserves_rangeindex( left, right, expected, expected_lidx, expected_ridx, how, right_type ): diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 2b81d4794627d..528315078277d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self): tm.assert_index_equal(result, expected) +def test_reindex_1_value_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([2]) + expected = RangeIndex(2, 4, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_empty_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([]) + expected = RangeIndex(0, 0, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + def test_reindex_returns_rangeindex(): ri = RangeIndex(2, name="foo") result, result_indexer = ri.reindex([1, 2, 3]) From 67f19987a86d11d046e4d7f63d58caca3b217a42 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:09:59 -0800 Subject: [PATCH 04/10] Fix self.name --- pandas/core/indexes/range.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 573bd6cf14bcb..4fa075d10c9e0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -476,17 +476,17 @@ def _shallow_copy(self, values, name: Hashable = no_default): # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype if len(values) == 0: - return type(self)._simple_new(_empty_range, name=self.name) + return type(self)._simple_new(_empty_range, name=name) elif len(values) == 1: start = values[0] new_range = range(start, start + self.step, self.step) - return type(self)._simple_new(new_range, name=self.name) + return type(self)._simple_new(new_range, name=name) diff = values[1] - values[0] if not missing.isna(diff) and diff != 0: if len(values) == 2: # Can skip is_range_indexer check new_range = range(values[0], values[-1] + diff, diff) - return type(self)._simple_new(new_range, name=self.name) + return type(self)._simple_new(new_range, name=name) else: maybe_range_indexer, remainder = np.divmod(values - values[0], diff) if ( From 2cc41fc25fc245cf4cb96716685b5e19a770f719 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:55:15 -0800 Subject: [PATCH 05/10] FIx error message --- pandas/tests/indexing/test_loc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 9c33d15c01cd6..61a3a7fbe87f2 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -508,7 +508,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" + msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] From c7da6acc720613e6494868ea0e87a40db4d6e280 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:41:44 -0800 Subject: [PATCH 06/10] Fix hdf test --- pandas/tests/io/pytables/test_append.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 529d6d789596f..72abbcec63357 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -967,6 +967,8 @@ def test_append_to_multiple_min_itemsize(setup_path): } ) expected = df.iloc[[0]] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(list(range(len(expected.index)))) with ensure_clean_store(setup_path) as store: store.append_to_multiple( From 028fbd4d8944d0bd5a724c8a41f5ace9fc500110 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Mar 2024 13:12:19 -0800 Subject: [PATCH 07/10] PERF: RangeIndex.__getitem__ with integers return RangeIndex --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 85570cd4a6744..8408f609bdc99 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -255,7 +255,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) -- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) +- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) - Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) From ccd781d93825816a6c0c940885b43fcccc1c2b47 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Mar 2024 13:12:29 -0800 Subject: [PATCH 08/10] PERF: RangeIndex.__getitem__ with integers return RangeIndex --- pandas/core/indexes/range.py | 14 +++++++------- pandas/tests/indexes/ranges/test_range.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fcb47de2b3c0d..4c6513e7a0fca 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1117,17 +1117,17 @@ def __getitem__(self, key): ) elif com.is_bool_indexer(key): if isinstance(getattr(key, "dtype", None), ExtensionDtype): - np_key = key.to_numpy(dtype=bool, na_value=False) + key = key.to_numpy(dtype=bool, na_value=False) else: - np_key = np.asarray(key, dtype=bool) - check_array_indexer(self._range, np_key) # type: ignore[arg-type] + key = np.asarray(key, dtype=bool) + check_array_indexer(self._range, key) # type: ignore[arg-type] # Short circuit potential _shallow_copy check - if np_key.all(): + if key.all(): return self._simple_new(self._range, name=self.name) - elif not np_key.any(): + elif not key.any(): return self._simple_new(_empty_range, name=self.name) - return self.take(np.flatnonzero(np_key)) - return super().__getitem__(key) + key = np.flatnonzero(key) + return self.take(key) def _getitem_slice(self, slobj: slice) -> Self: """ diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index d99ba1b624775..3040b4c13dc17 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -730,3 +730,25 @@ def test_getitem_boolmask_wrong_length(): ri = RangeIndex(4, name="foo") with pytest.raises(IndexError, match="Boolean index has wrong length"): ri[[True]] + + +def test_getitem_integers_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[0, -1]] + expected = RangeIndex(start=0, stop=16, step=8, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = RangeIndex(0, 10, 2, name="foo")[[3]] + expected = RangeIndex(start=6, stop=8, step=2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_empty_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[]] + expected = RangeIndex(start=0, stop=0, step=1, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_integers_return_index(): + result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] + expected = Index([0, 2, 8], dtype="int64", name="foo") + tm.assert_index_equal(result, expected) From 09b9445ba5f04cc1c18bdaa3481dc071eaf3dd41 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:00:23 -0800 Subject: [PATCH 09/10] Handle ellipse --- pandas/core/indexes/range.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 4c6513e7a0fca..22b3de199083c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1098,6 +1098,8 @@ def __getitem__(self, key): """ Conserve RangeIndex type for scalar and slice keys. """ + if key is Ellipsis: + key = slice(None) if isinstance(key, slice): return self._getitem_slice(key) elif is_integer(key): @@ -1127,7 +1129,11 @@ def __getitem__(self, key): elif not key.any(): return self._simple_new(_empty_range, name=self.name) key = np.flatnonzero(key) - return self.take(key) + try: + return self.take(key) + except TypeError: + # Have Index.__getitem__ raise its exception + return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: """ From bfa25bac586158f32aab60817bff18c8c851a269 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:20:05 -0800 Subject: [PATCH 10/10] Catch ValueError --- pandas/core/indexes/range.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 22b3de199083c..3fc3f7b4d50bb 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1131,8 +1131,7 @@ def __getitem__(self, key): key = np.flatnonzero(key) try: return self.take(key) - except TypeError: - # Have Index.__getitem__ raise its exception + except (TypeError, ValueError): return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: