From 532c43d71ff8e2bb0c76e0333aadae04f4981dbe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:07:08 -0800 Subject: [PATCH 1/3] PERF: Return RangeIndex from RangeIndex.join when possible --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 1 - pandas/core/indexes/range.py | 36 ++++++++++++++++ pandas/tests/indexes/ranges/test_join.py | 52 ++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8bb051b6228ce..d3fb00d700bce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -252,6 +252,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c17e01b85fa84..0701bed7cd9a4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4588,7 +4588,6 @@ def _get_leaf_sorter( ) return join_index, left_indexer, right_indexer - @final def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8a2e3fbf500a4..7735701741a5f 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -53,6 +53,7 @@ from pandas._typing import ( Axis, Dtype, + JoinHow, NaPosition, Self, npt, @@ -888,6 +889,41 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_monotonic( + self, other: Index, how: JoinHow = "left" + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # This currently only gets called for the monotonic increasing case + if not isinstance(other, type(self)): + maybe_ri = self._shallow_copy(other._values) + if not isinstance(maybe_ri, type(self)): + return super()._join_monotonic(other, how=how) + other = maybe_ri + + if self.equals(other): + ret_index = other if how == "right" else self + return ret_index, None, None + + if how == "left": + join_index = self + lidx = None + ridx = join_index.get_indexer(other) + elif how == "right": + join_index = other + lidx = join_index.get_indexer(self) + ridx = None + elif how == "inner": + join_index = self.intersection(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + elif how == "outer": + join_index = self.union(other) + lidx = self.get_indexer(join_index) + ridx = other.get_indexer(join_index) + + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + # -------------------------------------------------------------------- # error: Return type "Index" of "delete" incompatible with return type diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index 682b5c8def9ff..a346faff8bdce 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Index, @@ -175,3 +176,54 @@ def test_join_self(self, join_type): index = RangeIndex(start=0, stop=20, step=2) joined = index.join(index, how=join_type) assert index is joined + + +@pytest.mark.parametrize( + "left, right, expected, expected_lidx, expected_ridx, how", + [ + [RangeIndex(2), RangeIndex(3), RangeIndex(2), None, [0, 1, -1], "left"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "left"], + [RangeIndex(2), RangeIndex(20, 22), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1], None, "right"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "right"], + [ + RangeIndex(2), + RangeIndex(20, 22), + RangeIndex(20, 22), + [-1, -1], + None, + "right", + ], + [RangeIndex(2), RangeIndex(3), RangeIndex(2), [0, 1], [0, 1], "inner"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "inner"], + [RangeIndex(2), RangeIndex(1, 3), RangeIndex(1, 2), [1], [0], "inner"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], [0, 1, 2], "outer"], + [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "outer"], + [ + RangeIndex(2), + RangeIndex(2, 4), + RangeIndex(4), + [0, 1, -1, -1], + [-1, -1, 0, 1], + "outer", + ], + ], +) +@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))]) +def test_join_preserves_rangeindex( + left, right, expected, expected_lidx, expected_ridx, how, right_type +): + result, lidx, ridx = left.join(right_type(right), how=how, return_indexers=True) + tm.assert_index_equal(result, expected, exact=True) + + if expected_lidx is None: + assert lidx is expected_lidx + else: + exp_lidx = np.array(expected_lidx, dtype=np.intp) + tm.assert_numpy_array_equal(lidx, exp_lidx) + + if expected_ridx is None: + assert ridx is expected_ridx + else: + exp_ridx = np.array(expected_ridx, dtype=np.intp) + tm.assert_numpy_array_equal(ridx, exp_ridx) From 762b71e29e9718fc94b801b25afde3b1d2150597 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:09:00 -0800 Subject: [PATCH 2/3] whatsnew number --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d3fb00d700bce..c1edee4354058 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -252,7 +252,7 @@ Performance improvements - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) -- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`?`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``) From c544036eed1d6d7ca7a721a5ec0a9bf3befeb83d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:13:33 -0800 Subject: [PATCH 3/3] Fix indexer --- pandas/core/indexes/range.py | 4 ++-- pandas/tests/indexes/ranges/test_join.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 7735701741a5f..ecba0a0f3d3be 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -906,10 +906,10 @@ def _join_monotonic( if how == "left": join_index = self lidx = None - ridx = join_index.get_indexer(other) + ridx = other.get_indexer(join_index) elif how == "right": join_index = other - lidx = join_index.get_indexer(self) + lidx = self.get_indexer(join_index) ridx = None elif how == "inner": join_index = self.intersection(other) diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index a346faff8bdce..ca3af607c0a38 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -181,10 +181,10 @@ def test_join_self(self, join_type): @pytest.mark.parametrize( "left, right, expected, expected_lidx, expected_ridx, how", [ - [RangeIndex(2), RangeIndex(3), RangeIndex(2), None, [0, 1, -1], "left"], + [RangeIndex(2), RangeIndex(3), RangeIndex(2), None, [0, 1], "left"], [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "left"], [RangeIndex(2), RangeIndex(20, 22), RangeIndex(2), None, [-1, -1], "left"], - [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1], None, "right"], + [RangeIndex(2), RangeIndex(3), RangeIndex(3), [0, 1, -1], None, "right"], [RangeIndex(2), RangeIndex(2), RangeIndex(2), None, None, "right"], [ RangeIndex(2),