Skip to content

Commit 622f31c

Browse files
authored
PERF: Index.join to maintain cached attributes in more cases (#57023)
* Index.join result name
* whatsnew
* update test
* Index._wrap_join_result to maintain cached attributes if possible
* Index._wrap_join_result to maintain cached attributes if possible
* whatsnew
* allow indexers to be None
* gh ref
* rename variables for clarity
1 parent 3c96b8f commit 622f31c

File tree

5 files changed

+53
-35
lines changed

5 files changed

+53
-35
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ Performance improvements
105105
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
106106
- Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`)
107107
- Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
108+
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
108109
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
109110
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
110111
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

pandas/core/frame.py

+5-7
Original file line number | Diff line number | Diff line change
@@ -8012,19 +8012,17 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
80128012
left = self
80138013

80148014
# GH#31623, only operate on shared columns
8015-
cols, lcols, rcols = left.columns.join(
8016-
right.columns, how="inner", level=None, return_indexers=True
8015+
cols, lcol_indexer, rcol_indexer = left.columns.join(
8016+
right.columns, how="inner", return_indexers=True
80178017
)
80188018

8019-
new_left = left.iloc[:, lcols]
8020-
new_right = right.iloc[:, rcols]
8019+
new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
8020+
new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
80218021
result = op(new_left, new_right)
80228022

80238023
# Do the join on the columns instead of using left._align_for_op
80248024
# to avoid constructing two potentially large/sparse DataFrames
8025-
join_columns, _, _ = left.columns.join(
8026-
right.columns, how="outer", level=None, return_indexers=True
8027-
)
8025+
join_columns = left.columns.join(right.columns, how="outer")
80288026

80298027
if result.columns.has_duplicates:
80308028
# Avoid reindexing with a duplicate axis.

pandas/core/indexes/base.py

+34-20
Original file line number | Diff line number | Diff line change
@@ -5023,7 +5023,9 @@ def _join_monotonic(
50235023
ridx = self._left_indexer_unique(other)
50245024
else:
50255025
join_array, lidx, ridx = self._left_indexer(other)
5026-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how)
5026+
join_index, lidx, ridx = self._wrap_join_result(
5027+
join_array, other, lidx, ridx, how
5028+
)
50275029
elif how == "right":
50285030
if self.is_unique:
50295031
# We can perform much better than the general case
@@ -5032,40 +5034,52 @@ def _join_monotonic(
50325034
ridx = None
50335035
else:
50345036
join_array, ridx, lidx = other._left_indexer(self)
5035-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how)
5037+
join_index, lidx, ridx = self._wrap_join_result(
5038+
join_array, other, lidx, ridx, how
5039+
)
50365040
elif how == "inner":
50375041
join_array, lidx, ridx = self._inner_indexer(other)
5038-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how)
5042+
join_index, lidx, ridx = self._wrap_join_result(
5043+
join_array, other, lidx, ridx, how
5044+
)
50395045
elif how == "outer":
50405046
join_array, lidx, ridx = self._outer_indexer(other)
5041-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how)
5047+
join_index, lidx, ridx = self._wrap_join_result(
5048+
join_array, other, lidx, ridx, how
5049+
)
50425050

50435051
lidx = None if lidx is None else ensure_platform_int(lidx)
50445052
ridx = None if ridx is None else ensure_platform_int(ridx)
50455053
return join_index, lidx, ridx
50465054

5047-
def _wrap_joined_index(
5055+
def _wrap_join_result(
50485056
self,
50495057
joined: ArrayLike,
50505058
other: Self,
5051-
lidx: npt.NDArray[np.intp],
5052-
ridx: npt.NDArray[np.intp],
5059+
lidx: npt.NDArray[np.intp] | None,
5060+
ridx: npt.NDArray[np.intp] | None,
50535061
how: JoinHow,
5054-
) -> Self:
5062+
) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
50555063
assert other.dtype == self.dtype
5056-
names = other.names if how == "right" else self.names
5057-
if isinstance(self, ABCMultiIndex):
5058-
# error: Incompatible return value type (got "MultiIndex",
5059-
# expected "Self")
5060-
mask = lidx == -1
5061-
join_idx = self.take(lidx)
5062-
right = cast("MultiIndex", other.take(ridx))
5063-
join_index = join_idx.putmask(mask, right)._sort_levels_monotonic()
5064-
return join_index.set_names(names) # type: ignore[return-value]
5064+
5065+
if lidx is not None and lib.is_range_indexer(lidx, len(self)):
5066+
lidx = None
5067+
if ridx is not None and lib.is_range_indexer(ridx, len(other)):
5068+
ridx = None
5069+
5070+
# return self or other if possible to maintain cached attributes
5071+
if lidx is None:
5072+
join_index = self
5073+
elif ridx is None:
5074+
join_index = other
50655075
else:
5066-
return self._constructor._with_infer(
5067-
joined, name=names[0], dtype=self.dtype
5068-
)
5076+
join_index = self._constructor._with_infer(joined, dtype=self.dtype)
5077+
5078+
names = other.names if how == "right" else self.names
5079+
if join_index.names != names:
5080+
join_index = join_index.set_names(names)
5081+
5082+
return join_index, lidx, ridx
50695083

50705084
@final
50715085
@cache_readonly

pandas/core/indexes/datetimelike.py

+9-7
Original file line number | Diff line number | Diff line change
@@ -735,18 +735,20 @@ def _get_join_freq(self, other):
735735
freq = self.freq
736736
return freq
737737

738-
def _wrap_joined_index(
738+
def _wrap_join_result(
739739
self,
740740
joined,
741741
other,
742-
lidx: npt.NDArray[np.intp],
743-
ridx: npt.NDArray[np.intp],
742+
lidx: npt.NDArray[np.intp] | None,
743+
ridx: npt.NDArray[np.intp] | None,
744744
how: JoinHow,
745-
):
745+
) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
746746
assert other.dtype == self.dtype, (other.dtype, self.dtype)
747-
result = super()._wrap_joined_index(joined, other, lidx, ridx, how)
748-
result._data._freq = self._get_join_freq(other)
749-
return result
747+
join_index, lidx, ridx = super()._wrap_join_result(
748+
joined, other, lidx, ridx, how
749+
)
750+
join_index._data._freq = self._get_join_freq(other)
751+
return join_index, lidx, ridx
750752

751753
def _get_engine_target(self) -> np.ndarray:
752754
# engine methods and libjoin methods need dt64/td64 values cast to i8

pandas/tests/indexes/multi/test_join.py

+4-1
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,10 @@ def test_join_level(idx, other, join_type):
3535

3636
assert join_index.equals(join_index2)
3737
tm.assert_numpy_array_equal(lidx, lidx2)
38-
tm.assert_numpy_array_equal(ridx, ridx2)
38+
if ridx is None:
39+
assert ridx == ridx2
40+
else:
41+
tm.assert_numpy_array_equal(ridx, ridx2)
3942
tm.assert_numpy_array_equal(join_index2.values, exp_values)
4043

4144

0 commit comments

Comments
 (0)