From db1f094fafffa6e0af110e5ee9cb727800151394 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 18 Jan 2024 19:57:47 -0500 Subject: [PATCH 1/9] Index.join result name --- doc/source/whatsnew/v2.3.0.rst | 3 +- pandas/core/indexes/base.py | 23 +++++++++------ pandas/core/indexes/datetimelike.py | 10 +++++-- pandas/tests/indexes/numeric/test_join.py | 34 +++++++++++------------ 4 files changed, 41 insertions(+), 29 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 629b044f24f90..d805573b1f0fe 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -113,8 +113,9 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.join` setting result name to ``None`` in a number of cases (:issue:`55815`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - +- Categorical ^^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 92283300c062a..61a612d54c15c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4711,6 +4711,10 @@ def _join_via_get_indexer( except TypeError: pass + names = other.names if how == "right" else self.names + if join_index.names != names: + join_index = join_index.set_names(names) + if join_index is self: lindexer = None else: @@ -5017,7 +5021,7 @@ def _join_monotonic( ridx = self._left_indexer_unique(other) else: join_array, lidx, ridx = self._left_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) elif how == "right": if self.is_unique: # We can perform much better than the general case @@ -5026,13 +5030,13 @@ def _join_monotonic( ridx = None else: join_array, ridx, lidx = other._left_indexer(self) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) elif how == "inner": join_array, lidx, ridx = self._inner_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) elif how == "outer": join_array, lidx, ridx = self._outer_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx) + join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) @@ -5044,21 +5048,22 @@ def _wrap_joined_index( other: Self, lidx: npt.NDArray[np.intp], ridx: npt.NDArray[np.intp], + how: JoinHow, ) -> Self: assert other.dtype == self.dtype - + names = other.names if how == "right" else self.names if isinstance(self, ABCMultiIndex): - name = self.names if self.names == other.names else None # error: Incompatible return value type (got "MultiIndex", # expected "Self") mask = lidx == -1 join_idx = self.take(lidx) right = cast("MultiIndex", other.take(ridx)) join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() - return join_index.set_names(name) # type: ignore[return-value] + return join_index.set_names(names) # type: ignore[return-value] else: - name = get_op_result_name(self, other) - return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + return self._constructor._with_infer( + joined, name=names[0], dtype=self.dtype + ) @final @cache_readonly diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index cad8737a987d4..192a3d60717b9 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -75,6 +75,7 @@ from pandas._typing import ( Axis, + JoinHow, Self, npt, ) @@ -735,10 +736,15 @@ def _get_join_freq(self, other): return freq def _wrap_joined_index( - self, joined, other, lidx: npt.NDArray[np.intp], ridx: npt.NDArray[np.intp] + self, + joined, + other, + lidx: npt.NDArray[np.intp], + ridx: npt.NDArray[np.intp], + how: JoinHow, ): assert other.dtype == self.dtype, (other.dtype, self.dtype) - result = super()._wrap_joined_index(joined, other, lidx, ridx) + result = super()._wrap_joined_index(joined, other, lidx, ridx, how) result._data._freq = self._get_join_freq(other) return result diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 9839f40861d55..4389e3e046069 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -21,9 +21,9 @@ def test_join_non_unique(self): tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="inner", return_indexers=True) @@ -34,7 +34,7 @@ def test_join_inner(self): lidx = lidx.take(ind) ridx = ridx.take(ind) - eres = Index([2, 12], dtype=np.int64) + eres = Index([2, 12], dtype=np.int64, name="lhs") elidx = np.array([1, 6], dtype=np.intp) eridx = np.array([4, 1], dtype=np.intp) @@ -46,7 +46,7 @@ def test_join_inner(self): # monotonic res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True) - res2 = index.intersection(other_mono) + res2 = index.intersection(other_mono).set_names(["lhs"]) tm.assert_index_equal(res, res2) elidx = np.array([1, 6], dtype=np.intp) @@ -57,9 +57,9 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_left(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="left", return_indexers=True) @@ -80,10 +80,10 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # non-unique - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = Index([1, 1, 2, 5], name="rhs") + idx2 = Index([1, 2, 5, 7, 9], name="lhs") res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) - eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eres = Index([1, 1, 2, 5, 7, 9], name="lhs") # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_index_equal(res, eres) @@ -91,9 +91,9 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): - index = Index(range(0, 20, 2), dtype=np.int64) - other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64) - other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64) + index = Index(range(0, 20, 2), dtype=np.int64, name="lhs") + other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64, name="rhs") + other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64, name="rhs") # not monotonic res, lidx, ridx = index.join(other, how="right", return_indexers=True) @@ -115,10 +115,10 @@ def test_join_right(self): assert ridx is None # non-unique - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) + idx = Index([1, 1, 2, 5], name="lhs") + idx2 = Index([1, 2, 5, 7, 9], name="rhs") res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) - eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 + eres = Index([1, 1, 2, 5, 7, 9], name="rhs") # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_index_equal(res, eres) From 7ed52076bc2ec408683bf0e95b70aef674250c29 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 18 Jan 2024 20:02:32 -0500 Subject: [PATCH 2/9] whatsnew --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index d805573b1f0fe..a18953fcb37ed 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -113,7 +113,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Fixed bug in :meth:`DataFrame.join` setting result name to ``None`` in a number of cases (:issue:`55815`) +- Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - From 5bd6856e0c06c0fd4a82ad30c04147c2189ca70e Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 19 Jan 2024 17:56:18 -0500 Subject: [PATCH 3/9] update test --- pandas/tests/indexes/numeric/test_join.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/indexes/numeric/test_join.py b/pandas/tests/indexes/numeric/test_join.py index 4389e3e046069..4737af987a04e 100644 --- a/pandas/tests/indexes/numeric/test_join.py +++ b/pandas/tests/indexes/numeric/test_join.py @@ -28,12 +28,6 @@ def test_join_inner(self): # not monotonic res, lidx, ridx = index.join(other, how="inner", return_indexers=True) - # no guarantee of sortedness, so sort for comparison purposes - ind = res.argsort() - res = res.take(ind) - lidx = lidx.take(ind) - ridx = ridx.take(ind) - eres = Index([2, 12], dtype=np.int64, name="lhs") elidx = np.array([1, 6], dtype=np.intp) eridx = np.array([4, 1], dtype=np.intp) From b654f8f60150595be3d3cf9690f11a28bc2deb46 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 19 Jan 2024 20:02:10 -0500 Subject: [PATCH 4/9] Index._wrap_join_result to maintain cached attributes if possible --- pandas/core/indexes/base.py | 54 ++++++++++++++++--------- pandas/core/indexes/datetimelike.py | 16 ++++---- pandas/tests/indexes/multi/test_join.py | 5 ++- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 61a612d54c15c..58ea1607847e3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5021,7 +5021,9 @@ def _join_monotonic( ridx = self._left_indexer_unique(other) else: join_array, lidx, ridx = self._left_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "right": if self.is_unique: # We can perform much better than the general case @@ -5030,40 +5032,52 @@ def _join_monotonic( ridx = None else: join_array, ridx, lidx = other._left_indexer(self) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "inner": join_array, lidx, ridx = self._inner_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "outer": join_array, lidx, ridx = self._outer_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) return join_index, lidx, ridx - def _wrap_joined_index( + def _wrap_join_result( self, joined: ArrayLike, other: Self, - lidx: npt.NDArray[np.intp], - ridx: npt.NDArray[np.intp], + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, how: JoinHow, - ) -> Self: + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype - names = other.names if how == "right" else self.names - if isinstance(self, ABCMultiIndex): - # error: Incompatible return value type (got "MultiIndex", - # expected "Self") - mask = lidx == -1 - join_idx = self.take(lidx) - right = cast("MultiIndex", other.take(ridx)) - join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() - return join_index.set_names(names) # type: ignore[return-value] + + if lidx is not None and lib.is_range_indexer(lidx, len(self)): + lidx = None + if ridx is not None and lib.is_range_indexer(ridx, len(other)): + ridx = None + + # return self or other if possible to maintain cached attributes + if lidx is None: + join_index = self + elif ridx is None: + join_index = other else: - return self._constructor._with_infer( - joined, name=names[0], dtype=self.dtype - ) + join_index = self._constructor._with_infer(joined, dtype=self.dtype) + + names = other.names if how == "right" else self.names + if join_index.names != names: + join_index = join_index.set_names(names) + + return join_index, lidx, ridx @final @cache_readonly diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 192a3d60717b9..ae13edab3a35a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -735,18 +735,20 @@ def _get_join_freq(self, other): freq = self.freq return freq - def _wrap_joined_index( + def _wrap_join_result( self, joined, other, - lidx: npt.NDArray[np.intp], - ridx: npt.NDArray[np.intp], + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, how: JoinHow, - ): + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype, (other.dtype, self.dtype) - result = super()._wrap_joined_index(joined, other, lidx, ridx, how) - result._data._freq = self._get_join_freq(other) - return result + join_index, lidx, ridx = super()._wrap_join_result( + joined, other, lidx, ridx, how + ) + join_index._data._freq = self._get_join_freq(other) + return join_index, lidx, ridx def _get_engine_target(self) -> np.ndarray: # engine methods and libjoin methods need dt64/td64 values cast to i8 diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 3fb428fecea41..85f15795cdfb5 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -35,7 +35,10 @@ def test_join_level(idx, other, join_type): assert join_index.equals(join_index2) tm.assert_numpy_array_equal(lidx, lidx2) - tm.assert_numpy_array_equal(ridx, ridx2) + if ridx is None: + assert ridx == ridx2 + else: + tm.assert_numpy_array_equal(ridx, ridx2) tm.assert_numpy_array_equal(join_index2.values, exp_values) From 9dc472d155296719cba2ef89ae87f7dea6594294 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 20 Jan 2024 08:48:40 -0500 Subject: [PATCH 5/9] Index._wrap_join_result to maintain cached attributes if possible --- pandas/core/indexes/base.py | 54 ++++++++++++++++--------- pandas/core/indexes/datetimelike.py | 16 ++++---- pandas/tests/indexes/multi/test_join.py | 5 ++- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 61a612d54c15c..58ea1607847e3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5021,7 +5021,9 @@ def _join_monotonic( ridx = self._left_indexer_unique(other) else: join_array, lidx, ridx = self._left_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "right": if self.is_unique: # We can perform much better than the general case @@ -5030,40 +5032,52 @@ def _join_monotonic( ridx = None else: join_array, ridx, lidx = other._left_indexer(self) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "inner": join_array, lidx, ridx = self._inner_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) elif how == "outer": join_array, lidx, ridx = self._outer_indexer(other) - join_index = self._wrap_joined_index(join_array, other, lidx, ridx, how) + join_index, lidx, ridx = self._wrap_join_result( + join_array, other, lidx, ridx, how + ) lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) return join_index, lidx, ridx - def _wrap_joined_index( + def _wrap_join_result( self, joined: ArrayLike, other: Self, - lidx: npt.NDArray[np.intp], - ridx: npt.NDArray[np.intp], + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, how: JoinHow, - ) -> Self: + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype - names = other.names if how == "right" else self.names - if isinstance(self, ABCMultiIndex): - # error: Incompatible return value type (got "MultiIndex", - # expected "Self") - mask = lidx == -1 - join_idx = self.take(lidx) - right = cast("MultiIndex", other.take(ridx)) - join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() - return join_index.set_names(names) # type: ignore[return-value] + + if lidx is not None and lib.is_range_indexer(lidx, len(self)): + lidx = None + if ridx is not None and lib.is_range_indexer(ridx, len(other)): + ridx = None + + # return self or other if possible to maintain cached attributes + if lidx is None: + join_index = self + elif ridx is None: + join_index = other else: - return self._constructor._with_infer( - joined, name=names[0], dtype=self.dtype - ) + join_index = self._constructor._with_infer(joined, dtype=self.dtype) + + names = other.names if how == "right" else self.names + if join_index.names != names: + join_index = join_index.set_names(names) + + return join_index, lidx, ridx @final @cache_readonly diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 192a3d60717b9..ae13edab3a35a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -735,18 +735,20 @@ def _get_join_freq(self, other): freq = self.freq return freq - def _wrap_joined_index( + def _wrap_join_result( self, joined, other, - lidx: npt.NDArray[np.intp], - ridx: npt.NDArray[np.intp], + lidx: npt.NDArray[np.intp] | None, + ridx: npt.NDArray[np.intp] | None, how: JoinHow, - ): + ) -> tuple[Self, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: assert other.dtype == self.dtype, (other.dtype, self.dtype) - result = super()._wrap_joined_index(joined, other, lidx, ridx, how) - result._data._freq = self._get_join_freq(other) - return result + join_index, lidx, ridx = super()._wrap_join_result( + joined, other, lidx, ridx, how + ) + join_index._data._freq = self._get_join_freq(other) + return join_index, lidx, ridx def _get_engine_target(self) -> np.ndarray: # engine methods and libjoin methods need dt64/td64 values cast to i8 diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 3fb428fecea41..85f15795cdfb5 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -35,7 +35,10 @@ def test_join_level(idx, other, join_type): assert join_index.equals(join_index2) tm.assert_numpy_array_equal(lidx, lidx2) - tm.assert_numpy_array_equal(ridx, ridx2) + if ridx is None: + assert ridx == ridx2 + else: + tm.assert_numpy_array_equal(ridx, ridx2) tm.assert_numpy_array_equal(join_index2.values, exp_values) From 02f773ec0d566b8425ce978eb1ca06746e0233a6 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 22 Jan 2024 20:45:14 -0500 Subject: [PATCH 6/9] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fdd757355d757..75761e3326e90 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -107,7 +107,7 @@ Performance improvements - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) -- +- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: From 1dd35e414299e2b5357d6551dfead6989473fd03 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 22 Jan 2024 21:14:30 -0500 Subject: [PATCH 7/9] allow indexers to be None --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e093d551f3ead..f4848f83a0d92 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8016,8 +8016,8 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: right.columns, how="inner", level=None, return_indexers=True ) - new_left = left.iloc[:, lcols] - new_right = right.iloc[:, rcols] + new_left = left if lcols is None else left.iloc[:, lcols] + new_right = right if rcols is None else right.iloc[:, rcols] result = op(new_left, new_right) # Do the join on the columns instead of using left._align_for_op From 43fe32238bf5f365ceffdccb87f41e9a47ab4760 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 22 Jan 2024 21:36:43 -0500 Subject: [PATCH 8/9] gh ref --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75761e3326e90..7393ce7a9971d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -105,9 +105,9 @@ Performance improvements - Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`) - Performance improvement in :meth:`DataFrame.join` with ``how="left"`` or ``how="right"`` and ``sort=True`` (:issue:`56919`) - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`) +- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) -- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_300.bug_fixes: From a43e4d454dc19c51ea45addc3d7a5a61055b30b7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 23 Jan 2024 19:47:26 -0500 Subject: [PATCH 9/9] rename variables for clarity --- pandas/core/frame.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4848f83a0d92..97f4eaa7c208a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8012,19 +8012,17 @@ def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: left = self # GH#31623, only operate on shared columns - cols, lcols, rcols = left.columns.join( - right.columns, how="inner", level=None, return_indexers=True + cols, lcol_indexer, rcol_indexer = left.columns.join( + right.columns, how="inner", return_indexers=True ) - new_left = left if lcols is None else left.iloc[:, lcols] - new_right = right if rcols is None else right.iloc[:, rcols] + new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer] + new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer] result = op(new_left, new_right) # Do the join on the columns instead of using left._align_for_op # to avoid constructing two potentially large/sparse DataFrames - join_columns, _, _ = left.columns.join( - right.columns, how="outer", level=None, return_indexers=True - ) + join_columns = left.columns.join(right.columns, how="outer") if result.columns.has_duplicates: # Avoid reindexing with a duplicate axis.