Skip to content

Commit a2f890d

Browse files
lukemanley authored and noatamir committed
PERF: join/merge on subset of MultiIndex (pandas-dev#48611)
1 parent 4f0ebef commit a2f890d

File tree

3 files changed

+18
-11
lines changed

3 files changed

+18
-11
lines changed

asv_bench/benchmarks/join_merge.py

+12
Original file line number | Diff line number | Diff line change
@@ -159,6 +159,18 @@ def time_left_outer_join_index(self):
159159
self.left.join(self.right, on="jim")
160160

161161

162+
class JoinMultiindexSubset:
163+
def setup(self):
164+
N = 100_000
165+
mi1 = MultiIndex.from_arrays([np.arange(N)] * 4, names=["a", "b", "c", "d"])
166+
mi2 = MultiIndex.from_arrays([np.arange(N)] * 2, names=["a", "b"])
167+
self.left = DataFrame({"col1": 1}, index=mi1)
168+
self.right = DataFrame({"col2": 2}, index=mi2)
169+
170+
def time_join_multiindex_subset(self):
171+
self.left.join(self.right)
172+
173+
162174
class JoinEmpty:
163175
def setup(self):
164176
N = 100_000

doc/source/whatsnew/v1.6.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -141,6 +141,7 @@ Performance improvements
141141
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
142142
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
143143
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
144+
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
144145
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
145146
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
146147
- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)

pandas/core/reshape/merge.py

+5 -11
Original file line number | Diff line number | Diff line change
@@ -1669,7 +1669,7 @@ def restore_dropped_levels_multijoin(
16691669
Returns the levels, labels and names of a multi-index to multi-index join.
16701670
Depending on the type of join, this method restores the appropriate
16711671
dropped levels of the joined multi-index.
1672-
The method relies on lidx, rindexer which hold the index positions of
1672+
The method relies on lindexer, rindexer which hold the index positions of
16731673
left and right, where a join was feasible
16741674
16751675
Parameters
@@ -1715,15 +1715,6 @@ def _convert_to_multiindex(index: Index) -> MultiIndex:
17151715
join_codes = join_index.codes
17161716
join_names = join_index.names
17171717

1718-
# lindexer and rindexer hold the indexes where the join occurred
1719-
# for left and right respectively. If left/right is None then
1720-
# the join occurred on all indices of left/right
1721-
if lindexer is None:
1722-
lindexer = range(left.size)
1723-
1724-
if rindexer is None:
1725-
rindexer = range(right.size)
1726-
17271718
# Iterate through the levels that must be restored
17281719
for dropped_level_name in dropped_level_names:
17291720
if dropped_level_name in left.names:
@@ -1740,7 +1731,10 @@ def _convert_to_multiindex(index: Index) -> MultiIndex:
17401731
# Inject -1 in the codes list where a join was not possible
17411732
# IOW indexer[i]=-1
17421733
codes = idx.codes[name_idx]
1743-
restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
1734+
if indexer is None:
1735+
restore_codes = codes
1736+
else:
1737+
restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
17441738

17451739
join_levels = join_levels + [restore_levels]
17461740
join_codes = join_codes + [restore_codes]

0 commit comments

Comments
 (0)