Skip to content

Commit 81b5f1d

Browse files
authored
PERF: merge on sorted MultiIndex (#48504)
* merge on sorted multiindex performance
* whatsnew
* faster asv
* additional asv cases
* avoid going through multi._values
1 parent 3038c6e commit 81b5f1d

File tree

3 files changed

+41
-4
lines changed

3 files changed

+41
-4
lines changed

asv_bench/benchmarks/join_merge.py

+37
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
DataFrame,
77
MultiIndex,
88
Series,
9+
array,
910
concat,
1011
date_range,
1112
merge,
@@ -411,6 +412,42 @@ def time_multiby(self, direction, tolerance):
411412
)
412413

413414

415+
class MergeMultiIndex:
416+
params = [
417+
[
418+
("int64", "int64"),
419+
("datetime64[ns]", "int64"),
420+
("Int64", "Int64"),
421+
],
422+
["left", "right", "inner", "outer"],
423+
]
424+
param_names = ["dtypes", "how"]
425+
426+
def setup(self, dtypes, how):
427+
n = 100_000
428+
offset = 50_000
429+
mi1 = MultiIndex.from_arrays(
430+
[
431+
array(np.arange(n), dtype=dtypes[0]),
432+
array(np.arange(n), dtype=dtypes[1]),
433+
]
434+
)
435+
mi2 = MultiIndex.from_arrays(
436+
[
437+
array(np.arange(offset, n + offset), dtype=dtypes[0]),
438+
array(np.arange(offset, n + offset), dtype=dtypes[1]),
439+
]
440+
)
441+
self.df1 = DataFrame({"col1": 1}, index=mi1)
442+
self.df2 = DataFrame({"col2": 2}, index=mi2)
443+
444+
def time_merge_sorted_multiindex(self, dtypes, how):
445+
# copy to avoid MultiIndex._values caching
446+
df1 = self.df1.copy()
447+
df2 = self.df2.copy()
448+
merge(df1, df2, how=how, left_index=True, right_index=True)
449+
450+
414451
class Align:
415452
def setup(self):
416453
size = 5 * 10**5

doc/source/whatsnew/v1.6.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -111,6 +111,7 @@ Performance improvements
111111
- Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`)
112112
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
113113
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
114+
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
114115
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
115116
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
116117
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)

pandas/core/indexes/base.py

+3 -4
Original file line number | Diff line number | Diff line change
@@ -3557,6 +3557,7 @@ def _intersection(self, other: Index, sort: bool = False):
35573557
self.is_monotonic_increasing
35583558
and other.is_monotonic_increasing
35593559
and self._can_use_libjoin
3560+
and not isinstance(self, ABCMultiIndex)
35603561
):
35613562
try:
35623563
res_indexer, indexer, _ = self._inner_indexer(other)
@@ -4741,13 +4742,11 @@ def join(
47414742
else:
47424743
return self._join_non_unique(other, how=how)
47434744
elif (
4745+
# GH48504: exclude MultiIndex to avoid going through MultiIndex._values
47444746
self.is_monotonic_increasing
47454747
and other.is_monotonic_increasing
47464748
and self._can_use_libjoin
4747-
and (
4748-
not isinstance(self, ABCMultiIndex)
4749-
or not any(is_categorical_dtype(dtype) for dtype in self.dtypes)
4750-
)
4749+
and not isinstance(self, ABCMultiIndex)
47514750
and not is_categorical_dtype(self.dtype)
47524751
):
47534752
# Categorical is monotonic if data are ordered as categories, but join can

0 commit comments

Comments
 (0)