Skip to content

Commit 98bd8a8

Browse files
lukemanley authored and im-vinicius committed
PERF: concatenation of MultiIndexed objects (MultiIndex.append) (pandas-dev#53697)
* improve perf of MultiIndex.append
* fix test
* style
1 parent 8eca98b commit 98bd8a8

File tree

5 files changed

+53
-11
lines changed

5 files changed

+53
-11
lines changed

asv_bench/benchmarks/multiindex_object.py

+26
Original file line numberDiff line numberDiff line change
@@ -396,4 +396,30 @@ def time_putmask_all_different(self):
396396
self.midx.putmask(self.mask, self.midx_values_different)
397397

398398

399+
class Append:
400+
params = ["datetime64[ns]", "int64", "string"]
401+
param_names = ["dtype"]
402+
403+
def setup(self, dtype):
404+
N1 = 1000
405+
N2 = 500
406+
left_level1 = range(N1)
407+
right_level1 = range(N1, N1 + N1)
408+
409+
if dtype == "datetime64[ns]":
410+
level2 = date_range(start="2000-01-01", periods=N2)
411+
elif dtype == "int64":
412+
level2 = range(N2)
413+
elif dtype == "string":
414+
level2 = tm.makeStringIndex(N2)
415+
else:
416+
raise NotImplementedError
417+
418+
self.left = MultiIndex.from_product([left_level1, level2])
419+
self.right = MultiIndex.from_product([right_level1, level2])
420+
421+
def time_append(self, dtype):
422+
self.left.append(self.right)
423+
424+
399425
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,7 @@ Performance improvements
325325
- Performance improvement accessing :attr:`arrays.IntegerArray.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
326326
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
327327
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
328+
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
328329
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
329330
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
330331
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)

pandas/core/indexes/multi.py

+25-8
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,10 @@
8888
Categorical,
8989
ExtensionArray,
9090
)
91-
from pandas.core.arrays.categorical import factorize_from_iterables
91+
from pandas.core.arrays.categorical import (
92+
factorize_from_iterables,
93+
recode_for_categories,
94+
)
9295
import pandas.core.common as com
9396
from pandas.core.construction import sanitize_array
9497
import pandas.core.indexes.base as ibase
@@ -2145,14 +2148,28 @@ def append(self, other):
21452148
if all(
21462149
(isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
21472150
):
2148-
arrays, names = [], []
2151+
codes = []
2152+
levels = []
2153+
names = []
21492154
for i in range(self.nlevels):
2150-
label = self._get_level_values(i)
2151-
appended = [o._get_level_values(i) for o in other]
2152-
arrays.append(label.append(appended))
2153-
single_label_name = all(label.name == x.name for x in appended)
2154-
names.append(label.name if single_label_name else None)
2155-
return MultiIndex.from_arrays(arrays, names=names)
2155+
level_values = self.levels[i]
2156+
for mi in other:
2157+
level_values = level_values.union(mi.levels[i])
2158+
level_codes = [
2159+
recode_for_categories(
2160+
mi.codes[i], mi.levels[i], level_values, copy=False
2161+
)
2162+
for mi in ([self, *other])
2163+
]
2164+
level_name = self.names[i]
2165+
if any(mi.names[i] != level_name for mi in other):
2166+
level_name = None
2167+
codes.append(np.concatenate(level_codes))
2168+
levels.append(level_values)
2169+
names.append(level_name)
2170+
return MultiIndex(
2171+
codes=codes, levels=levels, names=names, verify_integrity=False
2172+
)
21562173

21572174
to_concat = (self._values,) + tuple(k._values for k in other)
21582175
new_tuples = np.concatenate(to_concat)

pandas/tests/frame/methods/test_combine_first.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ def test_combine_first_duplicates_rows_for_nan_index_values():
510510
"y": [12.0, 13.0, np.nan, 14.0],
511511
},
512512
index=MultiIndex.from_arrays(
513-
[[1, 2, 3, 4], [np.nan, 5.0, 6.0, 7.0]], names=["a", "b"]
513+
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
514514
),
515515
)
516516
combined = df1.combine_first(df2)

pandas/tests/indexes/multi/test_setops.py

-2
Original file line numberDiff line numberDiff line change
@@ -558,8 +558,6 @@ def test_union_with_missing_values_on_both_sides(nulls_fixture):
558558
mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]])
559559
result = mi1.union(mi2)
560560
expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]])
561-
# We don't particularly care about having levels[0] be float64, but it is
562-
expected = expected.set_levels([expected.levels[0].astype(np.float64)])
563561
tm.assert_index_equal(result, expected)
564562

565563

0 commit comments

Comments
 (0)