From 1adef2cdc8324327087369db58553226e8b3a321 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Jun 2023 20:35:09 -0400 Subject: [PATCH 1/3] improve perf of MultiIndex.append --- asv_bench/benchmarks/multiindex_object.py | 26 ++++++++++++++++ doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/indexes/multi.py | 36 ++++++++++++++++++----- pandas/tests/indexes/multi/test_setops.py | 2 -- 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 9c997b5386eaa..87dcdb16fa647 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -396,4 +396,30 @@ def time_putmask_all_different(self): self.midx.putmask(self.mask, self.midx_values_different) +class Append: + params = ["datetime64[ns]", "int64", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + N1 = 1000 + N2 = 500 + left_level1 = range(N1) + right_level1 = range(N1, N1 + N1) + + if dtype == "datetime64[ns]": + level2 = date_range(start="2000-01-01", periods=N2) + elif dtype == "int64": + level2 = range(N2) + elif dtype == "string": + level2 = tm.makeStringIndex(N2) + else: + raise NotImplementedError + + self.left = MultiIndex.from_product([left_level1, level2]) + self.right = MultiIndex.from_product([right_level1, level2]) + + def time_append(self, dtype): + self.left.append(self.right) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e2f0904a78cf9..99ea21cf84fcf 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -319,6 +319,7 @@ Performance improvements - Performance improvement accessing :attr:`arrays.IntegerArrays.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`) - Performance improvement in :class:`Series` reductions (:issue:`52341`) - Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`) +- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`) - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`) - Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`) - Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a1c240f72a28b..995a8f3fcb323 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -88,7 +88,10 @@ Categorical, ExtensionArray, ) -from pandas.core.arrays.categorical import factorize_from_iterables +from pandas.core.arrays.categorical import ( + factorize_from_iterables, + recode_for_categories, +) import pandas.core.common as com from pandas.core.construction import sanitize_array import pandas.core.indexes.base as ibase @@ -2145,14 +2148,31 @@ def append(self, other): if all( (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other ): - arrays, names = [], [] + codes = [] + levels = [] + names = [] for i in range(self.nlevels): - label = self._get_level_values(i) - appended = [o._get_level_values(i) for o in other] - arrays.append(label.append(appended)) - single_label_name = all(label.name == x.name for x in appended) - names.append(label.name if single_label_name else None) - return MultiIndex.from_arrays(arrays, names=names) + level_values = self.levels[i] + for mi in other: + level_values = level_values.union(mi.levels[i]) + level_codes = [ + recode_for_categories( + mi.codes[i], + mi.levels[i], + level_values, + copy=False, + ) + for mi in ([self, *other]) + ] + level_name = self.names[i] + if any(mi.names[i] != level_name for mi in other): + level_name = None + codes.append(np.concatenate(level_codes)) + levels.append(level_values) + names.append(level_name) + return MultiIndex( + codes=codes, levels=levels, names=names, verify_integrity=False + ) to_concat = (self._values,) + tuple(k._values for k in other) new_tuples = np.concatenate(to_concat) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 147247d23c2c6..fd0928b82ecbf 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -558,8 +558,6 @@ def test_union_with_missing_values_on_both_sides(nulls_fixture): mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]]) result = mi1.union(mi2) expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]]) - # We don't particularly care about having levels[0] be float64, but it is - expected = expected.set_levels([expected.levels[0].astype(np.float64)]) tm.assert_index_equal(result, expected) From b109df494638e7ed4346f5bb73f77bd9ebd6fc1d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 15 Jun 2023 22:00:30 -0400 Subject: [PATCH 2/3] fix test --- pandas/tests/frame/methods/test_combine_first.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 7983aace587c6..9b9c3f88277e8 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -510,7 +510,7 @@ def test_combine_first_duplicates_rows_for_nan_index_values(): "y": [12.0, 13.0, np.nan, 14.0], }, index=MultiIndex.from_arrays( - [[1, 2, 3, 4], [np.nan, 5.0, 6.0, 7.0]], names=["a", "b"] + [[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"] ), ) combined = df1.combine_first(df2) From af98e344f1f09a7d1f58048865a1f8923facc227 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 19 Jun 2023 21:01:18 -0400 Subject: [PATCH 3/3] style --- pandas/core/indexes/multi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 995a8f3fcb323..5181623c0c327 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2157,10 +2157,7 @@ def append(self, other): level_values = level_values.union(mi.levels[i]) level_codes = [ recode_for_categories( - mi.codes[i], - mi.levels[i], - level_values, - copy=False, + mi.codes[i], mi.levels[i], level_values, copy=False ) for mi in ([self, *other]) ]