Skip to content

PERF: concatenation of MultiIndexed objects (MultiIndex.append) #53697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions asv_bench/benchmarks/multiindex_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,4 +396,30 @@ def time_putmask_all_different(self):
self.midx.putmask(self.mask, self.midx_values_different)


class Append:
    """Benchmark ``MultiIndex.append`` across several second-level dtypes.

    ``setup`` builds two two-level indexes, ``left`` and ``right``, whose
    first levels cover disjoint integer ranges and whose second level is
    shared. ``time_append`` then appends ``right`` onto ``left``.
    """

    params = ["datetime64[ns]", "int64", "string"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Create ``self.left`` and ``self.right`` for the given ``dtype``.

        Parameters
        ----------
        dtype : str
            One of the entries in ``params``; selects how the second level
            of both indexes is built.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the supported values.
        """
        N1 = 1000
        N2 = 500
        # The first levels do not overlap: [0, N1) on the left and
        # [N1, 2 * N1) on the right.
        left_level1 = range(N1)
        right_level1 = range(N1, N1 + N1)

        if dtype == "datetime64[ns]":
            level2 = date_range(start="2000-01-01", periods=N2)
        elif dtype == "int64":
            level2 = range(N2)
        elif dtype == "string":
            level2 = tm.makeStringIndex(N2)
        else:
            raise NotImplementedError(f"unsupported dtype: {dtype!r}")

        # Both products reuse the same second level, so each index holds
        # N1 * N2 entries.
        self.left = MultiIndex.from_product([left_level1, level2])
        self.right = MultiIndex.from_product([right_level1, level2])

    def time_append(self, dtype):
        """Append ``self.right`` to ``self.left``; the result is discarded."""
        self.left.append(self.right)


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ Performance improvements
- Performance improvement accessing :attr:`arrays.IntegerArray.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
Expand Down
33 changes: 25 additions & 8 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@
Categorical,
ExtensionArray,
)
from pandas.core.arrays.categorical import factorize_from_iterables
from pandas.core.arrays.categorical import (
factorize_from_iterables,
recode_for_categories,
)
import pandas.core.common as com
from pandas.core.construction import sanitize_array
import pandas.core.indexes.base as ibase
Expand Down Expand Up @@ -2145,14 +2148,28 @@ def append(self, other):
if all(
(isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
):
arrays, names = [], []
codes = []
levels = []
names = []
for i in range(self.nlevels):
label = self._get_level_values(i)
appended = [o._get_level_values(i) for o in other]
arrays.append(label.append(appended))
single_label_name = all(label.name == x.name for x in appended)
names.append(label.name if single_label_name else None)
return MultiIndex.from_arrays(arrays, names=names)
level_values = self.levels[i]
for mi in other:
level_values = level_values.union(mi.levels[i])
level_codes = [
recode_for_categories(
mi.codes[i], mi.levels[i], level_values, copy=False
)
for mi in ([self, *other])
]
level_name = self.names[i]
if any(mi.names[i] != level_name for mi in other):
level_name = None
codes.append(np.concatenate(level_codes))
levels.append(level_values)
names.append(level_name)
return MultiIndex(
codes=codes, levels=levels, names=names, verify_integrity=False
)

to_concat = (self._values,) + tuple(k._values for k in other)
new_tuples = np.concatenate(to_concat)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def test_combine_first_duplicates_rows_for_nan_index_values():
"y": [12.0, 13.0, np.nan, 14.0],
},
index=MultiIndex.from_arrays(
[[1, 2, 3, 4], [np.nan, 5.0, 6.0, 7.0]], names=["a", "b"]
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
),
)
combined = df1.combine_first(df2)
Expand Down
2 changes: 0 additions & 2 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,8 +558,6 @@ def test_union_with_missing_values_on_both_sides(nulls_fixture):
mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]])
result = mi1.union(mi2)
expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]])
# We don't particularly care about having levels[0] be float64, but it is
expected = expected.set_levels([expected.levels[0].astype(np.float64)])
tm.assert_index_equal(result, expected)


Expand Down