Skip to content

Commit e3d0126

Browse files
authored
PERF: MultiIndex.union (#48752)
1 parent 4b124de commit e3d0126

File tree

4 files changed

+5
-47
lines changed

4 files changed

+5
-47
lines changed

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ Performance improvements
140140
- Performance improvement in :meth:`.DataFrameGroupBy.median` and :meth:`.SeriesGroupBy.median` and :meth:`.GroupBy.cumprod` for nullable dtypes (:issue:`37493`)
141141
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
142142
- Performance improvement in :meth:`MultiIndex.size` (:issue:`48723`)
143-
- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
143+
- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`, :issue:`48752`)
144144
- Performance improvement in :meth:`MultiIndex.difference` (:issue:`48606`)
145145
- Performance improvement in :class:`MultiIndex` set operations with sort=None (:issue:`49010`)
146146
- Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)

pandas/_libs/lib.pyi

-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@ def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
5959
def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
6060
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
6161
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
62-
def fast_unique_multiple(left: np.ndarray, right: np.ndarray) -> list: ...
6362
def map_infer(
6463
arr: np.ndarray,
6564
f: Callable[[Any], Any],

pandas/_libs/lib.pyx

-40
Original file line numberDiff line numberDiff line change
@@ -310,46 +310,6 @@ def item_from_zerodim(val: object) -> object:
310310
return val
311311

312312

313-
@cython.wraparound(False)
314-
@cython.boundscheck(False)
315-
def fast_unique_multiple(ndarray left, ndarray right) -> list:
316-
"""
317-
Generate a list of indices we have to add to the left to get the union
318-
of both arrays.
319-
320-
Parameters
321-
----------
322-
left : np.ndarray
323-
Left array that is used as base.
324-
right : np.ndarray
325-
right array that is checked for values that are not in left.
326-
right cannot have duplicates.
327-
328-
Returns
329-
-------
330-
list of indices that we have to add to the left array.
331-
"""
332-
cdef:
333-
Py_ssize_t j, n
334-
list indices = []
335-
set table = set()
336-
object val, stub = 0
337-
338-
n = len(left)
339-
for j in range(n):
340-
val = left[j]
341-
if val not in table:
342-
table.add(val)
343-
344-
n = len(right)
345-
for j in range(n):
346-
val = right[j]
347-
if val not in table:
348-
indices.append(j)
349-
350-
return indices
351-
352-
353313
@cython.wraparound(False)
354314
@cython.boundscheck(False)
355315
def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:

pandas/core/indexes/multi.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -3688,7 +3688,7 @@ def _union(self, other, sort) -> MultiIndex:
36883688
or other.has_duplicates
36893689
):
36903690
# This is only necessary if both sides have nans or other has dups,
3691-
# fast_unique_multiple is faster
3691+
# otherwise difference is faster
36923692
result = super()._union(other, sort)
36933693

36943694
if isinstance(result, MultiIndex):
@@ -3698,10 +3698,9 @@ def _union(self, other, sort) -> MultiIndex:
36983698
)
36993699

37003700
else:
3701-
rvals = other._values.astype(object, copy=False)
3702-
right_missing = lib.fast_unique_multiple(self._values, rvals)
3703-
if right_missing:
3704-
result = self.append(other.take(right_missing))
3701+
right_missing = other.difference(self, sort=False)
3702+
if len(right_missing):
3703+
result = self.append(right_missing)
37053704
else:
37063705
result = self._get_reconciled_name_object(other)
37073706

0 commit comments

Comments
 (0)