Skip to content

Commit 282a0fb

Browse files
authored
BUG: MultiIndex.union dropping duplicates from result (#38977)
1 parent 0fd4aa7 commit 282a0fb

File tree

5 files changed

+40
-15
lines changed

5 files changed

+40
-15
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,7 @@ Interval
885885
Indexing
886886
^^^^^^^^
887887

888-
- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`)
888+
- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`)
889889
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
890890
- Bug in :meth:`Series.loc` raising ``ValueError`` when input was filtered with a boolean list and values to set were a list with lower dimension (:issue:`20438`)
891891
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)

pandas/_libs/lib.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def item_from_zerodim(val: object) -> object:
291291

292292
@cython.wraparound(False)
293293
@cython.boundscheck(False)
294-
def fast_unique_multiple(list arrays, sort: bool = True) -> list:
294+
def fast_unique_multiple(list arrays, sort: bool = True):
295295
"""
296296
Generate a list of unique values from a list of arrays.
297297

pandas/core/indexes/multi.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -3557,14 +3557,20 @@ def equal_levels(self, other: MultiIndex) -> bool:
35573557

35583558
def _union(self, other, sort) -> MultiIndex:
    """
    Compute the union of this MultiIndex with ``other``.

    Two strategies are used. When both operands contain missing values
    (a ``-1`` entry in any of their codes) or either operand has
    duplicates, the parent class ``_union`` is used. Otherwise the
    tuples of both operands are combined with
    ``lib.fast_unique_multiple``, which is faster.

    Parameters
    ----------
    other
        The right-hand operand; it is first passed through
        ``self._convert_can_do_setop``.
    sort
        Forwarded unchanged to whichever union implementation is chosen.

    Returns
    -------
    MultiIndex
        Built from the resulting tuples, named with the names returned by
        ``self._convert_can_do_setop``.
    """
    other, result_names = self._convert_can_do_setop(other)

    # The missing-value check has to look at *both* operands: the generic
    # path is only required when each side contains a missing value.
    # NOTE(review): assumes ``other`` exposes ``.codes`` after
    # ``_convert_can_do_setop`` (it is already required to expose
    # ``has_duplicates`` and ``_values`` below) - confirm.
    both_have_missing = any(-1 in code for code in self.codes) and any(
        -1 in code for code in other.codes
    )

    if both_have_missing or self.has_duplicates or other.has_duplicates:
        # This is only necessary if both sides have nans or one has dups,
        # fast_unique_multiple is faster
        result = super()._union(other, sort)
    else:
        rvals = other._values.astype(object, copy=False)
        result = lib.fast_unique_multiple([self._values, rvals], sort=sort)

    return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names)
35683574

35693575
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
35703576
return is_object_dtype(dtype)

pandas/tests/indexes/multi/test_setops.py

+25
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
import pandas as pd
55
from pandas import (
6+
CategoricalIndex,
67
Index,
8+
IntervalIndex,
79
MultiIndex,
810
Series,
911
)
@@ -508,3 +510,26 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture):
508510
result = mi1.intersection(mi2)
509511
expected = MultiIndex.from_arrays([[3.0, nulls_fixture], [1, 2]])
510512
tm.assert_index_equal(result, expected)
513+
514+
515+
def test_union_nan_got_duplicated():
516+
# GH#38977
517+
mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]])
518+
mi2 = MultiIndex.from_arrays([[1.0, np.nan, 3.0], [2, 3, 4]])
519+
result = mi1.union(mi2)
520+
tm.assert_index_equal(result, mi2)
521+
522+
523+
def test_union_duplicates(index):
524+
# GH#38977
525+
if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)):
526+
# No duplicates in empty indexes
527+
return
528+
values = index.unique().values.tolist()
529+
mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
530+
mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
531+
result = mi1.union(mi2)
532+
tm.assert_index_equal(result, mi2.sort_values())
533+
534+
result = mi2.union(mi1)
535+
tm.assert_index_equal(result, mi2.sort_values())

pandas/tests/libs/test_lib.py

-6
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import pytest
33

44
from pandas._libs import (
5-
Timestamp,
65
lib,
76
writers as libwriters,
87
)
@@ -43,11 +42,6 @@ def test_fast_unique_multiple_list_gen_sort(self):
4342
out = lib.fast_unique_multiple_list_gen(gen, sort=False)
4443
tm.assert_numpy_array_equal(np.array(out), expected)
4544

46-
def test_fast_unique_multiple_unsortable_runtimewarning(self):
47-
arr = [np.array(["foo", Timestamp("2000")])]
48-
with tm.assert_produces_warning(RuntimeWarning):
49-
lib.fast_unique_multiple(arr, sort=None)
50-
5145

5246
class TestIndexing:
5347
def test_maybe_indices_to_slice_left_edge(self):

0 commit comments

Comments
 (0)