diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a72f78f3ca30d..badbc88302d6b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -843,7 +843,7 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) +- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cbef4ed44dc06..d7e15bb2ad197 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -291,7 +291,7 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True) -> list: +def fast_unique_multiple(list arrays, sort: bool = True): """ Generate a list of unique values from a list of arrays. 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1a3719233a1da..eb72355fce583 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -3574,14 +3574,22 @@ def equal_levels(self, other: MultiIndex) -> bool:
 
     def _union(self, other, sort) -> MultiIndex:
         other, result_names = self._convert_can_do_setop(other)
+        # ``other`` is not necessarily a MultiIndex; if it is not, conservatively
+        # assume it may hold missing values instead of inspecting its codes.
+        both_may_have_nans = any(-1 in code for code in self.codes) and (
+            not isinstance(other, MultiIndex)
+            or any(-1 in code for code in other.codes)
+        )
+        if both_may_have_nans or self.has_duplicates or other.has_duplicates:
+            # The generic Index._union is only needed when both sides hold
+            # missing values (a -1 code) or either side has duplicates.
+            result = super()._union(other, sort)
+        else:
+            # Otherwise fast_unique_multiple is faster.
+            rvals = other._values.astype(object, copy=False)
+            result = lib.fast_unique_multiple([self._values, rvals], sort=sort)
 
-        # We could get here with CategoricalIndex other
-        rvals = other._values.astype(object, copy=False)
-        uniq_tuples = lib.fast_unique_multiple([self._values, rvals], sort=sort)
-
-        return MultiIndex.from_arrays(
-            zip(*uniq_tuples), sortorder=0, names=result_names
-        )
+        return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names)
 
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         return is_object_dtype(dtype)
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index 0b59e832ce3a8..eb456bee39dbf 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -3,7 +3,9 @@
 
 import pandas as pd
 from pandas import (
+    CategoricalIndex,
     Index,
+    IntervalIndex,
     MultiIndex,
     Series,
 )
@@ -508,3 +510,26 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture):
     result = mi1.intersection(mi2)
     expected = MultiIndex.from_arrays([[3.0, nulls_fixture], [1, 2]])
     tm.assert_index_equal(result, expected)
+
+
+def test_union_nan_got_duplicated():
+    # GH#38977
+    mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]])
+    mi2 = MultiIndex.from_arrays([[1.0, np.nan, 3.0], [2, 3, 4]])
+    result = mi1.union(mi2)
+    tm.assert_index_equal(result, mi2)
+
+
+def test_union_duplicates(index):
+    # GH#38977
+    if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)):
+        # No duplicates in empty indexes
+        return
+    values = index.unique().values.tolist()
+    mi1 = MultiIndex.from_arrays([values, [1] * len(values)])
+    mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)])
+    result = mi1.union(mi2)
+    tm.assert_index_equal(result, mi2.sort_values())
+
+    result = mi2.union(mi1)
+    tm.assert_index_equal(result, mi2.sort_values())
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
index 67bd5b309b634..5b7e90fe16d8f 100644
--- a/pandas/tests/libs/test_lib.py
+++ b/pandas/tests/libs/test_lib.py
@@ -2,7 +2,6 @@
 import pytest
 
 from pandas._libs import (
-    Timestamp,
     lib,
     writers as libwriters,
 )
@@ -43,11 +42,6 @@ def test_fast_unique_multiple_list_gen_sort(self):
         out = lib.fast_unique_multiple_list_gen(gen, sort=False)
         tm.assert_numpy_array_equal(np.array(out), expected)
 
-    def test_fast_unique_multiple_unsortable_runtimewarning(self):
-        arr = [np.array(["foo", Timestamp("2000")])]
-        with tm.assert_produces_warning(RuntimeWarning):
-            lib.fast_unique_multiple(arr, sort=None)
-
 
 class TestIndexing:
     def test_maybe_indices_to_slice_left_edge(self):