From eb34853bef992b3ffc1477f0f2d63fd5d40cbac8 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 20:17:49 +0100 Subject: [PATCH 01/13] Multiindex.union --- pandas/core/indexes/multi.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1da355c31987e..31c46c47fdb01 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3522,14 +3522,9 @@ def equal_levels(self, other) -> bool: def _union(self, other, sort): other, result_names = self._convert_can_do_setop(other) + result = super()._union(other, sort) - # We could get here with CategoricalIndex other - rvals = other._values.astype(object, copy=False) - uniq_tuples = lib.fast_unique_multiple([self._values, rvals], sort=sort) - - return MultiIndex.from_arrays( - zip(*uniq_tuples), sortorder=0, names=result_names - ) + return MultiIndex.from_tuples(result, sortorder=0, names=result_names) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) From eab8d1f8b298c9b26548deadfa16d21f73ba76fd Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 20:16:54 +0100 Subject: [PATCH 02/13] Remove old code --- pandas/_libs/lib.pyx | 48 ----------------------------------- pandas/tests/libs/test_lib.py | 5 ---- 2 files changed, 53 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0f796b2b65dff..45b500c8d9400 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -250,54 +250,6 @@ def item_from_zerodim(val: object) -> object: return val -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True): - """ - Generate a list of unique values from a list of arrays. - - Parameters - ---------- - list : array-like - List of array-like objects. - sort : bool - Whether or not to sort the resulting unique list. - - Returns - ------- - list of unique values - """ - cdef: - ndarray[object] buf - Py_ssize_t k = len(arrays) - Py_ssize_t i, j, n - list uniques = [] - dict table = {} - object val, stub = 0 - - for i in range(k): - buf = arrays[i] - n = len(buf) - for j in range(n): - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) - - if sort is None: - try: - uniques.sort() - except TypeError: - warnings.warn( - "The values in the array are unorderable. " - "Pass `sort=False` to suppress this warning.", - RuntimeWarning, - ) - pass - - return uniques - - @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index da3e18c8d9634..6a55befec95ed 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -39,11 +39,6 @@ def test_fast_unique_multiple_list_gen_sort(self): out = lib.fast_unique_multiple_list_gen(gen, sort=False) tm.assert_numpy_array_equal(np.array(out), expected) - def test_fast_unique_multiple_unsortable_runtimewarning(self): - arr = [np.array(["foo", Timestamp("2000")])] - with tm.assert_produces_warning(RuntimeWarning): - lib.fast_unique_multiple(arr, sort=None) - class TestIndexing: def test_maybe_indices_to_slice_left_edge(self): From 693da3861b742497402b76df3ff2193adf923211 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 20:38:54 +0100 Subject: [PATCH 03/13] Add testcases --- pandas/tests/indexes/multi/test_setops.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index d5b29527ee08e..95f43cf5fbaf8 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import Index, MultiIndex, Series +from pandas import Index, MultiIndex, Series, IntervalIndex, CategoricalIndex import pandas._testing as tm @@ -483,3 +483,24 @@ def test_intersection_different_names(): mi2 = MultiIndex.from_arrays([[1], [3]]) result = mi.intersection(mi2) tm.assert_index_equal(result, mi2) + + +def test_union_nan_got_duplicated(): + # GH + mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]]) + mi2 = MultiIndex.from_arrays([[1.0, np.nan, 3.0], [2, 3, 4]]) + result = mi1.union(mi2) + tm.assert_index_equal(result, mi2) + + +def test_union_duplicates(index): + # GH + # Index has to be sorted as of now. + if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): + # No duplicates in empty indexes + return + values = index.unique().sort_values().values.tolist() + mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) + mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) + result = mi1.union(mi2) + tm.assert_index_equal(result, mi2.sort_values()) From b5c22645e8a597e7268d215f7479ca40065ba984 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 20:43:05 +0100 Subject: [PATCH 04/13] Add gh reference and whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/tests/indexes/multi/test_setops.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b4b98ec0403a8..252a9a2aec817 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -256,6 +256,7 @@ MultiIndex - Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`) - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differntly ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`) +- Bug in :meth:`MultiIndex.union` dropping duplicates from result (:issue:`38977`) I/O ^^^ diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 95f43cf5fbaf8..56b0226e311d4 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -486,7 +486,7 @@ def test_intersection_different_names(): def test_union_nan_got_duplicated(): - # GH + # GH#38977 mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]]) mi2 = MultiIndex.from_arrays([[1.0, np.nan, 3.0], [2, 3, 4]]) result = mi1.union(mi2) @@ -494,7 +494,7 @@ def test_union_nan_got_duplicated(): def test_union_duplicates(index): - # GH + # GH#38977 # Index has to be sorted as of now. if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): # No duplicates in empty indexes From f15bac53b8205ccb4a9ec05a7022a13954d7e0e7 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 20:44:08 +0100 Subject: [PATCH 05/13] Reorder imports --- pandas/tests/indexes/multi/test_setops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 56b0226e311d4..2f9557d78015d 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import Index, MultiIndex, Series, IntervalIndex, CategoricalIndex +from pandas import CategoricalIndex, Index, IntervalIndex, MultiIndex, Series import pandas._testing as tm From 661f0159e587f749762c9429b45f6812335c3a0b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 20:51:15 +0100 Subject: [PATCH 06/13] Remove import --- pandas/tests/libs/test_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 6a55befec95ed..f95d7dbc9a8b1 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas._libs import Timestamp, lib, writers as libwriters +from pandas._libs import lib, writers as libwriters from pandas import Index import pandas._testing as tm From d83f3c40c69059254af0d800257bc17e32c7715f Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 10 Apr 2021 00:54:05 +0200 Subject: [PATCH 07/13] Fix bug in index.union with duplicates and not a subset of each other --- pandas/core/indexes/base.py | 6 +++--- pandas/tests/indexes/test_setops.py | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a0727500ecc81..2b7425e16e247 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2979,8 +2979,8 @@ def _union(self, other: Index, sort): value_list.extend([x for x in rvals if x not in value_set]) return Index(value_list)._values # do type inference here - elif not other.is_unique and not self.is_unique: - # self and other both have duplicates + elif not other.is_unique: + # other has duplicates # error: Argument 1 to "union_with_duplicates" has incompatible type # "Union[ExtensionArray, ndarray]"; expected "ndarray" @@ -2989,7 +2989,7 @@ def _union(self, other: Index, sort): result = algos.union_with_duplicates(lvals, rvals) # type: ignore[arg-type] return _maybe_try_sort(result, sort) - # Either other or self is not unique + # Self may have duplicates # find indexes of things in "other" that are not in "self" if self.is_unique: indexer = self.get_indexer(other) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 5cff23943b57d..8cadbd1a25864 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -579,6 +579,29 @@ def test_union_nan_in_both(dup): tm.assert_index_equal(result, expected) +@pytest.mark.parametrize( + "cls", + [ + Int64Index, + Float64Index, + DatetimeIndex, + TimedeltaIndex, + lambda x: Index(x, dtype=object), + ], +) +def test_union_with_duplicate_index_not_subset_and_non_monotonic(cls): + # GH#36289 + a = cls([1, 0, 0, 2]) + b = cls([0, 1]) + expected = cls([0, 0, 1, 2]) + + result = a.union(b) + tm.assert_index_equal(result, expected) + + result = a.union(b) + tm.assert_index_equal(result, expected) + + class TestSetOpsUnsorted: # These may eventually belong in a dtype-specific test_setops, or # parametrized over a more general fixture From bb855f2addcb36255c9d50479adba466fcaaa31c Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 10 Apr 2021 00:58:50 +0200 Subject: [PATCH 08/13] Adjust tests to cover all cases --- pandas/tests/indexes/multi/test_setops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 8ec4f3ea0aa34..ef2501abc8dfc 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -510,11 +510,10 @@ def test_union_nan_got_duplicated(): def test_union_duplicates(index): # GH#38977 - # Index has to be sorted as of now. if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)): # No duplicates in empty indexes return - values = index.unique().sort_values().values.tolist() + values = index.unique().values.tolist() mi1 = MultiIndex.from_arrays([values, [1] * len(values)]) mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) result = mi1.union(mi2) From 52b40cf948cd5013aafd0a9100beea497197028d Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 13 Apr 2021 18:09:39 +0200 Subject: [PATCH 09/13] Move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3496f6530fb5f..1e08908a26819 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -672,7 +672,7 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`) +- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`38745`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) @@ -714,7 +714,6 @@ MultiIndex - Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in result (:issue:`38623`) - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`) -- Bug in :meth:`MultiIndex.union` dropping duplicates from result (:issue:`38977`) I/O ^^^ From f77120cc5ff93943f619bdff8eb659393dbbbf26 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 14 Apr 2021 20:22:58 +0200 Subject: [PATCH 10/13] Adjust test cases --- pandas/tests/indexes/multi/test_setops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index ef2501abc8dfc..ecd4b4c384ccd 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -518,3 +518,6 @@ def test_union_duplicates(index): mi2 = MultiIndex.from_arrays([[values[0]] + values, [1] * (len(values) + 1)]) result = mi1.union(mi2) tm.assert_index_equal(result, mi2.sort_values()) + + result = mi2.union(mi1) + tm.assert_index_equal(result, mi2.sort_values()) From 3d62a2db948c97b1c5aedaf39a70a1fbb3ec45ad Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 1 May 2021 01:32:33 +0200 Subject: [PATCH 11/13] Use from arrays --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3041a94c20e48..5a3eb6010d63d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3570,7 +3570,7 @@ def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) result = super()._union(other, sort) - return MultiIndex.from_tuples(result, sortorder=0, names=result_names) + return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) From 3a82a453b079104186dd7457cbd69c0eb03fdf7b Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 1 May 2021 18:40:30 +0200 Subject: [PATCH 12/13] Add old logic partly back in --- pandas/core/indexes/multi.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5a3eb6010d63d..6a0b5dd1463f3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3568,7 +3568,11 @@ def equal_levels(self, other: MultiIndex) -> bool: def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) - result = super()._union(other, sort) + if self.hasnans or other.hasnans: + result = super()._union(other, sort) + else: + rvals = other._values.astype(object, copy=False) + result = lib.fast_unique_multiple([self._values, rvals], sort=sort) return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) From 692272cb48ceb9011a1be6719796e8baa60896b5 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 1 May 2021 18:51:54 +0200 Subject: [PATCH 13/13] Only dispatch up in certain cases --- pandas/_libs/lib.pyx | 48 ++++++++++++++++++++++++++++++++++++ pandas/core/indexes/multi.py | 9 ++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 2e6a50e29c1c1..34b4e2c313d3a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -268,6 +268,54 @@ def item_from_zerodim(val: object) -> object: return val +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple(list arrays, sort: bool = True): + """ + Generate a list of unique values from a list of arrays. + + Parameters + ---------- + list : array-like + List of array-like objects. + sort : bool + Whether or not to sort the resulting unique list. + + Returns + ------- + list of unique values + """ + cdef: + ndarray[object] buf + Py_ssize_t k = len(arrays) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i in range(k): + buf = arrays[i] + n = len(buf) + for j in range(n): + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + + if sort is None: + try: + uniques.sort() + except TypeError: + warnings.warn( + "The values in the array are unorderable. " + "Pass `sort=False` to suppress this warning.", + RuntimeWarning, + ) + pass + + return uniques + + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6a0b5dd1463f3..6144cfb8a20af 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3568,7 +3568,14 @@ def equal_levels(self, other: MultiIndex) -> bool: def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) - if self.hasnans or other.hasnans: + if ( + any(-1 in code for code in self.codes) + and any(-1 in code for code in self.codes) + or self.has_duplicates + or other.has_duplicates + ): + # This is only necessary if both sides have nans or one has dups, + # fast_unique_multiple is faster result = super()._union(other, sort) else: rvals = other._values.astype(object, copy=False)