diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 41db72612a66b..9fc111eb203e3 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -419,6 +419,8 @@ Interval
 
 Indexing
 ^^^^^^^^
+
+- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`)
 - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
 - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
 - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 74fb0e2bd54fb..04eef635dc79b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1866,3 +1866,31 @@ def _sort_tuples(values: np.ndarray):
     arrays, _ = to_arrays(values, None)
     indexer = lexsort_indexer(arrays, orders=True)
     return values[indexer]
+
+
+def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
+    """
+    Extract the union of lvals and rvals, respecting duplicates and NaNs in
+    both arrays.
+
+    Parameters
+    ----------
+    lvals : np.ndarray
+        Left values, which are ordered in front.
+    rvals : np.ndarray
+        Right values, ordered after lvals.
+
+    Returns
+    -------
+    np.ndarray containing the unsorted union of both arrays
+    """
+    indexer = []
+    l_count = value_counts(lvals, dropna=False)
+    r_count = value_counts(rvals, dropna=False)
+    l_count, r_count = l_count.align(r_count, fill_value=0)
+    unique_array = unique(np.append(lvals, rvals))
+    if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals):
+        unique_array = pd_array(unique_array)
+    for i, value in enumerate(unique_array):
+        indexer += [i] * int(max(l_count[value], r_count[value]))
+    return unique_array.take(indexer)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 30190ef950af5..44c9b33ae51c7 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2939,32 +2939,44 @@ def _union(self, other: Index, sort):
         lvals = self._values
         rvals = other._values
 
-        if sort is None and self.is_monotonic and other.is_monotonic:
+        if (
+            sort is None
+            and self.is_monotonic
+            and other.is_monotonic
+            and not (self.has_duplicates and other.has_duplicates)
+        ):
+            # Both are unique and monotonic, so can use outer join
             try:
-                result = self._outer_indexer(lvals, rvals)[0]
+                return self._outer_indexer(lvals, rvals)[0]
             except (TypeError, IncompatibleFrequency):
                 # incomparable objects
-                result = list(lvals)
+                value_list = list(lvals)
 
                 # worth making this faster? a very unusual case
                 value_set = set(lvals)
-                result.extend([x for x in rvals if x not in value_set])
-                result = Index(result)._values  # do type inference here
-        else:
-            # find indexes of things in "other" that are not in "self"
-            if self.is_unique:
-                indexer = self.get_indexer(other)
-                missing = (indexer == -1).nonzero()[0]
-            else:
-                missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
+                value_list.extend([x for x in rvals if x not in value_set])
+                return Index(value_list)._values  # do type inference here
 
-            if len(missing) > 0:
-                other_diff = algos.take_nd(rvals, missing, allow_fill=False)
-                result = concat_compat((lvals, other_diff))
+        elif not other.is_unique and not self.is_unique:
+            # self and other both have duplicates
+            result = algos.union_with_duplicates(lvals, rvals)
+            return _maybe_try_sort(result, sort)
 
-            else:
-                result = lvals
+        # Either other or self is not unique
+        # find indexes of things in "other" that are not in "self"
+        if self.is_unique:
+            indexer = self.get_indexer(other)
+            missing = (indexer == -1).nonzero()[0]
+        else:
+            missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
+
+        if len(missing) > 0:
+            other_diff = algos.take_nd(rvals, missing, allow_fill=False)
+            result = concat_compat((lvals, other_diff))
+        else:
+            result = lvals
 
+        if not self.is_monotonic or not other.is_monotonic:
             result = _maybe_try_sort(result, sort)
 
         return result
diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py
index f847bf58f463f..5b9528d185827 100644
--- a/pandas/tests/indexes/period/test_setops.py
+++ b/pandas/tests/indexes/period/test_setops.py
@@ -348,3 +348,25 @@ def test_intersection_equal_duplicates(self):
         idx_dup = idx.append(idx)
         result = idx_dup.intersection(idx_dup)
         tm.assert_index_equal(result, idx)
+
+    def test_union_duplicates(self):
+        # GH#36289
+        idx = pd.period_range("2011-01-01", periods=2)
+        idx_dup = idx.append(idx)
+
+        idx2 = pd.period_range("2011-01-02", periods=2)
+        idx2_dup = idx2.append(idx2)
+        result = idx_dup.union(idx2_dup)
+
+        expected = PeriodIndex(
+            [
+                "2011-01-01",
+                "2011-01-01",
+                "2011-01-02",
+                "2011-01-02",
+                "2011-01-03",
+                "2011-01-03",
+            ],
+            freq="D",
+        )
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index b2bab2e720146..5cff23943b57d 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -501,6 +501,84 @@ def check_intersection_commutative(left, right):
     assert idx.intersection(idx_non_unique).is_unique
 
 
+@pytest.mark.parametrize(
+    "cls",
+    [
+        Int64Index,
+        Float64Index,
+        DatetimeIndex,
+        CategoricalIndex,
+        lambda x: CategoricalIndex(x, categories=set(x)),
+        TimedeltaIndex,
+        lambda x: Index(x, dtype=object),
+        UInt64Index,
+    ],
+)
+def test_union_duplicate_index_subsets_of_each_other(cls):
+    # GH#31326
+    a = cls([1, 2, 2, 3])
+    b = cls([3, 3, 4])
+    expected = cls([1, 2, 2, 3, 3, 4])
+    if isinstance(a, CategoricalIndex):
+        expected = Index([1, 2, 2, 3, 3, 4])
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+    result = a.union(b, sort=False)
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "cls",
+    [
+        Int64Index,
+        Float64Index,
+        DatetimeIndex,
+        CategoricalIndex,
+        TimedeltaIndex,
+        lambda x: Index(x, dtype=object),
+    ],
+)
+def test_union_with_duplicate_index_and_non_monotonic(cls):
+    # GH#36289
+    a = cls([1, 0, 0])
+    b = cls([0, 1])
+    expected = cls([0, 0, 1])
+
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+
+
+def test_union_duplicate_index_different_dtypes():
+    # GH#36289
+    a = Index([1, 2, 2, 3])
+    b = Index(["1", "0", "0"])
+    expected = Index([1, 2, 2, 3, "1", "0", "0"])
+    result = a.union(b, sort=False)
+    tm.assert_index_equal(result, expected)
+
+
+def test_union_same_value_duplicated_in_both():
+    # GH#36289
+    a = Index([0, 0, 1])
+    b = Index([0, 0, 1, 2])
+    result = a.union(b)
+    expected = Index([0, 0, 1, 2])
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("dup", [1, np.nan])
+def test_union_nan_in_both(dup):
+    # GH#36289
+    a = Index([np.nan, 1, 2, 2])
+    b = Index([np.nan, dup, 1, 2])
+    result = a.union(b, sort=False)
+    expected = Index([np.nan, dup, 1.0, 2.0, 2.0])
+    tm.assert_index_equal(result, expected)
+
+
 class TestSetOpsUnsorted:
     # These may eventually belong in a dtype-specific test_setops, or
     # parametrized over a more general fixture
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 27201055dfa5d..26d336bee65ea 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2416,3 +2416,12 @@ def test_diff_low_precision_int(self, dtype):
         result = algos.diff(arr, 1)
         expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
         tm.assert_numpy_array_equal(result, expected)
+
+
+def test_union_with_duplicates():
+    # GH#36289
+    lvals = np.array([3, 1, 3, 4])
+    rvals = np.array([2, 3, 1, 1])
+    result = algos.union_with_duplicates(lvals, rvals)
+    expected = np.array([3, 3, 1, 1, 4, 2])
+    tm.assert_numpy_array_equal(result, expected)
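
For review context, the user-visible effect of the patch can be sketched with a short snippet. The expected values in the comments simply restate what the new tests assert (``test_union_same_value_duplicated_in_both``, ``test_union_with_duplicate_index_and_non_monotonic``, and ``test_union_duplicates`` for ``PeriodIndex``); the exact ``repr`` shown at the console depends on the index type and pandas version:

```python
import pandas as pd

# Values duplicated in both operands are kept max(left count, right count)
# times instead of being collapsed to a single occurrence.
a = pd.Index([0, 0, 1])
b = pd.Index([0, 0, 1, 2])
a.union(b)  # expected per test_union_same_value_duplicated_in_both: Index([0, 0, 1, 2])

# Non-monotonic input with duplicates (GH#36289) no longer drops the repeated 0;
# with sort=None the result is still sorted.
a = pd.Index([1, 0, 0])
b = pd.Index([0, 1])
a.union(b)  # expected per test_union_with_duplicate_index_and_non_monotonic: Index([0, 0, 1])

# The same counting rule applies to PeriodIndex and the other index types
# covered by the parametrized tests.
idx = pd.period_range("2011-01-01", periods=2)
idx2 = pd.period_range("2011-01-02", periods=2)
idx.append(idx).union(idx2.append(idx2))
# expected per test_union_duplicates:
# PeriodIndex(["2011-01-01", "2011-01-01", "2011-01-02",
#              "2011-01-02", "2011-01-03", "2011-01-03"], freq="D")
```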
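The counting rule implemented by the new ``union_with_duplicates`` helper can also be illustrated without the pandas internals. The sketch below is a re-implementation for illustration only: it substitutes ``collections.Counter`` and ``dict.fromkeys`` for ``value_counts``/``unique`` and skips the NaN handling and extension-array branch of the real helper, so it is not equivalent for missing values:

```python
from collections import Counter

import numpy as np


def union_with_duplicates_sketch(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
    # Each distinct value appears max(count in lvals, count in rvals) times,
    # ordered by first appearance across lvals followed by rvals.
    l_count = Counter(lvals.tolist())
    r_count = Counter(rvals.tolist())
    ordered_uniques = list(dict.fromkeys(lvals.tolist() + rvals.tolist()))
    out = []
    for value in ordered_uniques:
        out.extend([value] * max(l_count[value], r_count[value]))
    return np.array(out)


# Reproduces the expectation pinned down by the new test in pandas/tests/test_algos.py:
union_with_duplicates_sketch(np.array([3, 1, 3, 4]), np.array([2, 3, 1, 1]))
# -> array([3, 3, 1, 1, 4, 2])
```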