BUG: Index.union() inconsistent with non-unique Indexes (#36299)

phofl · web-flow · commit 9eaf63f66fea · 2021-03-03T21:01:09.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -423,6 +423,8 @@ Interval
 
 Indexing
 ^^^^^^^^
+
+- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`)
 - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
 - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
 - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1866,3 +1866,31 @@ def _sort_tuples(values: np.ndarray):
     arrays, _ = to_arrays(values, None)
     indexer = lexsort_indexer(arrays, orders=True)
     return values[indexer]
+
+
+def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
+    """
+    Compute the union of lvals and rvals, repeating each value as often as it
+    occurs in the array that holds more copies of it; NaN counts as a value.
+
+    Parameters
+    ----------
+    lvals: np.ndarray
+        left values, which are ordered in front.
+    rvals: np.ndarray
+        right values ordered after lvals.
+
+    Returns
+    -------
+    np.ndarray containing the unsorted union of both arrays
+    """
+    indexer = []
+    l_count = value_counts(lvals, dropna=False)
+    r_count = value_counts(rvals, dropna=False)
+    l_count, r_count = l_count.align(r_count, fill_value=0)
+    unique_array = unique(np.append(lvals, rvals))
+    if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals):
+        unique_array = pd_array(unique_array)
+    for i, value in enumerate(unique_array):
+        indexer += [i] * int(max(l_count[value], r_count[value]))
+    return unique_array.take(indexer)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -2939,32 +2939,44 @@ def _union(self, other: Index, sort):
         lvals = self._values
         rvals = other._values
 
-        if sort is None and self.is_monotonic and other.is_monotonic:
+        if (
+            sort is None
+            and self.is_monotonic
+            and other.is_monotonic
+            and not (self.has_duplicates and other.has_duplicates)
+        ):
+            # Both are monotonic and at most one has duplicates, so can use outer join
             try:
-                result = self._outer_indexer(lvals, rvals)[0]
+                return self._outer_indexer(lvals, rvals)[0]
             except (TypeError, IncompatibleFrequency):
                 # incomparable objects
-                result = list(lvals)
+                value_list = list(lvals)
 
                 # worth making this faster? a very unusual case
                 value_set = set(lvals)
-                result.extend([x for x in rvals if x not in value_set])
-                result = Index(result)._values  # do type inference here
-        else:
-            # find indexes of things in "other" that are not in "self"
-            if self.is_unique:
-                indexer = self.get_indexer(other)
-                missing = (indexer == -1).nonzero()[0]
-            else:
-                missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
+                value_list.extend([x for x in rvals if x not in value_set])
+                return Index(value_list)._values  # do type inference here
 
-            if len(missing) > 0:
-                other_diff = algos.take_nd(rvals, missing, allow_fill=False)
-                result = concat_compat((lvals, other_diff))
+        elif not other.is_unique and not self.is_unique:
+            # self and other both have duplicates
+            result = algos.union_with_duplicates(lvals, rvals)
+            return _maybe_try_sort(result, sort)
 
-            else:
-                result = lvals
+        # At least one of self and other is unique
+        # find indexes of things in "other" that are not in "self"
+        if self.is_unique:
+            indexer = self.get_indexer(other)
+            missing = (indexer == -1).nonzero()[0]
+        else:
+            missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
+
+        if len(missing) > 0:
+            other_diff = algos.take_nd(rvals, missing, allow_fill=False)
+            result = concat_compat((lvals, other_diff))
+        else:
+            result = lvals
 
+        if not self.is_monotonic or not other.is_monotonic:
             result = _maybe_try_sort(result, sort)
 
         return result
diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py
@@ -348,3 +348,25 @@ def test_intersection_equal_duplicates(self):
         idx_dup = idx.append(idx)
         result = idx_dup.intersection(idx_dup)
         tm.assert_index_equal(result, idx)
+
+    def test_union_duplicates(self):
+        # GH#36289
+        idx = pd.period_range("2011-01-01", periods=2)
+        idx_dup = idx.append(idx)
+
+        idx2 = pd.period_range("2011-01-02", periods=2)
+        idx2_dup = idx2.append(idx2)
+        result = idx_dup.union(idx2_dup)
+
+        expected = PeriodIndex(
+            [
+                "2011-01-01",
+                "2011-01-01",
+                "2011-01-02",
+                "2011-01-02",
+                "2011-01-03",
+                "2011-01-03",
+            ],
+            freq="D",
+        )
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
@@ -501,6 +501,84 @@ def check_intersection_commutative(left, right):
     assert idx.intersection(idx_non_unique).is_unique
 
 
+@pytest.mark.parametrize(
+    "cls",
+    [
+        Int64Index,
+        Float64Index,
+        DatetimeIndex,
+        CategoricalIndex,
+        lambda x: CategoricalIndex(x, categories=set(x)),
+        TimedeltaIndex,
+        lambda x: Index(x, dtype=object),
+        UInt64Index,
+    ],
+)
+def test_union_duplicate_index_subsets_of_each_other(cls):
+    # GH#31326
+    a = cls([1, 2, 2, 3])
+    b = cls([3, 3, 4])
+    expected = cls([1, 2, 2, 3, 3, 4])
+    if isinstance(a, CategoricalIndex):
+        expected = Index([1, 2, 2, 3, 3, 4])
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+    result = a.union(b, sort=False)
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "cls",
+    [
+        Int64Index,
+        Float64Index,
+        DatetimeIndex,
+        CategoricalIndex,
+        TimedeltaIndex,
+        lambda x: Index(x, dtype=object),
+    ],
+)
+def test_union_with_duplicate_index_and_non_monotonic(cls):
+    # GH#36289
+    a = cls([1, 0, 0])
+    b = cls([0, 1])
+    expected = cls([0, 0, 1])
+
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+
+    result = a.union(b)
+    tm.assert_index_equal(result, expected)
+
+
+def test_union_duplicate_index_different_dtypes():
+    # GH#36289
+    a = Index([1, 2, 2, 3])
+    b = Index(["1", "0", "0"])
+    expected = Index([1, 2, 2, 3, "1", "0", "0"])
+    result = a.union(b, sort=False)
+    tm.assert_index_equal(result, expected)
+
+
+def test_union_same_value_duplicated_in_both():
+    # GH#36289
+    a = Index([0, 0, 1])
+    b = Index([0, 0, 1, 2])
+    result = a.union(b)
+    expected = Index([0, 0, 1, 2])
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("dup", [1, np.nan])
+def test_union_nan_in_both(dup):
+    # GH#36289
+    a = Index([np.nan, 1, 2, 2])
+    b = Index([np.nan, dup, 1, 2])
+    result = a.union(b, sort=False)
+    expected = Index([np.nan, dup, 1.0, 2.0, 2.0])
+    tm.assert_index_equal(result, expected)
+
+
 class TestSetOpsUnsorted:
     # These may eventually belong in a dtype-specific test_setops, or
     #  parametrized over a more general fixture
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -2416,3 +2416,12 @@ def test_diff_low_precision_int(self, dtype):
         result = algos.diff(arr, 1)
         expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
         tm.assert_numpy_array_equal(result, expected)
+
+
+def test_union_with_duplicates():
+    # GH#36289
+    lvals = np.array([3, 1, 3, 4])
+    rvals = np.array([2, 3, 1, 1])
+    result = algos.union_with_duplicates(lvals, rvals)
+    expected = np.array([3, 3, 1, 1, 4, 2])
+    tm.assert_numpy_array_equal(result, expected)