Skip to content

Commit 9eaf63f

Browse files
authored
BUG: Index.union() inconsistent with non-unique Indexes (#36299)
1 parent cf306c2 commit 9eaf63f

File tree

6 files changed

+168
-17
lines changed

6 files changed

+168
-17
lines changed

doc/source/whatsnew/v1.3.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,8 @@ Interval
423423

424424
Indexing
425425
^^^^^^^^
426+
427+
- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`)
426428
- Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`)
427429
- Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`)
428430
- Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`)

pandas/core/algorithms.py

+28
Original file line numberDiff line numberDiff line change
@@ -1866,3 +1866,31 @@ def _sort_tuples(values: np.ndarray):
18661866
arrays, _ = to_arrays(values, None)
18671867
indexer = lexsort_indexer(arrays, orders=True)
18681868
return values[indexer]
1869+
1870+
1871+
def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
    """
    Compute the union of two arrays while keeping duplicated values.

    Each distinct value (missing values included) is repeated as many times
    as it occurs in whichever of the two inputs contains it more often.

    Parameters
    ----------
    lvals : np.ndarray
        Left values, which are ordered in front.
    rvals : np.ndarray
        Right values, ordered after lvals.

    Returns
    -------
    np.ndarray
        The unsorted union of both arrays, with all repeats of a value
        adjacent to each other. When either input has an extension dtype the
        result is the extension array built from the unique values instead.
    """
    # dropna=False so that missing values are counted like any other value.
    l_count = value_counts(lvals, dropna=False)
    r_count = value_counts(rvals, dropna=False)
    # Give both counters the same index; a value absent from one side counts 0.
    l_count, r_count = l_count.align(r_count, fill_value=0)

    unique_array = unique(np.append(lvals, rvals))
    if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals):
        unique_array = pd_array(unique_array)

    # ``.at`` is always a label lookup, whereas ``Series[key]`` can be
    # interpreted positionally for integer keys on some index types.
    repeats = np.asarray(
        [int(max(l_count.at[value], r_count.at[value])) for value in unique_array],
        # explicit dtype: an empty list would otherwise become float64, which
        # np.repeat rejects as a repeat count
        dtype=np.intp,
    )
    # Position i of unique_array is emitted repeats[i] times.
    indexer = np.repeat(np.arange(len(unique_array), dtype=np.intp), repeats)
    return unique_array.take(indexer)

pandas/core/indexes/base.py

+29-17
Original file line numberDiff line numberDiff line change
@@ -2939,32 +2939,44 @@ def _union(self, other: Index, sort):
29392939
lvals = self._values
29402940
rvals = other._values
29412941

2942-
if sort is None and self.is_monotonic and other.is_monotonic:
2942+
if (
2943+
sort is None
2944+
and self.is_monotonic
2945+
and other.is_monotonic
2946+
and not (self.has_duplicates and other.has_duplicates)
2947+
):
2948+
# Both are monotonic and at most one has duplicates, so can use outer join
29432949
try:
2944-
result = self._outer_indexer(lvals, rvals)[0]
2950+
return self._outer_indexer(lvals, rvals)[0]
29452951
except (TypeError, IncompatibleFrequency):
29462952
# incomparable objects
2947-
result = list(lvals)
2953+
value_list = list(lvals)
29482954

29492955
# worth making this faster? a very unusual case
29502956
value_set = set(lvals)
2951-
result.extend([x for x in rvals if x not in value_set])
2952-
result = Index(result)._values # do type inference here
2953-
else:
2954-
# find indexes of things in "other" that are not in "self"
2955-
if self.is_unique:
2956-
indexer = self.get_indexer(other)
2957-
missing = (indexer == -1).nonzero()[0]
2958-
else:
2959-
missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
2957+
value_list.extend([x for x in rvals if x not in value_set])
2958+
return Index(value_list)._values # do type inference here
29602959

2961-
if len(missing) > 0:
2962-
other_diff = algos.take_nd(rvals, missing, allow_fill=False)
2963-
result = concat_compat((lvals, other_diff))
2960+
elif not other.is_unique and not self.is_unique:
2961+
# self and other both have duplicates
2962+
result = algos.union_with_duplicates(lvals, rvals)
2963+
return _maybe_try_sort(result, sort)
29642964

2965-
else:
2966-
result = lvals
2965+
# At most one of self and other has duplicates
2966+
# find indexes of things in "other" that are not in "self"
2967+
if self.is_unique:
2968+
indexer = self.get_indexer(other)
2969+
missing = (indexer == -1).nonzero()[0]
2970+
else:
2971+
missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
2972+
2973+
if len(missing) > 0:
2974+
other_diff = algos.take_nd(rvals, missing, allow_fill=False)
2975+
result = concat_compat((lvals, other_diff))
2976+
else:
2977+
result = lvals
29672978

2979+
if not self.is_monotonic or not other.is_monotonic:
29682980
result = _maybe_try_sort(result, sort)
29692981

29702982
return result

pandas/tests/indexes/period/test_setops.py

+22
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,25 @@ def test_intersection_equal_duplicates(self):
348348
idx_dup = idx.append(idx)
349349
result = idx_dup.intersection(idx_dup)
350350
tm.assert_index_equal(result, idx)
351+
352+
def test_union_duplicates(self):
    # GH#36289
    # Two overlapping two-day ranges, each appended to itself so that both
    # operands of the union contain duplicates.
    left = pd.period_range("2011-01-01", periods=2)
    left = left.append(left)
    right = pd.period_range("2011-01-02", periods=2)
    right = right.append(right)

    # Every day covered by either operand is expected twice.
    days = ["2011-01-01", "2011-01-02", "2011-01-03"]
    expected = PeriodIndex([day for day in days for _ in range(2)], freq="D")

    result = left.union(right)
    tm.assert_index_equal(result, expected)

pandas/tests/indexes/test_setops.py

+78
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,84 @@ def check_intersection_commutative(left, right):
501501
assert idx.intersection(idx_non_unique).is_unique
502502

503503

504+
@pytest.mark.parametrize(
    "cls",
    [
        Int64Index,
        Float64Index,
        DatetimeIndex,
        CategoricalIndex,
        lambda x: CategoricalIndex(x, categories=set(x)),
        TimedeltaIndex,
        lambda x: Index(x, dtype=object),
        UInt64Index,
    ],
)
def test_union_duplicate_index_subsets_of_each_other(cls):
    # GH#31326
    left = cls([1, 2, 2, 3])
    right = cls([3, 3, 4])

    # For categorical operands the expected result is a plain Index.
    if isinstance(left, CategoricalIndex):
        expected = Index([1, 2, 2, 3, 3, 4])
    else:
        expected = cls([1, 2, 2, 3, 3, 4])

    # Same expectation with the default ``sort`` and with ``sort=False``.
    for kwargs in ({}, {"sort": False}):
        result = left.union(right, **kwargs)
        tm.assert_index_equal(result, expected)
528+
529+
530+
@pytest.mark.parametrize(
    "cls",
    [
        Int64Index,
        Float64Index,
        DatetimeIndex,
        CategoricalIndex,
        TimedeltaIndex,
        lambda x: Index(x, dtype=object),
    ],
)
def test_union_with_duplicate_index_and_non_monotonic(cls):
    # GH#36289
    # The left operand is non-monotonic and has duplicates; the right is unique.
    left = cls([1, 0, 0])
    right = cls([0, 1])
    expected = cls([0, 0, 1])

    tm.assert_index_equal(left.union(right), expected)

    # NOTE(review): this repeats the call above with identical operands.
    # Possibly the reversed order (right.union(left)) was intended - confirm
    # before changing, since the result may differ by operand order.
    tm.assert_index_equal(left.union(right), expected)
552+
553+
554+
def test_union_duplicate_index_different_dtypes():
    # GH#36289
    # Integer and string operands, each with duplicates; with sort=False the
    # left values are expected first, followed by the right values.
    int_values = [1, 2, 2, 3]
    str_values = ["1", "0", "0"]

    result = Index(int_values).union(Index(str_values), sort=False)

    tm.assert_index_equal(result, Index(int_values + str_values))
561+
562+
563+
def test_union_same_value_duplicated_in_both():
    # GH#36289
    # The value 0 is duplicated in both operands and must appear only twice.
    left = Index([0, 0, 1])
    right = Index([0, 0, 1, 2])
    expected = Index([0, 0, 1, 2])

    tm.assert_index_equal(left.union(right), expected)
570+
571+
572+
@pytest.mark.parametrize("dup", [1, np.nan])
def test_union_nan_in_both(dup):
    # GH#36289
    # Both operands contain NaN; ``dup`` duplicates either 1 or NaN on the
    # right-hand side.
    left = Index([np.nan, 1, 2, 2])
    right = Index([np.nan, dup, 1, 2])
    expected = Index([np.nan, dup, 1.0, 2.0, 2.0])

    tm.assert_index_equal(left.union(right, sort=False), expected)
580+
581+
504582
class TestSetOpsUnsorted:
505583
# These may eventually belong in a dtype-specific test_setops, or
506584
# parametrized over a more general fixture

pandas/tests/test_algos.py

+9
Original file line numberDiff line numberDiff line change
@@ -2416,3 +2416,12 @@ def test_diff_low_precision_int(self, dtype):
24162416
result = algos.diff(arr, 1)
24172417
expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
24182418
tm.assert_numpy_array_equal(result, expected)
2419+
2420+
2421+
def test_union_with_duplicates():
    # GH#36289
    # 3 occurs twice on the left and 1 occurs twice on the right, so both are
    # expected twice in the union.
    left = np.array([3, 1, 3, 4])
    right = np.array([2, 3, 1, 1])
    expected = np.array([3, 3, 1, 1, 4, 2])

    tm.assert_numpy_array_equal(algos.union_with_duplicates(left, right), expected)

0 commit comments

Comments
 (0)