Skip to content

Commit c34da50

Browse files
authored
BUG/PERF: algos.union_with_duplicates losing EA dtypes (#48900)
1 parent f47d82b commit c34da50

File tree

5 files changed

+55
-10
lines changed

5 files changed

+55
-10
lines changed

asv_bench/benchmarks/index_object.py

+9
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ def time_datetime_difference_disjoint(self):
6565
self.datetime_left.difference(self.datetime_right)
6666

6767

68+
class UnionWithDuplicates:
69+
def setup(self):
70+
self.left = Index(np.repeat(np.arange(1000), 100))
71+
self.right = Index(np.tile(np.arange(500, 1500), 50))
72+
73+
def time_union_with_duplicates(self):
74+
self.left.union(self.right)
75+
76+
6877
class Range:
6978
def setup(self):
7079
self.idx_inc = RangeIndex(start=0, stop=10**6, step=3)

doc/source/whatsnew/v1.6.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ Performance improvements
140140
- Performance improvement in :meth:`MultiIndex.difference` (:issue:`48606`)
141141
- Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)
142142
- Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`)
143+
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
143144
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
144145
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
145146
- Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)
@@ -223,7 +224,7 @@ MultiIndex
223224
- Bug in :meth:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`)
224225
- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`)
225226
- Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`)
226-
- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`)
227+
- Bug in :meth:`MultiIndex.union` losing extension array dtype (:issue:`48498`, :issue:`48505`, :issue:`48900`)
227228
- Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
228229
- Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`)
229230
-

pandas/core/algorithms.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -1970,7 +1970,9 @@ def _sort_tuples(
19701970
return original_values[indexer]
19711971

19721972

1973-
def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
1973+
def union_with_duplicates(
1974+
lvals: ArrayLike | Index, rvals: ArrayLike | Index
1975+
) -> ArrayLike | Index:
19741976
"""
19751977
Extracts the union from lvals and rvals with respect to duplicates and nans in
19761978
both arrays.
@@ -1991,13 +1993,21 @@ def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
19911993
-----
19921994
Caller is responsible for ensuring lvals.dtype == rvals.dtype.
19931995
"""
1994-
indexer = []
1996+
from pandas import Series
1997+
19951998
l_count = value_counts(lvals, dropna=False)
19961999
r_count = value_counts(rvals, dropna=False)
19972000
l_count, r_count = l_count.align(r_count, fill_value=0)
1998-
unique_array = unique(concat_compat([lvals, rvals]))
1999-
unique_array = ensure_wrapped_if_datetimelike(unique_array)
2000-
2001-
for i, value in enumerate(unique_array):
2002-
indexer += [i] * int(max(l_count.at[value], r_count.at[value]))
2003-
return unique_array.take(indexer)
2001+
final_count = np.maximum(l_count.values, r_count.values)
2002+
final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
2003+
if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
2004+
unique_vals = lvals.append(rvals).unique()
2005+
else:
2006+
if isinstance(lvals, ABCIndex):
2007+
lvals = lvals._values
2008+
if isinstance(rvals, ABCIndex):
2009+
rvals = rvals._values
2010+
unique_vals = unique(concat_compat([lvals, rvals]))
2011+
unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
2012+
repeats = final_count.reindex(unique_vals).values
2013+
return np.repeat(unique_vals, repeats)

pandas/core/indexes/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3444,7 +3444,7 @@ def _union(self, other: Index, sort):
34443444

34453445
elif not other.is_unique:
34463446
# other has duplicates
3447-
result_dups = algos.union_with_duplicates(lvals, rvals)
3447+
result_dups = algos.union_with_duplicates(self, other)
34483448
return _maybe_try_sort(result_dups, sort)
34493449

34503450
# Self may have duplicates; other already checked as unique

pandas/tests/indexes/multi/test_setops.py

+25
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,31 @@ def test_union_keep_ea_dtype(any_numeric_ea_dtype, val):
578578
tm.assert_index_equal(result, expected)
579579

580580

581+
@pytest.mark.parametrize("dupe_val", [3, pd.NA])
582+
def test_union_with_duplicates_keep_ea_dtype(dupe_val, any_numeric_ea_dtype):
583+
# GH48900
584+
mi1 = MultiIndex.from_arrays(
585+
[
586+
Series([1, dupe_val, 2], dtype=any_numeric_ea_dtype),
587+
Series([1, dupe_val, 2], dtype=any_numeric_ea_dtype),
588+
]
589+
)
590+
mi2 = MultiIndex.from_arrays(
591+
[
592+
Series([2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype),
593+
Series([2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype),
594+
]
595+
)
596+
result = mi1.union(mi2)
597+
expected = MultiIndex.from_arrays(
598+
[
599+
Series([1, 2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype),
600+
Series([1, 2, dupe_val, dupe_val], dtype=any_numeric_ea_dtype),
601+
]
602+
)
603+
tm.assert_index_equal(result, expected)
604+
605+
581606
def test_union_duplicates(index, request):
582607
# GH#38977
583608
if index.empty or isinstance(index, (IntervalIndex, CategoricalIndex)):

0 commit comments

Comments
 (0)