Skip to content

Commit 23c20de

Browse files
authored
BUG: outer join on equal indexes not sorting (#56426)
* outer join on equal indexes to sort by default
* whatsnew
* fix test
* remove Index._join_precedence
1 parent cb56347 commit 23c20de

File tree

10 files changed

+43
-52
lines changed

10 files changed

+43
-52
lines changed

doc/source/whatsnew/v2.2.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ These are bug fixes that might have notable behavior changes.
246246

247247
In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not
248248
always return a result that followed the documented sort behavior. pandas now
249-
follows the documented sort behavior in merge and join operations (:issue:`54611`).
249+
follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`).
250250

251251
As documented, ``sort=True`` sorts the join keys lexicographically in the resulting
252252
:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the

pandas/core/computation/align.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def _align_core(terms):
110110
ax, itm = axis, items
111111

112112
if not axes[ax].is_(itm):
113-
axes[ax] = axes[ax].join(itm, how="outer")
113+
axes[ax] = axes[ax].union(itm)
114114

115115
for i, ndim in ndims.items():
116116
for axis, items in zip(range(ndim), axes):

pandas/core/indexes/base.py

+11-20
Original file line numberDiff line numberDiff line change
@@ -368,9 +368,6 @@ class Index(IndexOpsMixin, PandasObject):
368368
Index([1, 2, 3], dtype='uint8')
369369
"""
370370

371-
# To hand over control to subclasses
372-
_join_precedence = 1
373-
374371
# similar to __array_priority__, positions Index after Series and DataFrame
375372
# but before ExtensionArray. Should NOT be overridden by subclasses.
376373
__pandas_priority__ = 2000
@@ -4564,6 +4561,7 @@ def join(
45644561
Index([1, 2, 3, 4, 5, 6], dtype='int64')
45654562
"""
45664563
other = ensure_index(other)
4564+
sort = sort or how == "outer"
45674565

45684566
if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
45694567
if (self.tz is None) ^ (other.tz is None):
@@ -4614,15 +4612,6 @@ def join(
46144612
rindexer = np.array([])
46154613
return join_index, None, rindexer
46164614

4617-
if self._join_precedence < other._join_precedence:
4618-
flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
4619-
how = flip.get(how, how)
4620-
join_index, lidx, ridx = other.join(
4621-
self, how=how, level=level, return_indexers=True
4622-
)
4623-
lidx, ridx = ridx, lidx
4624-
return join_index, lidx, ridx
4625-
46264615
if self.dtype != other.dtype:
46274616
dtype = self._find_common_type_compat(other)
46284617
this = self.astype(dtype, copy=False)
@@ -4666,18 +4655,20 @@ def _join_via_get_indexer(
46664655
# Note: at this point we have checked matching dtypes
46674656

46684657
if how == "left":
4669-
join_index = self
4658+
join_index = self.sort_values() if sort else self
46704659
elif how == "right":
4671-
join_index = other
4660+
join_index = other.sort_values() if sort else other
46724661
elif how == "inner":
46734662
join_index = self.intersection(other, sort=sort)
46744663
elif how == "outer":
4675-
# TODO: sort=True here for backwards compat. It may
4676-
# be better to use the sort parameter passed into join
4677-
join_index = self.union(other)
4678-
4679-
if sort and how in ["left", "right"]:
4680-
join_index = join_index.sort_values()
4664+
try:
4665+
join_index = self.union(other, sort=sort)
4666+
except TypeError:
4667+
join_index = self.union(other)
4668+
try:
4669+
join_index = _maybe_try_sort(join_index, sort)
4670+
except TypeError:
4671+
pass
46814672

46824673
if join_index is self:
46834674
lindexer = None

pandas/core/indexes/datetimelike.py

-2
Original file line numberDiff line numberDiff line change
@@ -442,8 +442,6 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC):
442442
_is_monotonic_decreasing = Index.is_monotonic_decreasing
443443
_is_unique = Index.is_unique
444444

445-
_join_precedence = 10
446-
447445
@property
448446
def unit(self) -> str:
449447
return self._data.unit

pandas/core/reshape/merge.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -759,7 +759,7 @@ def __init__(
759759
self.on = com.maybe_make_list(on)
760760

761761
self.suffixes = suffixes
762-
self.sort = sort
762+
self.sort = sort or how == "outer"
763763

764764
self.left_index = left_index
765765
self.right_index = right_index
@@ -1694,9 +1694,6 @@ def get_join_indexers(
16941694
elif not sort and how in ["left", "outer"]:
16951695
return _get_no_sort_one_missing_indexer(left_n, False)
16961696

1697-
if not sort and how == "outer":
1698-
sort = True
1699-
17001697
# get left & right join labels and num. of levels at each location
17011698
mapped = (
17021699
_factorize_keys(left_keys[n], right_keys[n], sort=sort)

pandas/tests/indexes/multi/test_join.py

+5-8
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,11 @@ def test_join_level_corner_case(idx):
5151

5252

5353
def test_join_self(idx, join_type):
54-
joined = idx.join(idx, how=join_type)
55-
tm.assert_index_equal(joined, idx)
54+
result = idx.join(idx, how=join_type)
55+
expected = idx
56+
if join_type == "outer":
57+
expected = expected.sort_values()
58+
tm.assert_index_equal(result, expected)
5659

5760

5861
def test_join_multi():
@@ -89,12 +92,6 @@ def test_join_multi():
8992
tm.assert_numpy_array_equal(ridx, exp_ridx)
9093

9194

92-
def test_join_self_unique(idx, join_type):
93-
if idx.is_unique:
94-
joined = idx.join(idx, how=join_type)
95-
assert (idx == joined).all()
96-
97-
9895
def test_join_multi_wrong_order():
9996
# GH 25760
10097
# GH 28956

pandas/tests/indexes/test_base.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -987,8 +987,11 @@ def test_slice_keep_name(self):
987987
indirect=True,
988988
)
989989
def test_join_self(self, index, join_type):
990-
joined = index.join(index, how=join_type)
991-
assert index is joined
990+
result = index.join(index, how=join_type)
991+
expected = index
992+
if join_type == "outer":
993+
expected = expected.sort_values()
994+
tm.assert_index_equal(result, expected)
992995

993996
@pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"])
994997
def test_str_attribute(self, method):
@@ -1072,10 +1075,8 @@ def test_outer_join_sort(self):
10721075
with tm.assert_produces_warning(RuntimeWarning):
10731076
result = left_index.join(right_index, how="outer")
10741077

1075-
# right_index in this case because DatetimeIndex has join precedence
1076-
# over int64 Index
10771078
with tm.assert_produces_warning(RuntimeWarning):
1078-
expected = right_index.astype(object).union(left_index.astype(object))
1079+
expected = left_index.astype(object).union(right_index.astype(object))
10791080

10801081
tm.assert_index_equal(result, expected)
10811082

pandas/tests/indexes/test_old_base.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
period_range,
3131
)
3232
import pandas._testing as tm
33+
import pandas.core.algorithms as algos
3334
from pandas.core.arrays import BaseMaskedArray
3435

3536

@@ -653,7 +654,10 @@ def test_join_self_unique(self, join_type, simple_index):
653654
idx = simple_index
654655
if idx.is_unique:
655656
joined = idx.join(idx, how=join_type)
656-
assert (idx == joined).all()
657+
expected = simple_index
658+
if join_type == "outer":
659+
expected = algos.safe_sort(expected)
660+
tm.assert_index_equal(joined, expected)
657661

658662
def test_map(self, simple_index):
659663
# callable

pandas/tests/reshape/merge/test_merge.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -1838,6 +1838,9 @@ def test_merge_empty(self, left_empty, how, exp):
18381838
elif exp == "empty_cross":
18391839
expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64")
18401840

1841+
if how == "outer":
1842+
expected = expected.sort_values("A", ignore_index=True)
1843+
18411844
tm.assert_frame_equal(result, expected)
18421845

18431846

@@ -2913,16 +2916,13 @@ def test_merge_combinations(
29132916
expected = expected["key"].repeat(repeats.values)
29142917
expected = expected.to_frame()
29152918
elif how == "outer":
2916-
if on_index and left_unique and left["key"].equals(right["key"]):
2917-
expected = DataFrame({"key": left["key"]})
2918-
else:
2919-
left_counts = left["key"].value_counts()
2920-
right_counts = right["key"].value_counts()
2921-
expected_counts = left_counts.mul(right_counts, fill_value=1)
2922-
expected_counts = expected_counts.astype(np.intp)
2923-
expected = expected_counts.index.values.repeat(expected_counts.values)
2924-
expected = DataFrame({"key": expected})
2925-
expected = expected.sort_values("key")
2919+
left_counts = left["key"].value_counts()
2920+
right_counts = right["key"].value_counts()
2921+
expected_counts = left_counts.mul(right_counts, fill_value=1)
2922+
expected_counts = expected_counts.astype(np.intp)
2923+
expected = expected_counts.index.values.repeat(expected_counts.values)
2924+
expected = DataFrame({"key": expected})
2925+
expected = expected.sort_values("key")
29262926

29272927
if on_index:
29282928
expected = expected.set_index("key")

pandas/tests/series/test_arithmetic.py

+3
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,9 @@ def test_series_add_tz_mismatch_converts_to_utc(self):
754754
uts2 = ser2.tz_convert("utc")
755755
expected = uts1 + uts2
756756

757+
# sort since input indexes are not equal
758+
expected = expected.sort_index()
759+
757760
assert result.index.tz is timezone.utc
758761
tm.assert_series_equal(result, expected)
759762

0 commit comments

Comments
 (0)