Skip to content

Commit a97d90a

Browse files
authored
BUG: join not working correctly with MultiIndex and one dimension categorical (#38621)
1 parent 4e93eb6 commit a97d90a

File tree

3 files changed

+64
-4
lines changed

3 files changed

+64
-4
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ Reshaping
319319
- Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`)
320320
- Bug in :func:`join` over :class:`MultiIndex` returned the wrong result when one of the two indexes had only one level (:issue:`36909`)
321321
- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`)
322+
- Bug in :meth:`DataFrame.join` not assigning values correctly when joining on a :class:`MultiIndex` in which at least one level has ``Categorical`` dtype and its categories are not lexicographically sorted (:issue:`38502`)
322323
-
323324

324325
Sparse

pandas/core/indexes/base.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -3850,7 +3850,16 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False)
38503850
return self._join_non_unique(
38513851
other, how=how, return_indexers=return_indexers
38523852
)
3853-
elif self.is_monotonic and other.is_monotonic:
3853+
elif (
3854+
self.is_monotonic
3855+
and other.is_monotonic
3856+
and (
3857+
not isinstance(self, ABCMultiIndex)
3858+
or not any(is_categorical_dtype(dtype) for dtype in self.dtypes)
3859+
)
3860+
):
3861+
# A Categorical is monotonic when its data follow the order of its categories,
3862+
# but the monotonic join cannot handle data that are not also lexicographically monotonic, GH#38502
38543863
try:
38553864
return self._join_monotonic(
38563865
other, how=how, return_indexers=return_indexers

pandas/tests/reshape/merge/test_join.py

+53-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,16 @@
22
import pytest
33

44
import pandas as pd
5-
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
5+
from pandas import (
6+
Categorical,
7+
DataFrame,
8+
Index,
9+
MultiIndex,
10+
Series,
11+
Timestamp,
12+
concat,
13+
merge,
14+
)
615
import pandas._testing as tm
716
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data
817

@@ -693,8 +702,8 @@ def test_join_datetime_string(self):
693702
result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
694703
expected = DataFrame(
695704
[
696-
[pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
697-
[pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
705+
[Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
706+
[Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
698707
],
699708
index=[2, 4],
700709
columns=["x", "y", "z", "a"],
@@ -831,3 +840,44 @@ def test_join_multiindex_one_level(join_type):
831840
index=pd.MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
832841
)
833842
tm.assert_frame_equal(result, expected)
843+
844+
845+
@pytest.mark.parametrize(
846+
"categories, values",
847+
[
848+
(["Y", "X"], ["Y", "X", "X"]),
849+
([2, 1], [2, 1, 1]),
850+
([2.5, 1.5], [2.5, 1.5, 1.5]),
851+
(
852+
[Timestamp("2020-12-31"), Timestamp("2019-12-31")],
853+
[Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")],
854+
),
855+
],
856+
)
857+
def test_join_multiindex_not_alphabetical_categorical(categories, values):
858+
# GH#38502
859+
left = DataFrame(
860+
{
861+
"first": ["A", "A"],
862+
"second": Categorical(categories, categories=categories),
863+
"value": [1, 2],
864+
}
865+
).set_index(["first", "second"])
866+
right = DataFrame(
867+
{
868+
"first": ["A", "A", "B"],
869+
"second": Categorical(values, categories=categories),
870+
"value": [3, 4, 5],
871+
}
872+
).set_index(["first", "second"])
873+
result = left.join(right, lsuffix="_left", rsuffix="_right")
874+
875+
expected = DataFrame(
876+
{
877+
"first": ["A", "A"],
878+
"second": Categorical(categories, categories=categories),
879+
"value_left": [1, 2],
880+
"value_right": [3, 4],
881+
}
882+
).set_index(["first", "second"])
883+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)