diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index cb97fdeccd579..2143ec01295e7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -316,6 +316,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`) - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) +- Bug in :meth:`DataFrame.join` not assigning values correctly when joining on a :class:`MultiIndex` where at least one level has ``Categorical`` dtype with categories that are not lexicographically sorted (:issue:`38502`) - Sparse diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a2e9737f305ba..cb79c2c2abd1e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3848,7 +3848,16 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._join_non_unique( other, how=how, return_indexers=return_indexers ) - elif self.is_monotonic and other.is_monotonic: + elif ( + self.is_monotonic + and other.is_monotonic + and ( + not isinstance(self, ABCMultiIndex) + or not any(is_categorical_dtype(dtype) for dtype in self.dtypes) + ) + ): + # Categorical is monotonic if data are ordered as categories, but join + # cannot handle this when they are not lexicographically monotonic GH#38502 try: return self._join_monotonic( other, how=how, return_indexers=return_indexers diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index f64283147764f..500d7000817af 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,7 +2,16 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat, merge +from pandas 
import ( + Categorical, + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + concat, + merge, +) import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data @@ -693,8 +702,8 @@ def test_join_datetime_string(self): result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ - [pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15], - [pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20], + [Timestamp("2012-08-02 00:00:00"), "J", 1, 15], + [Timestamp("2013-04-06 00:00:00"), "L", 2, 20], ], index=[2, 4], columns=["x", "y", "z", "a"], @@ -831,3 +840,44 @@ def test_join_multiindex_one_level(join_type): index=pd.MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "categories, values", + [ + (["Y", "X"], ["Y", "X", "X"]), + ([2, 1], [2, 1, 1]), + ([2.5, 1.5], [2.5, 1.5, 1.5]), + ( + [Timestamp("2020-12-31"), Timestamp("2019-12-31")], + [Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")], + ), + ], +) +def test_join_multiindex_not_alphabetical_categorical(categories, values): + # GH#38502: categorical level "second" has categories in non-sorted order + left = DataFrame( + { + "first": ["A", "A"], + "second": Categorical(categories, categories=categories), + "value": [1, 2], + } + ).set_index(["first", "second"]) + right = DataFrame( + { + "first": ["A", "A", "B"], + "second": Categorical(values, categories=categories), + "value": [3, 4, 5], + } + ).set_index(["first", "second"]) + result = left.join(right, lsuffix="_left", rsuffix="_right") + + expected = DataFrame( + { + "first": ["A", "A"], + "second": Categorical(categories, categories=categories), + "value_left": [1, 2], + "value_right": [3, 4], + } + ).set_index(["first", "second"]) + tm.assert_frame_equal(result, expected)