From c077cabf9cca523e5c66e6bccc1a80108541dfac Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 21 Dec 2020 20:13:51 +0100 Subject: [PATCH 1/5] BUG: join not working correctly with MultiIndex and one dimension categorical --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/indexes/base.py | 8 ++++++- pandas/tests/reshape/merge/test_join.py | 31 ++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7671962018144..e42279d406838 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -266,7 +266,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.join` not assigning values correctly when the index is a :class:`MultiIndex` where at least one level has dtype ``Categorical`` with categories that are not lexicographically sorted (:issue:`38502`) - Sparse diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e2bbc5e3a0e6..9f2cc9a71b407 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3683,7 +3683,13 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._join_non_unique( other, how=how, return_indexers=return_indexers ) - elif self.is_monotonic and other.is_monotonic: + elif ( + self.is_monotonic + and other.is_monotonic + and not any(is_categorical_dtype(dtype) for dtype in self.dtypes) + ): + # Categorical is monotonic if data are ordered as categories, but join can + # not handle this in case of not monotonic alphabetically GH#38502 try: return self._join_monotonic( other, how=how, return_indexers=return_indexers diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index ad07ced2fca66..0626ae39da1ea 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, 
MultiIndex, Series, concat, merge +from pandas import DataFrame, Index, MultiIndex, Series, concat, merge, Categorical import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data @@ -815,3 +815,32 @@ def test_join_cross(input_col, output_cols): result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y") expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) tm.assert_frame_equal(result, expected) + + +def test_join_multiindex_not_alphabetical_categorical(): + # GH#38502 + left = DataFrame( + { + "first": ["A", "A"], + "second": Categorical(["Y", "X"], categories=["Y", "X"]), + "value": [1, 2], + } + ).set_index(["first", "second"]) + right = DataFrame( + { + "first": ["A", "A", "B"], + "second": Categorical(["Y", "X", "X"], categories=["Y", "X"]), + "value": [3, 4, 5], + } + ).set_index(["first", "second"]) + result = left.join(right, lsuffix="_left", rsuffix="_right") + + expected = DataFrame( + { + "first": ["A", "A"], + "second": Categorical(["Y", "X"], categories=["Y", "X"]), + "value_left": [1, 2], + "value_right": [3, 4], + } + ).set_index(["first", "second"]) + tm.assert_frame_equal(result, expected) From 652bf235923d0cbe2a9db7310418d23cce2633e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 21 Dec 2020 20:29:56 +0100 Subject: [PATCH 2/5] Adjust code --- pandas/core/indexes/base.py | 7 +++++-- pandas/tests/reshape/merge/test_join.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9ed07640d673c..b8a2173ff7d86 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3686,10 +3686,13 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) elif ( self.is_monotonic and other.is_monotonic - and not any(is_categorical_dtype(dtype) for dtype in self.dtypes) + and ( + not isinstance(self, ABCMultiIndex) + or not any(is_categorical_dtype(dtype) for dtype 
in self.dtypes) + ) ): # Categorical is monotonic if data are ordered as categories, but join can - # not handle this in case of not monotonic alphabetically GH#38502 + # not handle this in case of not alphabetically monotonic GH#38502 try: return self._join_monotonic( other, how=how, return_indexers=return_indexers diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 0626ae39da1ea..05b9228c34dc0 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat, merge, Categorical +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat, merge import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data From 62908e11fa892b3ee9f5906dc06f76d809d1cf77 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 24 Dec 2020 20:47:40 +0100 Subject: [PATCH 3/5] Add tests and clarify comment --- pandas/core/indexes/base.py | 2 +- pandas/tests/reshape/merge/test_join.py | 31 +++++++++++++++++++++---- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b8a2173ff7d86..d0090f36ff464 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3692,7 +3692,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) ) ): # Categorical is monotonic if data are ordered as categories, but join can - # not handle this in case of not alphabetically monotonic GH#38502 + # not handle this in case of not lexicographically monotonic GH#38502 try: return self._join_monotonic( other, how=how, return_indexers=return_indexers diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 05b9228c34dc0..6c4bd7b498770 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ 
b/pandas/tests/reshape/merge/test_join.py @@ -2,7 +2,16 @@ import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, concat, merge +from pandas import ( + Categorical, + DataFrame, + Index, + MultiIndex, + Series, + concat, + merge, + Timestamp, +) import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data @@ -817,19 +826,31 @@ def test_join_cross(input_col, output_cols): tm.assert_frame_equal(result, expected) -def test_join_multiindex_not_alphabetical_categorical(): +@pytest.mark.parametrize( + "categories, values", + [ + (["Y", "X"], ["Y", "X", "X"]), + ([2, 1], [2, 1, 1]), + ([2.5, 1.5], [2.5, 1.5, 1.5]), + ( + [Timestamp("2020-12-31"), Timestamp("2019-12-31")], + [Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")], + ), + ], +) +def test_join_multiindex_not_alphabetical_categorical(categories, values): # GH#38502 left = DataFrame( { "first": ["A", "A"], - "second": Categorical(["Y", "X"], categories=["Y", "X"]), + "second": Categorical(categories, categories=categories), "value": [1, 2], } ).set_index(["first", "second"]) right = DataFrame( { "first": ["A", "A", "B"], - "second": Categorical(["Y", "X", "X"], categories=["Y", "X"]), + "second": Categorical(values, categories=categories), "value": [3, 4, 5], } ).set_index(["first", "second"]) @@ -838,7 +859,7 @@ def test_join_multiindex_not_alphabetical_categorical(): expected = DataFrame( { "first": ["A", "A"], - "second": Categorical(["Y", "X"], categories=["Y", "X"]), + "second": Categorical(categories, categories=categories), "value_left": [1, 2], "value_right": [3, 4], } From 8630eee61ba8cde4a15a18853bc78334f95deb95 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 24 Dec 2020 20:54:43 +0100 Subject: [PATCH 4/5] Reorder imports --- pandas/tests/reshape/merge/test_join.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_join.py 
b/pandas/tests/reshape/merge/test_join.py index 6c4bd7b498770..562fc918c5175 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -8,9 +8,9 @@ Index, MultiIndex, Series, + Timestamp, concat, merge, - Timestamp, ) import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data From ae5097bc0c9aff9c8d6d2bff0244c2eeb1c8b3da Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 24 Dec 2020 21:24:30 +0100 Subject: [PATCH 5/5] Fix od --- pandas/tests/reshape/merge/test_join.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 562fc918c5175..85963283b4cad 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -702,8 +702,8 @@ def test_join_datetime_string(self): result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ - [pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15], - [pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20], + [Timestamp("2012-08-02 00:00:00"), "J", 1, 15], + [Timestamp("2013-04-06 00:00:00"), "L", 2, 20], ], index=[2, 4], columns=["x", "y", "z", "a"],