From 8c8f752de522b7df3c8ed5bf5a11167df39b3117 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 18 Feb 2020 18:11:11 +0000 Subject: [PATCH 1/6] add int8_t and int16_t to join_t, make sure correct index is returned for categorical joins --- pandas/_libs/join.pyx | 2 ++ pandas/core/indexes/base.py | 3 +++ pandas/tests/reshape/merge/test_merge.py | 14 ++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 093c53790cd35..6a5baa9896f05 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -242,6 +242,8 @@ ctypedef fused join_t: float64_t float32_t object + int8_t + int16_t int32_t int64_t uint64_t diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 14ee21ea5614c..c7bf9e713d452 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3822,6 +3822,9 @@ def _join_monotonic(self, other, how="left", return_indexers=False): join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) + if self._typ == "categoricalindex": + join_index = self._create_from_codes(join_index) + if return_indexers: lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4f2cd878df613..673437aa45325 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2163,3 +2163,17 @@ def test_merge_datetime_upcast_dtype(): } ) tm.assert_frame_equal(result, expected) + + +def test_categorical_non_unique_monotonic(): + # GH 28189 + df = DataFrame(range(4), columns=["value"], index=CategoricalIndex(["1"] * 4)) + df2 = DataFrame([[6]], columns=["value"], index=CategoricalIndex(["1"])) + + result = merge(df, df2, how="left", left_index=True, right_index=True) + expected = DataFrame( + [[0, 6], [1, 6], [2, 6], [3, 6]], + columns=["value_x", "value_y"], + index=CategoricalIndex(["1"] * 4), + ) + tm.assert_frame_equal(expected, result) From 98f5d9d565a828ddfdd1d0fb81465f6a64d8db82 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 19 Feb 2020 14:27:43 +0000 Subject: [PATCH 2/6] refactor --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/indexes/base.py | 3 --- pandas/core/indexes/category.py | 5 +++++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 13827e8fc4c33..c5629d0b03060 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -102,7 +102,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- +- Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`) - Datetimelike diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c7bf9e713d452..14ee21ea5614c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3822,9 +3822,6 @@ def _join_monotonic(self, other, how="left", return_indexers=False): join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) - if self._typ == "categoricalindex": - join_index = self._create_from_codes(join_index) - if return_indexers: lidx = None if lidx is None else ensure_platform_int(lidx) ridx = None if ridx is None else ensure_platform_int(ridx) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index adb2ed9211bfe..ea282227ddbf7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -28,6 +28,7 @@ from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.extension import ExtensionIndex, inherit_names import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @@ -781,6 +782,10 @@ def _delegate_method(self, name: str, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return self._create_from_codes(joined, name=name) + CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() From fd371c13b374111de9bf7ad57b464a9336e89df9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 20 Feb 2020 16:58:42 +0000 Subject: [PATCH 3/6] type --- pandas/core/indexes/category.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ea282227ddbf7..0d57f2fa476e7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -782,7 +782,9 @@ def _delegate_method(self, name: str, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) - def _wrap_joined_index(self, joined, other): + def _wrap_joined_index( + self, joined: np.ndarray, other: "CategoricalIndex" + ) -> "CategoricalIndex": name = get_op_result_name(self, other) return self._create_from_codes(joined, name=name) From fb58340330657be7d4e5831c92ee61961229055c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 21 Feb 2020 17:03:23 +0000 Subject: [PATCH 4/6] test for int8 and int16 --- pandas/tests/reshape/merge/test_merge.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 673437aa45325..194e3eadea1b5 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2165,15 +2165,21 @@ def test_merge_datetime_upcast_dtype(): tm.assert_frame_equal(result, expected) -def test_categorical_non_unique_monotonic(): +@pytest.mark.parametrize("n_categories", [5, 128]) +def test_categorical_non_unique_monotonic(n_categories): # GH 28189 - df = DataFrame(range(4), columns=["value"], index=CategoricalIndex(["1"] * 4)) - df2 = DataFrame([[6]], columns=["value"], index=CategoricalIndex(["1"])) + left_index = CategoricalIndex([0] + list(range(n_categories))) + df = DataFrame(range(n_categories + 1), columns=["value"], index=left_index) + df2 = DataFrame( + [[6]], + columns=["value"], + index=CategoricalIndex([0], categories=np.arange(n_categories)), + ) result = merge(df, df2, how="left", left_index=True, right_index=True) expected = DataFrame( - [[0, 6], [1, 6], [2, 6], [3, 6]], + [[i, 6.0] if i < 2 else [i, np.nan] for i in range(n_categories + 1)], columns=["value_x", "value_y"], - index=CategoricalIndex(["1"] * 4), + index=left_index, ) tm.assert_frame_equal(expected, result) From 44673c6dfdb6ab13bb3a4b61f4206f0572f52673 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 25 Feb 2020 10:49:04 +0000 Subject: [PATCH 5/6] add comment to test --- pandas/tests/reshape/merge/test_merge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 106170e104735..803c662a0569f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2168,6 +2168,8 @@ def test_merge_datetime_upcast_dtype(): @pytest.mark.parametrize("n_categories", [5, 128]) def test_categorical_non_unique_monotonic(n_categories): # GH 28189 + # With n_categories as 5, we test the int8 case, + # with n_categories as 128, we test the int16 case. left_index = CategoricalIndex([0] + list(range(n_categories))) df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index) df2 = DataFrame( From 4a994d53d01328bc87fa5348f2a32bbea51d78a5 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 25 Feb 2020 10:53:50 +0000 Subject: [PATCH 6/6] clarify --- pandas/tests/reshape/merge/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 803c662a0569f..d80e2e7afceef 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2168,8 +2168,8 @@ def test_merge_datetime_upcast_dtype(): @pytest.mark.parametrize("n_categories", [5, 128]) def test_categorical_non_unique_monotonic(n_categories): # GH 28189 - # With n_categories as 5, we test the int8 case, - # with n_categories as 128, we test the int16 case. + # With n_categories as 5, we test the int8 case is hit in libjoin, + # with n_categories as 128 we test the int16 case. left_index = CategoricalIndex([0] + list(range(n_categories))) df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index) df2 = DataFrame(