diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 705c335acfb48..822b7dc2e5caa 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -103,9 +103,10 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
+
+- Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`)
 - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`)
 -
--
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx
index dfa7aa708d681..f696591cf3bd1 100644
--- a/pandas/_libs/join.pyx
+++ b/pandas/_libs/join.pyx
@@ -254,6 +254,8 @@ ctypedef fused join_t:
     float64_t
     float32_t
     object
+    int8_t
+    int16_t
     int32_t
     int64_t
     uint64_t
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index caa6a9a93141f..11b603f3478ae 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -28,6 +28,7 @@
 from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name
 from pandas.core.indexes.extension import ExtensionIndex, inherit_names
 import pandas.core.missing as missing
+from pandas.core.ops import get_op_result_name
 
 _index_doc_kwargs = dict(ibase._index_doc_kwargs)
 _index_doc_kwargs.update(dict(target_klass="CategoricalIndex"))
@@ -785,6 +786,18 @@ def _delegate_method(self, name: str, *args, **kwargs):
             return res
         return CategoricalIndex(res, name=self.name)
 
+    def _wrap_joined_index(
+        self, joined: np.ndarray, other: "CategoricalIndex"
+    ) -> "CategoricalIndex":
+        """
+        Build the join result from ``joined`` using ``_create_from_codes``.
+
+        The result's name is resolved from ``self`` and ``other`` via
+        ``get_op_result_name``.
+        """
+        name = get_op_result_name(self, other)
+        return self._create_from_codes(joined, name=name)
+
 
 CategoricalIndex._add_numeric_methods_add_sub_disabled()
 CategoricalIndex._add_numeric_methods_disabled()
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 4f2cd878df613..d80e2e7afceef 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -2163,3 +2163,25 @@ def test_merge_datetime_upcast_dtype():
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("n_categories", [5, 128])
+def test_categorical_non_unique_monotonic(n_categories):
+    # GH 28189
+    # With n_categories as 5, we test the int8 case is hit in libjoin,
+    # with n_categories as 128 we test the int16 case.
+    left_index = CategoricalIndex([0] + list(range(n_categories)))
+    df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index)
+    df2 = DataFrame(
+        [[6]],
+        columns=["value"],
+        index=CategoricalIndex([0], categories=np.arange(n_categories)),
+    )
+
+    result = merge(df1, df2, how="left", left_index=True, right_index=True)
+    expected = DataFrame(
+        [[i, 6.0] if i < 2 else [i, np.nan] for i in range(n_categories + 1)],
+        columns=["value_x", "value_y"],
+        index=left_index,
+    )
+    tm.assert_frame_equal(result, expected)