diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 50a5bf383de77..17070b06a47d0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -535,6 +535,7 @@ Reshaping - Bug in :func:`get_dummies` that selected object and categorical dtypes but not string (:issue:`44965`) - Bug in :meth:`DataFrame.align` when aligning a :class:`MultiIndex` to a :class:`Series` with another :class:`MultiIndex` (:issue:`46001`) - Bug in concanenation with ``IntegerDtype``, or ``FloatingDtype`` arrays where the resulting dtype did not mirror the behavior of the non-nullable dtypes (:issue:`46379`) +- Bug in :func:`concat` between two :class:`DataFrame` objects whose categorical indexes have the same categories in a different order, where the values of the resulting index were returned in the wrong order (:issue:`44099`) - Sparse diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c2bcd90ff10fb..1b0a453ec41c8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -571,7 +571,10 @@ def map(self, mapper): def _concat(self, to_concat: list[Index], name: Hashable) -> Index: # if calling index is category, don't check dtype of others try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + data = np.concatenate( + [self._is_dtype_compat(c).tolist() for c in to_concat] + ) + codes = Categorical(data, categories=self.categories).codes except TypeError: # not all to_concat elements are among our categories (or NA) from pandas.core.dtypes.concat import concat_compat diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 06c00123566ba..fea859a4e3da9 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -286,6 +286,18 @@ def test_map_str(self): # See test_map.py pass + def test_append(self): + # GH 44099 + # concat indexes which have the same categories + + ci1 = 
CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]) + ci2 = CategoricalIndex(["b", "a", "c"], categories=["b", "a", "c"]) + expected = CategoricalIndex( + ["a", "b", "c", "b", "a", "c"], categories=["a", "b", "c"] + ) + result = ci1.append(ci2) + tm.assert_index_equal(result, expected) + class TestCategoricalIndex2: # Tests that are not overriding a test in Base diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index eb44b4889afb8..ef20f9f4b150c 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -18,6 +18,7 @@ import pandas as pd from pandas import ( + CategoricalIndex, DataFrame, Index, MultiIndex, @@ -502,6 +503,27 @@ def test_concat_duplicate_indices_raise(self): with pytest.raises(InvalidIndexError, match=msg): concat([df1, df2], axis=1) + def test_concat_with_categorical_indices(self): + # GH 44099 + # concat frames with categorical indices that have the same values + + df1 = DataFrame( + {"col1": ["a_val", "b_val", "c_val"]}, + index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + ) + df2 = DataFrame( + {"col1": ["b_val", "a_val", "c_val"]}, + index=CategoricalIndex(["b", "a", "c"], categories=["b", "a", "c"]), + ) + expected = DataFrame( + {"col1": ["a_val", "b_val", "c_val", "b_val", "a_val", "c_val"]}, + index=CategoricalIndex( + ["a", "b", "c", "b", "a", "c"], categories=["a", "b", "c"] + ), + ) + result = concat([df1, df2]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("pdt", [Series, DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["float"])