BUG: fix bug where appending unordered CategoricalIndex variables overrides index (#24845)

GivyBoy · GivyBoy · commit 1eb443e3d51c · 2022-08-08T08:46:17.000-04:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -390,6 +390,47 @@ upon serialization. (Related issue :issue:`12997`)
     # Roundtripping now works
     pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
 
+.. _whatsnew_150.notable_bug_fixes.concat_unordered_categoricalindex_overrides_indices:
+
+Concatenating two ``CategoricalIndex`` variables with ``ordered=False`` yields correct result
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Calling :func:`concat` on two (or more) objects indexed by a ``CategoricalIndex`` with ``ordered=False`` whose
+categories are listed in a different order would incorrectly build the resulting index (:issue:`24845`). As a result,
+when concatenating two ``DataFrame`` objects with such indexes, the index labels of the second (and any later)
+object were replaced by labels looked up in the first object's categories.
+
+.. code-block:: ipython
+
+    In [1]: category1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
+            category2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
+            df1 = pd.DataFrame({"A": [1, 2]}, index=category1)
+            df2 = pd.DataFrame({"A": [3, 4]}, index=category2)
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [2]: pd.concat((df1, df2))
+    Out[2]:
+         A
+    a    1
+    a    2
+    a    3
+    a    4
+
+*New Behavior*
+
+.. code-block:: ipython
+
+    In [3]: pd.concat((df1, df2))
+    Out[3]:
+         A
+    a    1
+    a    2
+    b    3
+    b    4
+
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.api_breaking:
 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -571,14 +571,17 @@ def map(self, mapper):
 
     def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
         # if calling index is category, don't check dtype of others
+
         try:
-            codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
+            cat = Categorical._concat_same_type(
+                [self._is_dtype_compat(c) for c in to_concat]
+            )
         except TypeError:
             # not all to_concat elements are among our categories (or NA)
             from pandas.core.dtypes.concat import concat_compat
 
             res = concat_compat([x._values for x in to_concat])
+
             return Index(res, name=name)
         else:
-            cat = self._data._from_backing_data(codes)
             return type(self)._simple_new(cat, name=name)
diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py
@@ -238,3 +238,21 @@ def test_categorical_missing_from_one_frame(self):
             index=[0, 1, 2, 0, 1, 2],
         )
         tm.assert_frame_equal(result, expected)
+
+    def test_concat_categorical_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/24845
+
+        c1 = pd.CategoricalIndex(["a", "a"], categories=["a", "b"], ordered=False)
+        c2 = pd.CategoricalIndex(["b", "b"], categories=["b", "a"], ordered=False)
+        c3 = pd.CategoricalIndex(
+            ["a", "a", "b", "b"], categories=["a", "b"], ordered=False
+        )
+
+        df1 = DataFrame({"A": [1, 2]}, index=c1)
+        df2 = DataFrame({"A": [3, 4]}, index=c2)
+
+        result = pd.concat((df1, df2))
+
+        expected = DataFrame({"A": [1, 2, 3, 4]}, index=c3)
+
+        tm.assert_frame_equal(result, expected)