diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 800599f728de1..4f1e67bad9a54 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -453,8 +453,11 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) - +- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result + when all the categoricals had the same categories, but in a different order. + This affected :func:`pandas.concat` with Categorical data (:issue:`19096`). +- Bug in ``Categorical.equals`` between two unordered categories with the same categories, but in a different order (:issue:`16603`) - Other diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index cd98064dee86e..5e6193d673756 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -339,7 +339,16 @@ def _maybe_unwrap(x): # identical categories - fastpath categories = first.categories ordered = first.ordered - new_codes = np.concatenate([c.codes for c in to_union]) + + if all(first.categories.equals(other.categories) + for other in to_union[1:]): + new_codes = np.concatenate([c.codes for c in to_union]) + else: + codes = [first.codes] + [_recode_for_categories(other.codes, + other.categories, + first.categories) + for other in to_union[1:]] + new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 85e3115e96f83..150410e404305 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -481,6 +481,15 @@ def test_concat_categorical(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])) + b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])) + result = pd.concat([a, b], ignore_index=True) + expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c'])) + tm.assert_series_equal(result, expected) + def test_concat_categorical_coercion(self): # GH 13524 diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 3211574f834f5..8743d11118200 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -129,6 +129,15 @@ def test_union_categorical_same_category(self): categories=['x', 'y', 'z']) tm.assert_categorical_equal(res, exp) + def test_union_categorical_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19096 + c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) + c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2]) + expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False)