From dfe90d9732336bf1006a25c093aee22e6f9d16cf Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 5 Jan 2018 15:17:53 -0600
Subject: [PATCH 1/2] BUG: Fixed union_categoricals with unordered cats

Closes https://github.com/pandas-dev/pandas/issues/19096
---
Note: this series was restored from a copy whose line breaks and indentation
had been collapsed. Hunk line counts match the hunk headers, but the leading
whitespace of context lines is reconstructed; verify it against the target
tree, or apply with `git apply --ignore-whitespace`.

 doc/source/whatsnew/v0.23.0.txt                 |  4 +++-
 pandas/core/dtypes/concat.py                    | 11 ++++++++++-
 pandas/tests/reshape/test_union_categoricals.py |  9 +++++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 5fd7c3e217928..1f843b925a860 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -376,7 +376,9 @@ Categorical
 ^^^^^^^^^^^
 
 -
--
+- Bug in :func:`pandas.api.types.union_categoricals` returning the wrong result
+  when all the categoricals had the same categories, but in a different order.
+  This affected :func:`pandas.concat` with Categorical data (:issue:`19096`).
 -
 
 Other
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index cd98064dee86e..5e6193d673756 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -339,7 +339,16 @@ def _maybe_unwrap(x):
         # identical categories - fastpath
         categories = first.categories
         ordered = first.ordered
-        new_codes = np.concatenate([c.codes for c in to_union])
+
+        if all(first.categories.equals(other.categories)
+               for other in to_union[1:]):
+            new_codes = np.concatenate([c.codes for c in to_union])
+        else:
+            codes = [first.codes] + [_recode_for_categories(other.codes,
+                                                            other.categories,
+                                                            first.categories)
+                                     for other in to_union[1:]]
+            new_codes = np.concatenate(codes)
 
         if sort_categories and not ignore_order and ordered:
             raise TypeError("Cannot use sort_categories=True with "
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
index 3211574f834f5..8743d11118200 100644
--- a/pandas/tests/reshape/test_union_categoricals.py
+++ b/pandas/tests/reshape/test_union_categoricals.py
@@ -129,6 +129,15 @@ def test_union_categorical_same_category(self):
                           categories=['x', 'y', 'z'])
         tm.assert_categorical_equal(res, exp)
 
+    def test_union_categorical_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/19096
+        c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])
+        c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])
+        result = union_categoricals([c1, c2])
+        expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
     def test_union_categoricals_ordered(self):
         c1 = Categorical([1, 2, 3], ordered=True)
         c2 = Categorical([1, 2, 3], ordered=False)

From 4bed0d98393a4cec676d0d0ec36d817349b8f78d Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 6 Jan 2018 11:37:23 -0600
Subject: [PATCH 2/2] TST: Added concat test

---
 pandas/tests/reshape/test_concat.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index f66cb12b11210..c57f44efb447d 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -481,6 +481,15 @@ def test_concat_categorical(self):
         tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
         tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
 
+    def test_union_categorical_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/19096
+        a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']))
+        b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']))
+        result = pd.concat([a, b], ignore_index=True)
+        expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
+                                         categories=['a', 'b', 'c']))
+        tm.assert_series_equal(result, expected)
+
     def test_concat_categorical_coercion(self):
         # GH 13524
 