diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 59ddfe602c033..b1795cb37200c 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -695,6 +695,40 @@ The below raises ``TypeError`` because the categories are ordered and not identi Out[3]: TypeError: to union ordered Categoricals, all categories must be the same +``union_categoricals`` also works with a ``CategoricalIndex``, or ``Series`` containing +categorical data, but note that the resulting array will always be a plain ``Categorical``: + +.. ipython:: python + + a = pd.Series(["b", "c"], dtype='category') + b = pd.Series(["a", "b"], dtype='category') + union_categoricals([a, b]) + +.. note:: + + ``union_categoricals`` may recode the integer codes for categories + when combining categoricals. This is likely what you want, + but if you are relying on the exact numbering of the categories, be + aware. + + .. ipython:: python + + c1 = pd.Categorical(["b", "c"]) + c2 = pd.Categorical(["a", "b"]) + + c1 + # "b" is coded to 0 + c1.codes + + c2 + # "b" is coded to 1 + c2.codes + + c = union_categoricals([c1, c2]) + c + # "b" is coded to 0 throughout, same as c1, different from c2 + c.codes + .. 
_categorical.concat: Concatenation diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 8e20cfa83c405..f541de316661a 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -9,7 +9,7 @@ from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex, Categorical) + DatetimeIndex, Categorical, CategoricalIndex) from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, @@ -1539,10 +1539,12 @@ def test_union_categorical(self): ] for a, b, combined in data: - result = union_categoricals([Categorical(a), Categorical(b)]) - expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, - check_category_order=True) + for box in [Categorical, CategoricalIndex, Series]: + result = union_categoricals([box(Categorical(a)), + box(Categorical(b))]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) # new categories ordered by appearance s = Categorical(['x', 'y', 'z']) @@ -1771,6 +1773,25 @@ def test_union_categoricals_sort_false(self): categories=['b', 'a', 'c'], ordered=True) tm.assert_categorical_equal(result, expected) + def test_union_categorical_unwrap(self): + # GH 14173 + c1 = Categorical(['a', 'b']) + c2 = pd.Series(['b', 'c'], dtype='category') + result = union_categoricals([c1, c2]) + expected = Categorical(['a', 'b', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c2 = CategoricalIndex(c2) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + c1 = Series(c1) + result = union_categoricals([c1, c2]) + tm.assert_categorical_equal(result, expected) + + with tm.assertRaises(TypeError): + union_categoricals([c1, ['a', 'b', 'c']]) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] diff 
--git a/pandas/types/concat.py b/pandas/types/concat.py index 8bdd71348a537..827eb160c452d 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -210,14 +210,15 @@ def _concat_asobject(to_concat): def union_categoricals(to_union, sort_categories=False): """ - Combine list-like of Categoricals, unioning categories. All + Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- - to_union : list-like of Categoricals + to_union : list-like of Categorical, CategoricalIndex, + or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. @@ -236,11 +237,20 @@ def union_categoricals(to_union, sort_categories=False): ValueError Emmpty list of categoricals passed """ - from pandas import Index, Categorical + from pandas import Index, Categorical, CategoricalIndex, Series if len(to_union) == 0: raise ValueError('No Categoricals to union') + def _maybe_unwrap(x): + if isinstance(x, (CategoricalIndex, Series)): + return x.values + elif isinstance(x, Categorical): + return x + else: + raise TypeError("all components to combine must be Categorical") + + to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)