pandas-dev · chris-b1 · Jun 4, 2016 · Jun 4, 2016 · Jun 5, 2016 · Jun 7, 2016
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -1,4 +1,8 @@
 from .pandas_vb_common import *
+try:
+    from pandas.types.concat import union_categoricals
+except ImportError:
+    pass
 import string
 
 
@@ -12,6 +16,17 @@ def time_concat_categorical(self):
         concat([self.s, self.s])
 
 
+class union_categorical(object):
+    """Benchmark union_categoricals on two large unordered Categoricals.
+
+    Each input holds 6,000,000 single-character strings; the two inputs
+    share some categories ('b', 'c', 'd') and each has categories the
+    other lacks.
+    """
+    goal_time = 0.2
+
+    def setup(self):
+        # build the inputs once so that only the union itself is timed
+        repeats = 1000000
+        left_values = list('aabbcd') * repeats
+        right_values = list('bbcdjk') * repeats
+        self.a = pd.Categorical(left_values)
+        self.b = pd.Categorical(right_values)
+
+    def time_union_categorical(self):
+        pair = [self.a, self.b]
+        union_categoricals(pair)
+
+
 class categorical_value_counts(object):
     goal_time = 1
 

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -648,6 +648,29 @@ In this case the categories are not the same and so an error is raised:
 
 The same applies to ``df.append(df_different)``.
 
+.. _categorical.union:
+
+Unioning
+~~~~~~~~
+
+If you want to combine categoricals that do not necessarily have
+the same categories, the `union_categorical` function will
+combine a list-like of categoricals. The new categories
+will be the union of the categories being combined.
+
+.. ipython:: python
+
+    from pandas.types.concat import union_categoricals
+    a = pd.Categorical(["b", "c"])
+    b = pd.Categorical(["a", "b"])
+    union_categoricals([a, b])
+
+.. note::
+
+   ``union_categoricals`` only works with unordered categoricals
+   and will raise a ``TypeError`` if any are ordered.
+
+
 Getting Data In/Out
 -------------------
 

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -90,7 +90,7 @@ Other enhancements
 
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
-
+- A ``union_categoricals`` function has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
 - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules.  New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -3943,6 +3943,38 @@ def f():
                                   'category', categories=list('cab'))})
         tm.assert_frame_equal(result, expected)
 
+    def test_union(self):
+        # union_categoricals should concatenate the values of its inputs
+        # and give the result the union of their categories
+        from pandas.types.concat import union_categoricals
+
+        # (values of first input, values of second input, combined values)
+        cases = [
+            (list('abc'), list('abd'), list('abcabd')),
+            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
+            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
+        ]
+        for left_values, right_values, combined in cases:
+            left = Categorical(left_values)
+            right = Categorical(right_values)
+            result = union_categoricals([left, right])
+            expected = Categorical(combined)
+            tm.assert_categorical_equal(result, expected,
+                                        ignore_order=True)
+
+        # can't be ordered
+        ordered = Categorical([0, 1.2, 2], ordered=True)
+        unordered = Categorical([2, 3.4, 4])
+        with tm.assertRaises(TypeError):
+            union_categoricals([ordered, unordered])
+
+        # must exactly match types
+        float_cats = Categorical([0, 1.2, 2])
+        int_cats = Categorical([2, 3, 4])
+        with tm.assertRaises(TypeError):
+            union_categoricals([float_cats, int_cats])
+
     def test_categorical_index_preserver(self):
 
         a = Series(np.arange(6, dtype='int64'))

diff --git a/pandas/types/concat.py b/pandas/types/concat.py
@@ -201,6 +201,46 @@ def convert_categorical(x):
         return Categorical(concatted, rawcats)
 
 
+def union_categoricals(to_union):
+    """
+    Combine list-like of Categoricals, unioning categories. All
+    must have the same dtype, and none can be ordered.
+
+    Parameters
+    ----------
+    to_union : list like of Categorical
+
+    Returns
+    -------
+    Categorical
+       A single array holding the values of every input, in input
+       order. Its categories are those of the first Categorical,
+       followed by the categories each later Categorical adds, in
+       the order ``Index.difference`` returns them. Missing values
+       (code ``-1``) stay missing.
+
+    Raises
+    ------
+    ValueError
+        If `to_union` is empty.
+    TypeError
+        If any Categorical is ordered, or if the dtypes of the
+        categories are not all the same.
+    """
+    from pandas import Index, Categorical
+
+    # materialize so the input can be scanned more than once and sliced
+    to_union = list(to_union)
+    if not to_union:
+        raise ValueError("No Categoricals to union")
+
+    if any(c.ordered for c in to_union):
+        raise TypeError("Can only combine unordered Categoricals")
+
+    first = to_union[0]
+    if not all(com.is_dtype_equal(c.categories, first.categories)
+               for c in to_union):
+        raise TypeError("dtype of categories must be the same")
+
+    # start from the first input's categories, then add whatever each
+    # later input contributes that has not been seen yet
+    cats = first.categories.tolist()
+    for c in to_union[1:]:
+        cats.extend(c.categories.difference(Index(cats)).tolist())
+    cats = Index(cats)
+
+    new_codes = []
+    for c in to_union:
+        codes = c.codes
+        # position of each of c's categories within the combined index
+        indexer = cats.get_indexer(c.categories)
+        if len(indexer) > 0:
+            # -1 marks a missing value; a plain take would wrap it around
+            # to the last category, so mask those positions back to -1
+            codes = np.where(codes == -1, -1, indexer.take(codes))
+        # with no categories at all every code is already -1: keep as is
+        new_codes.append(codes)
+    return Categorical.from_codes(np.concatenate(new_codes), cats)
+
+
 def _concat_datetime(to_concat, axis=0, typs=None):
     """
     provide concatenation of an datetimelike array of arrays each of which is a

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''):
 
 
 def assert_categorical_equal(left, right, check_dtype=True,
-                             obj='Categorical'):
+                             obj='Categorical', ignore_order=False):
     assertIsInstance(left, pd.Categorical, '[Categorical] ')
     assertIsInstance(right, pd.Categorical, '[Categorical] ')
 
-    assert_index_equal(left.categories, right.categories,
-                       obj='{0}.categories'.format(obj))
+    if ignore_order:
+        assert_index_equal(left.categories.sort_values(),
+                           right.categories.sort_values(),
+                           obj='{0}.categories'.format(obj))
+    else:
+        assert_index_equal(left.categories, right.categories,
+                           obj='{0}.categories'.format(obj))
     assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
                              obj='{0}.codes'.format(obj))