pandas-dev · chris-b1 · Jul 29, 2016 · Jul 29, 2016 · Aug 2, 2016 · Aug 2, 2016
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -656,7 +656,7 @@ Unioning
 .. versionadded:: 0.19.0
 
 If you want to combine categoricals that do not necessarily have
-the same categories, the `union_categorical` function will
+the same categories, the ``union_categoricals`` function will
 combine a list-like of categoricals. The new categories
 will be the union of the categories being combined.
 
@@ -667,6 +667,14 @@ will be the union of the categories being combined.
     b = pd.Categorical(["a", "b"])
     union_categoricals([a, b])
 
+By default, the resulting categories will be ordered as
+they appear in the data. If you want the categories to
+be lexsorted, use the ``sort_categories=True`` argument.
+
+.. ipython:: python
+
+    union_categoricals([a, b], sort_categories=True)
+
 .. note::
 
    In addition to the "easy" case of combining two categoricals of the same

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -336,7 +336,7 @@ Other enhancements
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
-- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
+- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`, :issue:`13763`, :issue:`13846`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)

diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
@@ -989,6 +989,106 @@ def test_union_categoricals_ordered(self):
         with tm.assertRaisesRegexp(TypeError, msg):
             union_categoricals([c1, c2])
 
+    def test_union_categoricals_sort(self):
+        # GH 13846 - sort_categories=True lexsorts the unioned categories
+        c1 = Categorical(['x', 'y', 'z'])
+        c2 = Categorical(['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - identical but unsorted categories are resorted
+        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - skip resort (identical categories, already sorted)
+        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['x', np.nan])  # NaN values, not a category
+        c2 = Categorical([np.nan, 'b'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['x', np.nan, np.nan, 'b'],
+                               categories=['b', 'x'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([np.nan])  # all-NaN inputs -> no categories
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([np.nan, np.nan], categories=[])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([])  # empty inputs
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+        with tm.assertRaises(TypeError):  # ordered + sort is rejected
+            union_categoricals([c1, c2], sort_categories=True)
+
+    def test_union_categoricals_sort_false(self):
+        # GH 13846 - sort_categories=False keeps order of appearance
+        c1 = Categorical(['x', 'y', 'z'])
+        c2 = Categorical(['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - identical unsorted categories are left as they are
+        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['b', 'a', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - skip resort (identical categories, already sorted)
+        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['x', np.nan])  # NaN values, not a category
+        c2 = Categorical([np.nan, 'b'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['x', np.nan, np.nan, 'b'],
+                               categories=['x', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([np.nan])  # all-NaN inputs -> no categories
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([np.nan, np.nan], categories=[])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([])  # empty inputs
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['b', 'a', 'a', 'c'],
+                               categories=['b', 'a', 'c'], ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
     def test_concat_bug_1719(self):
         ts1 = tm.makeTimeSeries()
         ts2 = tm.makeTimeSeries()[::2]

diff --git a/pandas/types/concat.py b/pandas/types/concat.py
@@ -211,29 +211,31 @@ def convert_categorical(x):
         return Categorical(concatted, rawcats)
 
 
-def union_categoricals(to_union):
+def union_categoricals(to_union, sort_categories=False):
     """
     Combine list-like of Categoricals, unioning categories. All
-    must have the same dtype, and none can be ordered.
+    categories must have the same dtype.
 
     .. versionadded:: 0.19.0
 
     Parameters
     ----------
     to_union : list-like of Categoricals
+    sort_categories : boolean, default False
+        If true, resulting categories will be lexsorted, otherwise
+        they will be ordered as they appear in the data.
 
     Returns
     -------
-    Categorical
-       A single array, categories will be ordered as they
-       appear in the list
+    result : Categorical
 
     Raises
     ------
     TypeError
         - all inputs do not have the same dtype
         - all inputs do not have the same ordered property
         - all inputs are ordered and their categories are not identical
+        - sort_categories=True and Categoricals are ordered
     ValueError
-        Emmpty list of categoricals passed
+        Empty list of categoricals passed
     """
@@ -244,41 +246,53 @@ def union_categoricals(to_union):
 
     first = to_union[0]
 
-    if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
-               for c in to_union):
+    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
+               for other in to_union[1:]):
         raise TypeError("dtype of categories must be the same")
 
+    ordered = False
     if all(first.is_dtype_equal(other) for other in to_union[1:]):
-        return Categorical(np.concatenate([c.codes for c in to_union]),
-                           categories=first.categories, ordered=first.ordered,
-                           fastpath=True)
+        # identical categories - fastpath
+        categories = first.categories
+        ordered = first.ordered
+        new_codes = np.concatenate([c.codes for c in to_union])
+
+        if sort_categories and ordered:
+            raise TypeError("Cannot use sort_categories=True with "
+                            "ordered Categoricals")
+
+        if sort_categories and not categories.is_monotonic_increasing:
+            categories = categories.sort_values()
+            # indexer[old_code] must be the position of that category in
+            # the sorted index, so look up the old categories in the new
+            indexer = categories.get_indexer(first.categories)
+            new_codes = take_1d(indexer, new_codes, fill_value=-1)
     elif all(not c.ordered for c in to_union):
-        # not ordered
-        pass
+        # different categories - union and recode
+        cats = first.categories.append([c.categories for c in to_union[1:]])
+        categories = Index(cats.unique())
+        if sort_categories:
+            categories = categories.sort_values()
+
+        new_codes = []
+        for c in to_union:
+            if len(c.categories) > 0:
+                indexer = categories.get_indexer(c.categories)
+                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
+            else:
+                # must be all NaN
+                new_codes.append(c.codes)
+        new_codes = np.concatenate(new_codes)
     else:
-        # to show a proper error message
+        # ordered - to show a proper error message
         if all(c.ordered for c in to_union):
             msg = ("to union ordered Categoricals, "
                    "all categories must be the same")
             raise TypeError(msg)
         else:
             raise TypeError('Categorical.ordered must be the same')
 
-    cats = first.categories
-    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
-    categories = Index(unique_cats)
-
-    new_codes = []
-    for c in to_union:
-        if len(c.categories) > 0:
-            indexer = categories.get_indexer(c.categories)
-            new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
-        else:
-            # must be all NaN
-            new_codes.append(c.codes)
-
-    new_codes = np.concatenate(new_codes)
-    return Categorical(new_codes, categories=categories, ordered=False,
+    return Categorical(new_codes, categories=categories, ordered=ordered,
                        fastpath=True)