From c55966226603059dbebecf6d946d81029c5e668d Mon Sep 17 00:00:00 2001
From: sinhrks <sinhrks@gmail.com>
Date: Fri, 29 Jul 2016 06:23:43 -0400
Subject: [PATCH 1/5] ENH: add sort_categories argument to union_categoricals

---
 doc/source/categorical.rst        | 10 ++++-
 pandas/tools/tests/test_concat.py | 36 ++++++++++++++++++
 pandas/types/concat.py            | 63 +++++++++++++++++--------------
 3 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
index da9c707e07552..d59ad68c9ea83 100644
--- a/doc/source/categorical.rst
+++ b/doc/source/categorical.rst
@@ -656,7 +656,7 @@ Unioning
 .. versionadded:: 0.19.0
 
 If you want to combine categoricals that do not necessarily have
-the same categories, the `union_categorical` function will
+the same categories, the ``union_categoricals`` function will
 combine a list-like of categoricals. The new categories
 will be the union of the categories being combined.
 
@@ -667,6 +667,14 @@ will be the union of the categories being combined.
     b = pd.Categorical(["a", "b"])
     union_categoricals([a, b])
 
+By default, the resulting categories will be ordered as
+they appear in the data. If you want the categories to
+be lexsorted, use the ``sort_categories=True`` argument.
+
+.. ipython:: python
+
+    union_categoricals([a, b], sort_categories=True)
+
 .. note::
 
    In addition to the "easy" case of combining two categoricals of the same
diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index dd5b4936c70bb..bcc5bd5759263 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -989,6 +989,42 @@ def test_union_categoricals_ordered(self):
         with tm.assertRaisesRegexp(TypeError, msg):
             union_categoricals([c1, c2])
 
+    def test_union_categoricals_sort(self):
+        # GH 13763
+        c1 = Categorical(['x', 'y', 'z'])
+        c2 = Categorical(['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+                               categories=['a', 'b', 'c', 'x', 'y', 'z'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath
+        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['x', np.nan])
+        c2 = Categorical([np.nan, 'b'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['x', np.nan, np.nan, 'b'],
+                               categories=['b', 'x'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([np.nan])
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([np.nan, np.nan], categories=[])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([])
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
     def test_concat_bug_1719(self):
         ts1 = tm.makeTimeSeries()
         ts2 = tm.makeTimeSeries()[::2]
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index e860ba3e201e9..a02cb98811354 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -211,22 +211,23 @@ def convert_categorical(x):
         return Categorical(concatted, rawcats)
 
 
-def union_categoricals(to_union):
+def union_categoricals(to_union, sort_categories=False):
     """
     Combine list-like of Categoricals, unioning categories. All
-    must have the same dtype, and none can be ordered.
+    categories must have the same dtype.
 
     .. versionadded:: 0.19.0
 
     Parameters
     ----------
     to_union : list-like of Categoricals
+    sort_categories : boolean, default False
+        If true, resulting categories will be lexsorted, otherwise
+        they will be ordered as they appear in the data
 
     Returns
     -------
-    Categorical
-       A single array, categories will be ordered as they
-       appear in the list
+    result : Categorical
 
     Raises
     ------
@@ -244,19 +245,39 @@ def union_categoricals(to_union):
 
     first = to_union[0]
 
-    if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
-               for c in to_union):
+    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
+               for other in to_union[1:]):
         raise TypeError("dtype of categories must be the same")
 
+    ordered = False
     if all(first.is_dtype_equal(other) for other in to_union[1:]):
-        return Categorical(np.concatenate([c.codes for c in to_union]),
-                           categories=first.categories, ordered=first.ordered,
-                           fastpath=True)
+        # identical categories - fastpath
+        categories = first.categories
+        ordered = first.ordered
+        new_codes = np.concatenate([c.codes for c in to_union])
+
+        if sort_categories:
+            categories = categories.sort_values()
+            indexer = first.categories.get_indexer(categories)
+            new_codes = take_1d(indexer, new_codes, fill_value=-1)
     elif all(not c.ordered for c in to_union):
-        # not ordered
-        pass
+        # different categories - union and recode
+        cats = first.categories.append([c.categories for c in to_union[1:]])
+        categories = Index(cats.unique())
+        if sort_categories:
+            categories = categories.sort_values()
+
+        new_codes = []
+        for c in to_union:
+            if len(c.categories) > 0:
+                indexer = categories.get_indexer(c.categories)
+                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
+            else:
+                # must be all NaN
+                new_codes.append(c.codes)
+        new_codes = np.concatenate(new_codes)
     else:
-        # to show a proper error message
+        # ordered - to show a proper error message
         if all(c.ordered for c in to_union):
             msg = ("to union ordered Categoricals, "
                    "all categories must be the same")
@@ -264,21 +285,7 @@ def union_categoricals(to_union):
         else:
             raise TypeError('Categorical.ordered must be the same')
 
-    cats = first.categories
-    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
-    categories = Index(unique_cats)
-
-    new_codes = []
-    for c in to_union:
-        if len(c.categories) > 0:
-            indexer = categories.get_indexer(c.categories)
-            new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
-        else:
-            # must be all NaN
-            new_codes.append(c.codes)
-
-    new_codes = np.concatenate(new_codes)
-    return Categorical(new_codes, categories=categories, ordered=False,
+    return Categorical(new_codes, categories=categories, ordered=ordered,
                        fastpath=True)
 
 

From eea177711b62698ff7ecc267b4bc5b01563a9d20 Mon Sep 17 00:00:00 2001
From: Chris <cbartak@gmail.com>
Date: Fri, 29 Jul 2016 17:16:23 -0500
Subject: [PATCH 2/5] skip re-sort when possible on fastpath

---
 pandas/tools/tests/test_concat.py | 8 ++++++++
 pandas/types/concat.py            | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index bcc5bd5759263..4764ec7cbcc7a 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -1006,6 +1006,14 @@ def test_union_categoricals_sort(self):
                                categories=['a', 'b', 'c'])
         tm.assert_categorical_equal(result, expected)
 
+        # fastpath - skip resort
+        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
         c1 = Categorical(['x', np.nan])
         c2 = Categorical([np.nan, 'b'])
         result = union_categoricals([c1, c2], sort_categories=True)
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index a02cb98811354..5f61f27507ee8 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -256,7 +256,7 @@ def union_categoricals(to_union, sort_categories=False):
         ordered = first.ordered
         new_codes = np.concatenate([c.codes for c in to_union])
 
-        if sort_categories:
+        if sort_categories and not categories.is_monotonic_increasing:
             categories = categories.sort_values()
             indexer = first.categories.get_indexer(categories)
             new_codes = take_1d(indexer, new_codes, fill_value=-1)

From ecb2ae934baf22df763db83b15d3c2b1b56d71e4 Mon Sep 17 00:00:00 2001
From: Chris <cbartak@gmail.com>
Date: Tue, 2 Aug 2016 05:29:04 -0500
Subject: [PATCH 3/5] more tests; handle sort with ordered

---
 pandas/tools/tests/test_concat.py | 59 ++++++++++++++++++++++++++++++-
 pandas/types/concat.py            |  7 +++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index 4764ec7cbcc7a..48516b6cfb14d 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -990,7 +990,7 @@ def test_union_categoricals_ordered(self):
             union_categoricals([c1, c2])
 
     def test_union_categoricals_sort(self):
-        # GH 13763
+        # GH 13846
         c1 = Categorical(['x', 'y', 'z'])
         c2 = Categorical(['a', 'b', 'c'])
         result = union_categoricals([c1, c2], sort_categories=True)
@@ -1033,6 +1033,63 @@ def test_union_categoricals_sort(self):
         expected = Categorical([])
         tm.assert_categorical_equal(result, expected)
 
+        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+        with tm.assertRaises(TypeError):
+            union_categoricals([c1, c2], sort_categories=True)
+
+    def test_union_categoricals_sort_false(self):
+        # GH 13846
+        c1 = Categorical(['x', 'y', 'z'])
+        c2 = Categorical(['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
+                               categories=['x', 'y', 'z', 'a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath
+        c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['b', 'a', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - skip resort
+        c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
+        c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['a', 'b', 'b', 'c'],
+                               categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['x', np.nan])
+        c2 = Categorical([np.nan, 'b'])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['x', np.nan, np.nan, 'b'],
+                               categories=['x', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([np.nan])
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([np.nan, np.nan], categories=[])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([])
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
+        c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(['b', 'a', 'a', 'c'],
+                               categories=['b', 'a', 'c'], ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+
     def test_concat_bug_1719(self):
         ts1 = tm.makeTimeSeries()
         ts2 = tm.makeTimeSeries()[::2]
diff --git a/pandas/types/concat.py b/pandas/types/concat.py
index 5f61f27507ee8..0a985dd6141ae 100644
--- a/pandas/types/concat.py
+++ b/pandas/types/concat.py
@@ -223,7 +223,7 @@ def union_categoricals(to_union, sort_categories=False):
     to_union : list-like of Categoricals
     sort_categories : boolean, default False
         If true, resulting categories will be lexsorted, otherwise
-        they will be ordered as they appear in the data
+        they will be ordered as they appear in the data.
 
     Returns
     -------
@@ -235,6 +235,7 @@ def union_categoricals(to_union, sort_categories=False):
         - all inputs do not have the same dtype
         - all inputs do not have the same ordered property
         - all inputs are ordered and their categories are not identical
+        - sort_categories=True and Categoricals are ordered
     ValueError
         Emmpty list of categoricals passed
     """
@@ -256,6 +257,10 @@ def union_categoricals(to_union, sort_categories=False):
         ordered = first.ordered
         new_codes = np.concatenate([c.codes for c in to_union])
 
+        if sort_categories and ordered:
+            raise TypeError("Cannot use sort_categories=True with "
+                            "ordered Categoricals")
+
         if sort_categories and not categories.is_monotonic_increasing:
             categories = categories.sort_values()
             indexer = first.categories.get_indexer(categories)

From ff0bb5ea4126d70f9e6f5afa2cffb94fec452074 Mon Sep 17 00:00:00 2001
From: Chris <cbartak@gmail.com>
Date: Tue, 2 Aug 2016 05:49:24 -0500
Subject: [PATCH 4/5] add follow-up PRs to whatsnew

---
 doc/source/whatsnew/v0.19.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 62091d7ff03ff..20430d3d5cc54 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -336,7 +336,7 @@ Other enhancements
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 - The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
-- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
+- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`, :issue:`13763`, :issue:`13846`)
 - ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
 - ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
 - ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)

From 3a710f082fd2e5f94866418eb624d3e3ae391791 Mon Sep 17 00:00:00 2001
From: Chris <cbartak@gmail.com>
Date: Tue, 2 Aug 2016 17:48:07 -0500
Subject: [PATCH 5/5] lint fix

---
 pandas/tools/tests/test_concat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py
index 48516b6cfb14d..968ea979f7c75 100644
--- a/pandas/tools/tests/test_concat.py
+++ b/pandas/tools/tests/test_concat.py
@@ -1089,7 +1089,6 @@ def test_union_categoricals_sort_false(self):
                                categories=['b', 'a', 'c'], ordered=True)
         tm.assert_categorical_equal(result, expected)
 
-
     def test_concat_bug_1719(self):
         ts1 = tm.makeTimeSeries()
         ts2 = tm.makeTimeSeries()[::2]