From 0c550e6b70e17d10c8f296578ce24a033e1283bc Mon Sep 17 00:00:00 2001
From: Kernc <kerncece@gmail.com>
Date: Fri, 17 Feb 2017 16:55:46 +0100
Subject: [PATCH 1/4] BUG: Fix .groupby(categorical, sort=False) failing

---
 doc/source/whatsnew/v0.20.0.txt          |  1 +
 pandas/core/groupby.py                   |  5 +++++
 pandas/tests/groupby/test_categorical.py | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index ae4a3d3c3d97f..4a86d949c9700 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -539,6 +539,7 @@ Bug Fixes
 
 - Bug in ``resample``, where a non-string ```loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`)
 
+- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`)
 
 
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ba2de295fa0a9..72d1fc96547de 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2315,6 +2315,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 # groupby
                 else:
                     cat = self.grouper.unique()
+                    all_categories = self.grouper.categories
+                    cat.add_categories(
+                        all_categories[
+                            ~all_categories.isin(cat.categories)],
+                        inplace=True)  # GH-13179
                     self.grouper = self.grouper.reorder_categories(
                         cat.categories)
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index eebd0e0f490c1..cfcb531bedab8 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self):
 
             tm.assert_frame_equal(result, expected, check_index_type=True)
 
+    def test_groupby_preserve_categories(self):
+        # GH-13179
+        categories = list('abc')
+
+        # ordered=True
+        df = DataFrame({'A': pd.Categorical(list('ba'),
+                                            categories=categories,
+                                            ordered=True)})
+        index = pd.CategoricalIndex(categories, categories, ordered=True)
+        tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
+        tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)
+
+        # ordered=False
+        df = DataFrame({'A': pd.Categorical(list('ba'),
+                                            categories=categories,
+                                            ordered=False)})
+        sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
+        nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
+                                           ordered=False)
+        tm.assert_index_equal(df.groupby('A', sort=True).first().index,
+                              sort_index)
+        tm.assert_index_equal(df.groupby('A', sort=False).first().index,
+                              nosort_index)
+
     def test_groupby_preserve_categorical_dtype(self):
         # GH13743, GH13854
         df = DataFrame({'A': [1, 2, 1, 1, 2],

From c81314690363e95f983604daa87b6823921b2df5 Mon Sep 17 00:00:00 2001
From: Kernc <kerncece@gmail.com>
Date: Fri, 17 Feb 2017 18:50:57 +0100
Subject: [PATCH 2/4] PERF: add asv for categorical grouping

---
 asv_bench/benchmarks/groupby.py | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 03ff62568b405..59f55914ea4d3 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -492,6 +492,43 @@ def time_groupby_sum(self):
         self.df.groupby(['a'])['b'].sum()
 
 
+class groupby_categorical(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        arr = np.random.random(N)
+
+        self.df = DataFrame(dict(
+            a=Categorical(np.random.randint(10000, size=N)),
+            b=arr))
+        self.df_ordered = DataFrame(dict(
+            a=Categorical(np.random.randint(10000, size=N), ordered=True),
+            b=arr))
+        self.df_extra_cat = DataFrame(dict(
+            a=Categorical(np.random.randint(100, size=N),
+                          categories=np.arange(10000)),
+            b=arr))
+
+    def time_groupby_sort(self):
+        self.df.groupby('a')['b'].count()
+
+    def time_groupby_nosort(self):
+        self.df.groupby('a', sort=False)['b'].count()
+
+    def time_groupby_ordered_sort(self):
+        self.df_ordered.groupby('a')['b'].count()
+
+    def time_groupby_ordered_nosort(self):
+        self.df_ordered.groupby('a', sort=False)['b'].count()
+
+    def time_groupby_extra_cat_sort(self):
+        self.df_extra_cat.groupby('a')['b'].count()
+
+    def time_groupby_extra_cat_nosort(self):
+        self.df_extra_cat.groupby('a', sort=False)['b'].count()
+
+
 class groupby_period(object):
     # GH 14338
     goal_time = 0.2

From 2aec326c769f5b6c55458d629043ab638ab0ae42 Mon Sep 17 00:00:00 2001
From: Kernc <kerncece@gmail.com>
Date: Sat, 18 Feb 2017 17:42:23 +0100
Subject: [PATCH 3/4] fixup! BUG: Fix .groupby(categorical, sort=False) failing

---
 doc/source/whatsnew/v0.20.0.txt | 38 +++++++++++++++++++++++++++++++--
 pandas/core/categorical.py      | 29 +++++++++++++++++++++++++
 pandas/core/groupby.py          | 23 +-------------------
 pandas/indexes/category.py      |  4 ++++
 4 files changed, 70 insertions(+), 24 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 4a86d949c9700..8682d418f2b8f 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -120,6 +120,42 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
+.. _whatsnew_0200.enhancements.groupby_categorical:
+
+GroupBy on Categoricals
+^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous version, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
+
+Now, it works.
+
+.. ipython:: python
+
+  chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
+  df = pd.DataFrame({
+      'A': np.random.randint(100),
+      'B': np.random.randint(100),
+      'C': np.random.randint(100),
+      'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
+                                    categories=chromosomes,
+                                    ordered=True)})
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+  In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
+  ---------------------------------------------------------------------------
+  ValueError                                Traceback (most recent call last)
+  ...
+  ValueError: items in new_categories are not the same as in old categories
+
+New Behavior:
+
+.. ipython:: python
+
+  df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
@@ -160,7 +196,6 @@ Other enhancements
 
 .. _whatsnew_0200.api_breaking:
 
-
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -539,7 +574,6 @@ Bug Fixes
 
 - Bug in ``resample``, where a non-string ```loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`)
 
-- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`)
 
 
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 491db2e080953..60d72d69d346b 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -602,6 +602,35 @@ def _get_categories(self):
     categories = property(fget=_get_categories, fset=_set_categories,
                           doc=_categories_doc)
 
+    def _codes_for_groupby(self, sort):
+        """
+        Return a Categorical adjusted for groupby
+
+        Parameters
+        ----------
+        sort : boolean
+            The value of the sort parameter groupby was called with.
+
+        Returns
+        -------
+        Categorical
+            In case of sort=True, self is returned with original categories
+            preserved. In case of sort=False, the new categories are set
+            to the order of appearance in codes (unless ordered=True),
+            followed by any unrepresented categories in original order.
+        """
+        cat = self
+        # sort=False should order groups in as-encountered order (GH-8868)
+        if not sort:
+            cat = self.unique()
+            # But all categories should be present, including those missing
+            # from the data (GH-13179), which .unique() dropped
+            cat.add_categories(self.categories[
+                                   ~self.categories.isin(cat.categories)],
+                               inplace=True)
+            cat = self.reorder_categories(cat.categories)
+        return cat
+
     _ordered = None
 
     def set_ordered(self, value, inplace=False):
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 72d1fc96547de..0b3fcba1c1ba5 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2300,28 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                # must have an ordered categorical
-                if self.sort:
-                    if not self.grouper.ordered:
-
-                        # technically we cannot group on an unordered
-                        # Categorical
-                        # but this a user convenience to do so; the ordering
-                        # is preserved and if it's a reduction it doesn't make
-                        # any difference
-                        pass
-
-                # fix bug #GH8868 sort=False being ignored in categorical
-                # groupby
-                else:
-                    cat = self.grouper.unique()
-                    all_categories = self.grouper.categories
-                    cat.add_categories(
-                        all_categories[
-                            ~all_categories.isin(cat.categories)],
-                        inplace=True)  # GH-13179
-                    self.grouper = self.grouper.reorder_categories(
-                        cat.categories)
+                self.grouper = self.grouper._codes_for_groupby(self.sort)
 
                 # we make a CategoricalIndex out of the cat grouper
                 # preserving the categories / ordered attributes
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
index acb2758641a62..5299a094156cd 100644
--- a/pandas/indexes/category.py
+++ b/pandas/indexes/category.py
@@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
         result.name = name
         return result
 
+    def _codes_for_groupby(self, sort):
+        """ Return a Categorical adjusted for groupby """
+        return self.values._codes_for_groupby(sort)
+
     @classmethod
     def _add_comparison_methods(cls):
         """ add in comparison methods """

From 55733b8c497b896662daf687969c80a2628beef6 Mon Sep 17 00:00:00 2001
From: Kernc <kerncece@gmail.com>
Date: Sun, 19 Feb 2017 01:21:59 +0100
Subject: [PATCH 4/4] fixup! BUG: Fix .groupby(categorical, sort=False) failing

---
 doc/source/whatsnew/v0.20.0.txt |  2 +-
 pandas/core/categorical.py      | 37 ++++++++++++++++++++-------------
 2 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 8682d418f2b8f..fb8a708849abc 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -125,7 +125,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 GroupBy on Categoricals
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-In previous version, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
+In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
 
 Now, it works.
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 60d72d69d346b..c188f04d23873 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -604,7 +604,13 @@ def _get_categories(self):
 
     def _codes_for_groupby(self, sort):
         """
-        Return a Categorical adjusted for groupby
+        If sort=False, return a copy of self, coded with categories as
+        returned by .unique(), followed by any categories not appearing in
+        the data. If sort=True, return self.
+
+        This method is needed solely to ensure the categorical index of the
+        GroupBy result has categories in the order of appearance in the data
+        (GH-8868).
 
         Parameters
         ----------
@@ -614,21 +620,24 @@ def _codes_for_groupby(self, sort):
         Returns
         -------
         Categorical
-            In case of sort=True, self is returned with original categories
-            preserved. In case of sort=False, the new categories are set
-            to the order of appearance in codes (unless ordered=True),
-            followed by any unrepresented categories in original order.
+            If sort=False, the new categories are set to the order of
+            appearance in codes (unless ordered=True, in which case the
+            original order is preserved), followed by any unrepresented
+            categories in the original order.
         """
-        cat = self
+        if sort:
+            # Already sorted according to self.categories; all is fine
+            return self
+
         # sort=False should order groups in as-encountered order (GH-8868)
-        if not sort:
-            cat = self.unique()
-            # But all categories should be present, including those missing
-            # from the data (GH-13179), which .unique() dropped
-            cat.add_categories(self.categories[
-                                   ~self.categories.isin(cat.categories)],
-                               inplace=True)
-            cat = self.reorder_categories(cat.categories)
+        cat = self.unique()
+        # But for groupby to work, all categories should be present,
+        # including those missing from the data (GH-13179), which .unique()
+        # above dropped
+        cat.add_categories(
+            self.categories[~self.categories.isin(cat.categories)],
+            inplace=True)
+        cat = self.reorder_categories(cat.categories)
         return cat
 
     _ordered = None