BUG: Categorical.unique() preserves categories

kernc · AnkurDedania · commit fa3c2e4fcca3 · 2017-03-21T11:17:23.000-05:00
closes pandas-dev#13179 Author: Kernc <kerncece@gmail.com> Closes pandas-dev#15439 from kernc/Categorical.unique-nostrip-unused and squashes the following commits: 55733b8 [Kernc] fixup! BUG: Fix .groupby(categorical, sort=False) failing 2aec326 [Kernc] fixup! BUG: Fix .groupby(categorical, sort=False) failing c813146 [Kernc] PERF: add asv for categorical grouping 0c550e6 [Kernc] BUG: Fix .groupby(categorical, sort=False) failing
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -492,6 +492,43 @@ def time_groupby_sum(self):
         self.df.groupby(['a'])['b'].sum()
 
 
+class groupby_categorical(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+        arr = np.random.random(N)
+
+        self.df = DataFrame(dict(
+            a=Categorical(np.random.randint(10000, size=N)),
+            b=arr))
+        self.df_ordered = DataFrame(dict(
+            a=Categorical(np.random.randint(10000, size=N), ordered=True),
+            b=arr))
+        self.df_extra_cat = DataFrame(dict(
+            a=Categorical(np.random.randint(100, size=N),
+                          categories=np.arange(10000)),
+            b=arr))
+
+    def time_groupby_sort(self):
+        self.df.groupby('a')['b'].count()
+
+    def time_groupby_nosort(self):
+        self.df.groupby('a', sort=False)['b'].count()
+
+    def time_groupby_ordered_sort(self):
+        self.df_ordered.groupby('a')['b'].count()
+
+    def time_groupby_ordered_nosort(self):
+        self.df_ordered.groupby('a', sort=False)['b'].count()
+
+    def time_groupby_extra_cat_sort(self):
+        self.df_extra_cat.groupby('a')['b'].count()
+
+    def time_groupby_extra_cat_nosort(self):
+        self.df_extra_cat.groupby('a', sort=False)['b'].count()
+
+
 class groupby_period(object):
     # GH 14338
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -120,6 +120,39 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
+.. _whatsnew_0200.enhancements.groupy_categorical:
+
+GroupBy on Categoricals
+^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)
+
+.. ipython:: python
+
+  chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
+  df = pd.DataFrame({
+      'A': np.random.randint(100),
+      'B': np.random.randint(100),
+      'C': np.random.randint(100),
+      'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
+                                    categories=chromosomes,
+                                    ordered=True)})
+  df
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+  In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
+  ---------------------------------------------------------------------------
+  ValueError: items in new_categories are not the same as in old categories
+
+New Behavior:
+
+.. ipython:: python
+
+  df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements
@@ -163,7 +196,6 @@ Other enhancements
 
 .. _whatsnew_0200.api_breaking:
 
-
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -602,6 +602,46 @@ def _get_categories(self):
     categories = property(fget=_get_categories, fset=_set_categories,
                           doc=_categories_doc)
 
+    def _codes_for_groupby(self, sort):
+        """
+        If sort=False, return a copy of self, coded with categories as
+        returned by .unique(), followed by any categories not appearing in
+        the data. If sort=True, return self.
+
+        This method is needed solely to ensure the categorical index of the
+        GroupBy result has categories in the order of appearance in the data
+        (GH-8868).
+
+        Parameters
+        ----------
+        sort : boolean
+            The value of the sort parameter groupby was called with.
+
+        Returns
+        -------
+        Categorical
+            If sort=False, the new categories are set to the order of
+            appearance in codes (unless ordered=True, in which case the
+            original order is preserved), followed by any unrepresented
+            categories in the original order.
+        """
+
+        # Already sorted according to self.categories; all is fine
+        if sort:
+            return self
+
+        # sort=False should order groups in as-encountered order (GH-8868)
+        cat = self.unique()
+
+        # But for groupby to work, all categories should be present,
+        # including those missing from the data (GH-13179), which .unique()
+        # above dropped
+        cat.add_categories(
+            self.categories[~self.categories.isin(cat.categories)],
+            inplace=True)
+
+        return self.reorder_categories(cat.categories)
+
     _ordered = None
 
     def set_ordered(self, value, inplace=False):
@@ -1853,8 +1893,10 @@ def unique(self):
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
         cat = self.copy()
+
         # keep nan in codes
         cat._codes = unique_codes
+
         # exclude nan from indexer for categories
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2300,23 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                # must have an ordered categorical
-                if self.sort:
-                    if not self.grouper.ordered:
-
-                        # technically we cannot group on an unordered
-                        # Categorical
-                        # but this a user convenience to do so; the ordering
-                        # is preserved and if it's a reduction it doesn't make
-                        # any difference
-                        pass
-
-                # fix bug #GH8868 sort=False being ignored in categorical
-                # groupby
-                else:
-                    cat = self.grouper.unique()
-                    self.grouper = self.grouper.reorder_categories(
-                        cat.categories)
+                self.grouper = self.grouper._codes_for_groupby(self.sort)
 
                 # we make a CategoricalIndex out of the cat grouper
                 # preserving the categories / ordered attributes
diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
         result.name = name
         return result
 
+    def _codes_for_groupby(self, sort):
+        """ Return a Categorical adjusted for groupby """
+        return self.values._codes_for_groupby(sort)
+
     @classmethod
     def _add_comparison_methods(cls):
         """ add in comparison methods """
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self):
 
             tm.assert_frame_equal(result, expected, check_index_type=True)
 
+    def test_groupby_preserve_categories(self):
+        # GH-13179
+        categories = list('abc')
+
+        # ordered=True
+        df = DataFrame({'A': pd.Categorical(list('ba'),
+                                            categories=categories,
+                                            ordered=True)})
+        index = pd.CategoricalIndex(categories, categories, ordered=True)
+        tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
+        tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)
+
+        # ordered=False
+        df = DataFrame({'A': pd.Categorical(list('ba'),
+                                            categories=categories,
+                                            ordered=False)})
+        sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
+        nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
+                                           ordered=False)
+        tm.assert_index_equal(df.groupby('A', sort=True).first().index,
+                              sort_index)
+        tm.assert_index_equal(df.groupby('A', sort=False).first().index,
+                              nosort_index)
+
     def test_groupby_preserve_categorical_dtype(self):
         # GH13743, GH13854
         df = DataFrame({'A': [1, 2, 1, 1, 2],