CLN: clean up groupby / categorical (#21753)

jreback · web-flow · commit e2f2e2152539 · 2018-07-06T09:03:12.000-05:00
xref #21151
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False):
 
         self._dtype = new_dtype
 
-    def _codes_for_groupby(self, sort, observed):
-        """
-        Code the categories to ensure we can groupby for categoricals.
-
-        If observed=True, we return a new Categorical with the observed
-        categories only.
-
-        If sort=False, return a copy of self, coded with categories as
-        returned by .unique(), followed by any categories not appearing in
-        the data. If sort=True, return self.
-
-        This method is needed solely to ensure the categorical index of the
-        GroupBy result has categories in the order of appearance in the data
-        (GH-8868).
-
-        Parameters
-        ----------
-        sort : boolean
-            The value of the sort parameter groupby was called with.
-        observed : boolean
-            Account only for the observed values
-
-        Returns
-        -------
-        Categorical
-            If sort=False, the new categories are set to the order of
-            appearance in codes (unless ordered=True, in which case the
-            original order is preserved), followed by any unrepresented
-            categories in the original order.
-        """
-
-        # we only care about observed values
-        if observed:
-            unique_codes = unique1d(self.codes)
-            cat = self.copy()
-
-            take_codes = unique_codes[unique_codes != -1]
-            if self.ordered:
-                take_codes = np.sort(take_codes)
-
-            # we recode according to the uniques
-            categories = self.categories.take(take_codes)
-            codes = _recode_for_categories(self.codes,
-                                           self.categories,
-                                           categories)
-
-            # return a new categorical that maps our new codes
-            # and categories
-            dtype = CategoricalDtype(categories, ordered=self.ordered)
-            return type(self)(codes, dtype=dtype, fastpath=True)
-
-        # Already sorted according to self.categories; all is fine
-        if sort:
-            return self
-
-        # sort=False should order groups in as-encountered order (GH-8868)
-        cat = self.unique()
-
-        # But for groupby to work, all categories should be present,
-        # including those missing from the data (GH-13179), which .unique()
-        # above dropped
-        cat.add_categories(
-            self.categories[~self.categories.isin(cat.categories)],
-            inplace=True)
-
-        return self.reorder_categories(cat.categories)
-
     def _set_dtype(self, dtype):
         """Internal method for directly updating the CategoricalDtype
 
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
@@ -0,0 +1,99 @@
+import numpy as np
+from pandas.core.algorithms import unique1d
+from pandas.core.arrays.categorical import (
+    _recode_for_categories, CategoricalDtype, Categorical)
+
+
+def recode_for_groupby(c, sort, observed):
+    """
+    Code the categories to ensure we can groupby for categoricals.
+
+    If observed=True, we return a new Categorical with the observed
+    categories only.
+
+    If sort=False, return a copy of c, coded with categories as
+    returned by .unique(), followed by any categories not appearing in
+    the data. If sort=True, return c.
+
+    This function is needed solely to ensure the categorical index of the
+    GroupBy result has categories in the order of appearance in the data
+    (GH-8868).
+
+    Parameters
+    ----------
+    c : Categorical
+    sort : boolean
+        The value of the sort parameter groupby was called with.
+    observed : boolean
+        Account only for the observed values
+
+    Returns
+    -------
+    New Categorical
+        If sort=False, the new categories are set to the order of
+        appearance in codes (unless ordered=True, in which case the
+        original order is preserved), followed by any unrepresented
+        categories in the original order.
+    Categorical or None
+        If we are observed, return the original categorical, otherwise None
+    """
+
+    # we only care about observed values
+    if observed:
+        unique_codes = unique1d(c.codes)
+
+        take_codes = unique_codes[unique_codes != -1]
+        if c.ordered:
+            take_codes = np.sort(take_codes)
+
+        # we recode according to the uniques
+        categories = c.categories.take(take_codes)
+        codes = _recode_for_categories(c.codes,
+                                       c.categories,
+                                       categories)
+
+        # return a new categorical that maps our new codes
+        # and categories
+        dtype = CategoricalDtype(categories, ordered=c.ordered)
+        return Categorical(codes, dtype=dtype, fastpath=True), c
+
+    # Already sorted according to c.categories; all is fine
+    if sort:
+        return c, None
+
+    # sort=False should order groups in as-encountered order (GH-8868)
+    cat = c.unique()
+
+    # But for groupby to work, all categories should be present,
+    # including those missing from the data (GH-13179), which .unique()
+    # above dropped
+    cat = cat.add_categories(
+        c.categories[~c.categories.isin(cat.categories)])
+
+    return c.reorder_categories(cat.categories), None
+
+
+def recode_from_groupby(c, sort, ci):
+    """
+    Reverse recode_for_groupby to account for sort / observed.
+
+    Parameters
+    ----------
+    c : Categorical
+    sort : boolean
+        The value of the sort parameter groupby was called with.
+    ci : CategoricalIndex
+        The codes / categories to recode
+
+    Returns
+    -------
+    CategoricalIndex
+    """
+
+    # we re-order to the original category orderings
+    if sort:
+        return ci.set_categories(c.categories)
+
+    # we are not sorting, so add unobserved to the end
+    return ci.add_categories(
+        c.categories[~c.categories.isin(ci.categories)])
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                self.all_grouper = self.grouper
-                self.grouper = self.grouper._codes_for_groupby(
-                    self.sort, observed)
+                from pandas.core.groupby.categorical import recode_for_groupby
+                self.grouper, self.all_grouper = recode_for_groupby(
+                    self.grouper, self.sort, observed)
                 categories = self.grouper.categories
 
                 # we make a CategoricalIndex out of the cat grouper
@@ -3073,17 +3073,9 @@ def labels(self):
     @cache_readonly
     def result_index(self):
         if self.all_grouper is not None:
-            all_categories = self.all_grouper.categories
-
-            # we re-order to the original category orderings
-            if self.sort:
-                return self.group_index.set_categories(all_categories)
-
-            # we are not sorting, so add unobserved to the end
-            categories = self.group_index.categories
-            return self.group_index.add_categories(
-                all_categories[~all_categories.isin(categories)])
-
+            from pandas.core.groupby.categorical import recode_from_groupby
+            return recode_from_groupby(self.all_grouper,
+                                       self.sort, self.group_index)
         return self.group_index
 
     @property