Skip to content

Commit 69c5cf3

Browse files
committed
BUG: groupby with categorical and other columns
closes pandas-dev#14942
1 parent 2431641 commit 69c5cf3

File tree

3 files changed

+187
-185
lines changed

3 files changed

+187
-185
lines changed

doc/source/whatsnew/v0.23.0.txt

+51
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,57 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
482482
'Taxes': -200,
483483
'Net result': 300}).sort_index()
484484

485+
.. _whatsnew_0230.api_breaking.categorical_grouping:
486+
487+
Categorical Grouping no longer expands to all possible groupers
488+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
489+
490+
In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for each grouper, not just the observed values. This is inconsistent with output for other dtypes, can potentially cast to different dtypes (as missing values are introduced), and could cause a huge frame to be generated. Pandas will now return only the observed values, regardless of whether you are grouping on a categorical column; note that the categorical dtype is *still* preserved — you will still have categorical columns (:issue:`14942`)
491+
492+
493+
.. ipython:: python
494+
495+
cat1 = pd.Categorical(["a", "a", "b", "b"],
496+
categories=["a", "b", "z"], ordered=True)
497+
cat2 = pd.Categorical(["c", "d", "c", "d"],
498+
categories=["c", "d", "y"], ordered=True)
499+
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
500+
df['C'] = ['foo', 'bar'] * 2
501+
df
502+
503+
Previous Behavior:
504+
505+
.. code-block:: python
506+
507+
In [4]: df.groupby(['A', 'B', 'C']).count()
508+
Out[4]:
509+
values
510+
A B C
511+
a c bar NaN
512+
foo 1.0
513+
d bar 1.0
514+
foo NaN
515+
y bar NaN
516+
foo NaN
517+
b c bar NaN
518+
foo 1.0
519+
d bar 1.0
520+
foo NaN
521+
y bar NaN
522+
foo NaN
523+
z c bar NaN
524+
foo NaN
525+
d bar NaN
526+
foo NaN
527+
y bar NaN
528+
foo NaN
529+
530+
New Behavior:
531+
532+
.. ipython:: python
533+
534+
df.groupby(['A', 'B', 'C']).count()
535+
485536
.. _whatsnew_0230.api_breaking.deprecate_panel:
486537

487538
Deprecate Panel

pandas/core/groupby/groupby.py

+14-63
Original file line numberDiff line numberDiff line change
@@ -2336,10 +2336,13 @@ def result_index(self):
23362336
if not self.compressed and len(self.groupings) == 1:
23372337
return self.groupings[0].group_index.rename(self.names[0])
23382338

2339-
return MultiIndex(levels=[ping.group_index for ping in self.groupings],
2340-
labels=self.recons_labels,
2341-
verify_integrity=False,
2342-
names=self.names)
2339+
labels = self.recons_labels
2340+
levels = [ping.group_index for ping in self.groupings]
2341+
result = MultiIndex(levels=levels,
2342+
labels=labels,
2343+
verify_integrity=False,
2344+
names=self.names)
2345+
return result.remove_unused_levels()
23432346

23442347
def get_group_levels(self):
23452348
if not self.compressed and len(self.groupings) == 1:
@@ -4151,7 +4154,7 @@ def first_not_none(values):
41514154
not_indexed_same=not_indexed_same)
41524155
elif self.grouper.groupings is not None:
41534156
if len(self.grouper.groupings) > 1:
4154-
key_index = MultiIndex.from_tuples(keys, names=key_names)
4157+
key_index = self.grouper.result_index
41554158

41564159
else:
41574160
ping = self.grouper.groupings[0]
@@ -4241,8 +4244,9 @@ def first_not_none(values):
42414244

42424245
# normally use vstack as its faster than concat
42434246
# and if we have mi-columns
4244-
if isinstance(v.index,
4245-
MultiIndex) or key_index is None:
4247+
if (isinstance(v.index, MultiIndex) or
4248+
key_index is None or
4249+
isinstance(key_index, MultiIndex)):
42464250
stacked_values = np.vstack(map(np.asarray, values))
42474251
result = DataFrame(stacked_values, index=key_index,
42484252
columns=index)
@@ -4280,7 +4284,7 @@ def first_not_none(values):
42804284
else:
42814285
result = result._convert(datetime=True)
42824286

4283-
return self._reindex_output(result)
4287+
return result
42844288

42854289
# values are not series or array-like but scalars
42864290
else:
@@ -4661,7 +4665,7 @@ def _wrap_aggregated_output(self, output, names=None):
46614665
if self.axis == 1:
46624666
result = result.T
46634667

4664-
return self._reindex_output(result)._convert(datetime=True)
4668+
return result._convert(datetime=True)
46654669

46664670
def _wrap_transformed_output(self, output, names=None):
46674671
return DataFrame(output, index=self.obj.index)
@@ -4682,60 +4686,7 @@ def _wrap_agged_blocks(self, items, blocks):
46824686
if self.axis == 1:
46834687
result = result.T
46844688

4685-
return self._reindex_output(result)._convert(datetime=True)
4686-
4687-
def _reindex_output(self, result):
4688-
"""
4689-
if we have categorical groupers, then we want to make sure that
4690-
we have a fully reindex-output to the levels. These may have not
4691-
participated in the groupings (e.g. may have all been
4692-
nan groups)
4693-
4694-
This can re-expand the output space
4695-
"""
4696-
groupings = self.grouper.groupings
4697-
if groupings is None:
4698-
return result
4699-
elif len(groupings) == 1:
4700-
return result
4701-
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
4702-
for ping in groupings):
4703-
return result
4704-
4705-
levels_list = [ping.group_index for ping in groupings]
4706-
index, _ = MultiIndex.from_product(
4707-
levels_list, names=self.grouper.names).sortlevel()
4708-
4709-
if self.as_index:
4710-
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
4711-
return result.reindex(**d)
4712-
4713-
# GH 13204
4714-
# Here, the categorical in-axis groupers, which need to be fully
4715-
# expanded, are columns in `result`. An idea is to do:
4716-
# result = result.set_index(self.grouper.names)
4717-
# .reindex(index).reset_index()
4718-
# but special care has to be taken because of possible not-in-axis
4719-
# groupers.
4720-
# So, we manually select and drop the in-axis grouper columns,
4721-
# reindex `result`, and then reset the in-axis grouper columns.
4722-
4723-
# Select in-axis groupers
4724-
in_axis_grps = [(i, ping.name) for (i, ping)
4725-
in enumerate(groupings) if ping.in_axis]
4726-
g_nums, g_names = zip(*in_axis_grps)
4727-
4728-
result = result.drop(labels=list(g_names), axis=1)
4729-
4730-
# Set a temp index and reindex (possibly expanding)
4731-
result = result.set_index(self.grouper.result_index
4732-
).reindex(index, copy=False)
4733-
4734-
# Reset in-axis grouper columns
4735-
# (using level numbers `g_nums` because level names may not be unique)
4736-
result = result.reset_index(level=g_nums)
4737-
4738-
return result.reset_index(drop=True)
4689+
return result._convert(datetime=True)
47394690

47404691
def _iterate_column_groupbys(self):
47414692
for i, colname in enumerate(self._selected_obj.columns):

0 commit comments

Comments
 (0)