BUG: fix groupby with multiple non-compressed categoricals

jreback · jreback · commit 6fdb40097b4b · 2014-07-09T18:56:59.000-04:00
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1361,7 +1361,9 @@ def get_group_levels(self):
         name_list = []
         for ping, labels in zip(self.groupings, recons_labels):
             labels = com._ensure_platform_int(labels)
-            name_list.append(ping.group_index.take(labels))
+            levels = ping.group_index.take(labels)
+
+            name_list.append(levels)
 
         return name_list
 
@@ -1707,6 +1709,11 @@ def levels(self):
     def names(self):
         return [self.binlabels.name]
 
+    @property
+    def groupings(self):
+        # for compat
+        return None
+
     def size(self):
         """
         Compute group sizes
@@ -2632,7 +2639,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if isinstance(values[0], DataFrame):
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
-        elif hasattr(self.grouper, 'groupings'):
+        elif self.grouper.groupings is not None:
             if len(self.grouper.groupings) > 1:
                 key_index = MultiIndex.from_tuples(keys, names=key_names)
 
@@ -3058,7 +3065,7 @@ def _wrap_aggregated_output(self, output, names=None):
         if self.axis == 1:
             result = result.T
 
-        return result.convert_objects()
+        return self._reindex_output(result).convert_objects()
 
     def _wrap_agged_blocks(self, items, blocks):
         if not self.as_index:
@@ -3080,7 +3087,27 @@ def _wrap_agged_blocks(self, items, blocks):
         if self.axis == 1:
             result = result.T
 
-        return result.convert_objects()
+        return self._reindex_output(result).convert_objects()
+
+    def _reindex_output(self, result):
+        """
+        If we have categorical groupers, then we want to make sure that the
+        output is fully reindexed to the levels. Some levels may not have
+        participated in the groupings (e.g. they may have all been nan groups).
+
+        This can re-expand the output space
+        """
+        groupings = self.grouper.groupings
+        if groupings is None:
+            return result
+        elif len(groupings) == 1:
+            return result
+        elif not any([ping._was_factor for ping in groupings]):
+            return result
+
+        levels_list = [ ping._group_index for ping in groupings ]
+        index = MultiIndex.from_product(levels_list, names=self.grouper.names)
+        return result.reindex(**{ self.obj._get_axis_name(self.axis) : index, 'copy' : False }).sortlevel()
 
     def _iterate_column_groupbys(self):
         for i, colname in enumerate(self._selected_obj.columns):
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -797,11 +797,6 @@ def test_repr(self):
         self.assertEqual(exp,a.__unicode__())
 
 
-    def test_groupby(self):
-
-        result = self.cat['value_group'].unique()
-        result = self.cat.groupby(['value_group'])['value_group'].count()
-
     def test_groupby_sort(self):
 
         # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
@@ -872,52 +867,52 @@ def test_groupby(self):
         cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"])
         data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats})
 
+        expected = DataFrame({ 'a' : Series([1,2,4,np.nan],index=Index(['a','b','c','d'],name='b')) })
         result = data.groupby("b").mean()
-        result = result["a"].values
-        exp = np.array([1,2,4,np.nan])
-        self.assert_numpy_array_equivalent(result, exp)
-
-        ### FIXME ###
-
-        #res = len(data.groupby("b"))
-        #self.assertEqual(res ,4)
+        tm.assert_frame_equal(result, expected)
 
         raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"])
         raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"])
         df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]})
-        gb = df.groupby("A")
 
-        #idx = gb.indices
-        #self.assertEqual(len(gb), 3)
-        #num = 0
-        #for _ in gb:
-        #    num +=1
-        #self.assertEqual(len(gb), 3)
-        #gb = df.groupby(["B"])
-        #idx2 = gb.indices
-        #self.assertEqual(len(gb), 3)
-        #num = 0
-        #for _ in gb:
-        #    num +=1
-        #self.assertEqual(len(gb), 3)
-        #gb = df.groupby(["A","B"])
-        #res = len(gb)
-        #idx3 = gb.indices
-        #self.assertEqual(res, 9)
-        #num = 0
-        #for _ in gb:
-        #    num +=1
-        #self.assertEqual(len(gb), 9)
+        # single grouper
+        gb = df.groupby("A")
+        expected = DataFrame({ 'values' : Series([3,7,np.nan],index=Index(['a','b','z'],name='A')) })
+        result = gb.sum()
+        tm.assert_frame_equal(result, expected)
+
+        # multiple groupers
+        gb = df.groupby(['A','B'])
+        expected = DataFrame({ 'values' : Series([1,2,np.nan,3,4,np.nan,np.nan,np.nan,np.nan],
+                                                 index=pd.MultiIndex.from_product([['a','b','z'],['c','d','y']],names=['A','B'])) })
+        result = gb.sum()
+        tm.assert_frame_equal(result, expected)
+
+        # multiple groupers with a non-cat
+        df = df.copy()
+        df['C'] = ['foo','bar']*2
+        gb = df.groupby(['A','B','C'])
+        expected = DataFrame({ 'values' :
+                               Series(np.nan,index=pd.MultiIndex.from_product([['a','b','z'],
+                                                                               ['c','d','y'],
+                                                                               ['foo','bar']],
+                                                                              names=['A','B','C']))
+                               }).sortlevel()
+        expected.iloc[[1,2,7,8],0] = [1,2,3,4]
+        result = gb.sum()
+        tm.assert_frame_equal(result, expected)
 
     def test_pivot_table(self):
 
         raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"])
         raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"])
         df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]})
-        res = pd.pivot_table(df, values='values', index=['A', 'B'])
+        result = pd.pivot_table(df, values='values', index=['A', 'B'])
 
-        ### FIXME ###
-        #self.assertEqual(len(res), 9)
+        expected = Series([1,2,np.nan,3,4,np.nan,np.nan,np.nan,np.nan],
+                          index=pd.MultiIndex.from_product([['a','b','z'],['c','d','y']],names=['A','B']),
+                          name='values')
+        tm.assert_series_equal(result, expected)
 
     def test_count(self):