BUG: Do not reindex groupby output on non-Categorical groupers (GH9049, GH9344)

Junya Hayashi · Junya Hayashi · commit b53ef231ea2a · 2015-02-09T06:07:13.000+09:00
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -267,3 +267,4 @@ Bug Fixes
 - ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
 
 - Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
+- Bug in ``groupby`` on multiple ``MultiIndex`` levels where the result was reindexed to include unobserved level combinations (:issue:`9049`, :issue:`9344`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1862,7 +1862,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             self.grouper = grouper.values
 
         # pre-computed
-        self._was_factor = False
+        self._grouping_type = None
         self._should_compress = True
 
         # we have a single grouper which may be a myriad of things, some of which are
@@ -1887,7 +1887,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 level_values = index.levels[level].take(inds)
                 self.grouper = level_values.map(self.grouper)
             else:
-                self._was_factor = True
+                self._grouping_type = "level"
 
                 # all levels may not be observed
                 labels, uniques = algos.factorize(inds, sort=True)
@@ -1915,7 +1915,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             elif isinstance(self.grouper, Categorical):
 
                 factor = self.grouper
-                self._was_factor = True
+                self._grouping_type = "categorical"
 
                 # Is there any way to avoid this?
                 self.grouper = np.asarray(factor)
@@ -1988,8 +1988,9 @@ def group_index(self):
         return self._group_index
 
     def _make_labels(self):
-        if self._was_factor:  # pragma: no cover
-            raise Exception('Should not call this method grouping by level')
+        if self._grouping_type in ("level", "categorical"):  # pragma: no cover
+            raise Exception(
+                'Should not call this method grouping by level or categorical')
         else:
             labels, uniques = algos.factorize(self.grouper, sort=self.sort)
             uniques = Index(uniques, name=self.name)
@@ -3238,10 +3239,11 @@ def _reindex_output(self, result):
             return result
         elif len(groupings) == 1:
             return result
-        elif not any([ping._was_factor for ping in groupings]):
+        elif not any([ping._grouping_type == "categorical"
+                      for ping in groupings]):
             return result
 
-        levels_list = [ ping._group_index for ping in groupings ]
+        levels_list = [ ping.group_index for ping in groupings ]
         index = MultiIndex.from_product(levels_list, names=self.grouper.names)
         d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False }
         return result.reindex(**d).sortlevel(axis=self.axis)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -3484,6 +3484,31 @@ def test_groupby_categorical_unequal_len(self):
         # len(bins) != len(series) here
         self.assertRaises(ValueError,lambda : series.groupby(bins).mean())
 
+    def test_groupby_multiindex_missing_pair(self):
+        # GH9049
+        df = DataFrame({'group1': ['a','a','a','b'],
+                        'group2': ['c','c','d','c'],
+                        'value': [1,1,1,5]})
+        df = df.set_index(['group1', 'group2'])
+        df_grouped = df.groupby(level=['group1','group2'], sort=True)
+
+        res = df_grouped.agg('sum')
+        idx = MultiIndex.from_tuples([('a','c'), ('a','d'), ('b','c')], names=['group1', 'group2'])
+        exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
+
+        tm.assert_frame_equal(res, exp)
+
+    def test_groupby_levels_and_columns(self):
+        # GH9344, GH9049
+        idx_names = ['x', 'y']
+        idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
+        df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
+
+        by_levels = df.groupby(level=idx_names).mean()
+        by_columns = df.reset_index().groupby(idx_names).mean()
+
+        tm.assert_frame_equal(by_levels, by_columns)
+
     def test_gb_apply_list_of_unequal_len_arrays(self):
 
         # GH1738
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -390,6 +390,18 @@ def f(g):
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
 
+
+#----------------------------------------------------------------------
+# multi-indexed group sum #9049
+
+setup = common_setup + """
+N = 50
+df = DataFrame({'A': range(N) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"])
+"""
+
+groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup)
+
+
 #----------------------------------------------------------------------
 # Transform testing
 

Original file line number	Diff line number	Diff line change
`@@ -267,3 +267,4 @@ Bug Fixes`
`267`	`267`	- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
`268`	`268`
`269`	`269`	- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
	`270`	+- Bug in ``groupby`` on multiple ``MultiIndex`` levels where the result was reindexed to include unobserved level combinations (:issue:`9049`, :issue:`9344`)