Merge pull request #8585 from behzadnouri/in-axis

jreback · jreback · commit f2c939084e99 · 2014-10-27T20:04:25.000-04:00
BUG: column name conflict & as_index=False breaks groupby ops
diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt
@@ -19,7 +19,54 @@ users upgrade to this version.
 
 API changes
 ~~~~~~~~~~~
+- ``groupby`` with ``as_index=False`` will not add erroneous extra columns to
+  result (:issue:`8582`):
 
+  .. code-block:: python
+
+     In [1]: np.random.seed(2718281)
+
+     In [2]: df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
+        ...:                   columns=['jim', 'joe'])
+
+     In [3]: ts = pd.Series(5 * np.random.randint(0, 3, 10))
+
+     In [4]: df.groupby(ts, as_index=False).max()
+     Out[4]:
+        NaN  jim  joe
+     0    0   72   83
+     1    5   77   84
+     2   10   96   65
+
+  with the new release:
+
+  .. ipython:: python
+
+    np.random.seed(2718281)
+    df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
+                      columns=['jim', 'joe'])
+    df.head()
+
+    ts = pd.Series(5 * np.random.randint(0, 3, 10))
+    df.groupby(ts, as_index=False).max()
+
+- ``groupby`` will not erroneously exclude columns if the column name conflicts
+  with the grouper name (:issue:`8112`):
+
+  .. code-block:: python
+
+     In [1]: df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})
+
+     In [2]: gr = df.groupby(df['jim'] < 2)
+
+     In [3]: _ = gr.nth(0) # invokes the code path which excludes the 1st column
+
+     In [4]: gr.apply(sum) # excludes 1st column from output
+     Out[4]:
+            joe
+     jim
+     False   24
+     True    11
 
 .. _whatsnew_0151.enhancements:
 
@@ -88,3 +135,5 @@ Bug Fixes
 
 
 - Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
+- Bug in ``GroupBy`` where a name conflict between the grouper and columns
+  would break ``groupby`` operations (:issue:`7115`, :issue:`8112`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -471,7 +471,9 @@ def _set_selection_from_grouper(self):
         grp = self.grouper
         if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
             ax = self.obj._info_axis
-            groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
+            groupers = [g.name for g in grp.groupings
+                           if g.level is None and g.in_axis]
+
             if len(groupers):
                 self._group_selection = ax.difference(Index(groupers)).tolist()
 
@@ -1844,6 +1846,8 @@ class Grouping(object):
     obj :
     name :
     level :
+    in_axis : True if the Grouping is a column in self.obj and hence is in
+        the Groupby.exclusions list
 
     Returns
     -------
@@ -1857,14 +1861,15 @@ class Grouping(object):
     """
 
     def __init__(self, index, grouper=None, obj=None, name=None, level=None,
-                 sort=True):
+                 sort=True, in_axis=False):
 
         self.name = name
         self.level = level
         self.grouper = _convert_grouper(index, grouper)
         self.index = index
         self.sort = sort
         self.obj = obj
+        self.in_axis = in_axis
 
         # right place for this?
         if isinstance(grouper, (Series, Index)) and name is None:
@@ -2096,23 +2101,43 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
 
     groupings = []
     exclusions = []
-    for i, (gpr, level) in enumerate(zip(keys, levels)):
-        name = None
+
+    # if the actual grouper should be obj[key]
+    def is_in_axis(key):
+        if not _is_label_like(key):
+            try:
+                obj._data.items.get_loc(key)
+            except Exception:
+                return False
+
+        return True
+
+    # if the grouper is obj[name]
+    def is_in_obj(gpr):
         try:
-            obj._data.items.get_loc(gpr)
-            in_axis = True
+            return id(gpr) == id(obj[gpr.name])
         except Exception:
-            in_axis = False
+            return False
+
+    for i, (gpr, level) in enumerate(zip(keys, levels)):
 
-        if _is_label_like(gpr) or in_axis:
-            exclusions.append(gpr)
-            name = gpr
-            gpr = obj[gpr]
+        if is_in_obj(gpr):  # df.groupby(df['name'])
+            in_axis, name = True, gpr.name
+            exclusions.append(name)
+
+        elif is_in_axis(gpr):  # df.groupby('name')
+            in_axis, name, gpr = True, gpr, obj[gpr]
+            exclusions.append(name)
+
+        else:
+            in_axis, name = False, None
 
         if isinstance(gpr, Categorical) and len(gpr) != len(obj):
             raise ValueError("Categorical grouper must have len(grouper) == len(data)")
 
-        ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
+        ping = Grouping(group_axis, gpr, obj=obj, name=name,
+                        level=level, sort=sort, in_axis=in_axis)
+
         groupings.append(ping)
 
     if len(groupings) == 0:
@@ -2647,18 +2672,7 @@ def aggregate(self, arg, *args, **kwargs):
                     result = self._aggregate_generic(arg, *args, **kwargs)
 
         if not self.as_index:
-            if isinstance(result.index, MultiIndex):
-                zipped = zip(result.index.levels, result.index.labels,
-                             result.index.names)
-                for i, (lev, lab, name) in enumerate(zipped):
-                    result.insert(i, name,
-                                  com.take_nd(lev.values, lab,
-                                              allow_fill=False))
-                result = result.consolidate()
-            else:
-                values = result.index.values
-                name = self.grouper.groupings[0].name
-                result.insert(0, name, values)
+            self._insert_inaxis_grouper_inplace(result)
             result.index = np.arange(len(result))
 
         return result.convert_objects()
@@ -3180,6 +3194,17 @@ def _get_data_to_aggregate(self):
         else:
             return obj._data, 1
 
+    def _insert_inaxis_grouper_inplace(self, result):
+        # zip in reverse so we can always insert at loc 0
+        izip = zip(* map(reversed, (
+            self.grouper.names,
+            self.grouper.get_group_levels(),
+            [grp.in_axis for grp in self.grouper.groupings])))
+
+        for name, lev, in_axis in izip:
+            if in_axis:
+                result.insert(0, name, lev)
+
     def _wrap_aggregated_output(self, output, names=None):
         agg_axis = 0 if self.axis == 1 else 1
         agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
@@ -3188,11 +3213,7 @@ def _wrap_aggregated_output(self, output, names=None):
 
         if not self.as_index:
             result = DataFrame(output, columns=output_keys)
-            group_levels = self.grouper.get_group_levels()
-            zipped = zip(self.grouper.names, group_levels)
-
-            for i, (name, labels) in enumerate(zipped):
-                result.insert(i, name, labels)
+            self._insert_inaxis_grouper_inplace(result)
             result = result.consolidate()
         else:
             index = self.grouper.result_index
@@ -3209,11 +3230,7 @@ def _wrap_agged_blocks(self, items, blocks):
             mgr = BlockManager(blocks, [items, index])
             result = DataFrame(mgr)
 
-            group_levels = self.grouper.get_group_levels()
-            zipped = zip(self.grouper.names, group_levels)
-
-            for i, (name, labels) in enumerate(zipped):
-                result.insert(i, name, labels)
+            self._insert_inaxis_grouper_inplace(result)
             result = result.consolidate()
         else:
             index = self.grouper.result_index
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1499,6 +1499,24 @@ def test_groupby_as_index_agg(self):
         result3 = grouped['C'].agg({'Q': np.sum})
         assert_frame_equal(result3, expected3)
 
+        # GH7115 & GH8112 & GH8582
+        df = DataFrame(np.random.randint(0, 100, (50, 3)),
+                       columns=['jim', 'joe', 'jolie'])
+        ts = Series(np.random.randint(5, 10, 50), name='jim')
+
+        gr = df.groupby(ts)
+        _ = gr.nth(0)  # invokes _set_selection_from_grouper internally
+        assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
+
+        for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
+            gr = df.groupby(ts, as_index=False)
+            left = getattr(gr, attr)()
+
+            gr = df.groupby(ts.values, as_index=True)
+            right = getattr(gr, attr)().reset_index(drop=True)
+
+            assert_frame_equal(left, right)
+
     def test_mulitindex_passthru(self):
 
         # GH 7997
@@ -2565,7 +2583,6 @@ def test_groupby_nonstring_columns(self):
         grouped = df.groupby(0)
         result = grouped.mean()
         expected = df.groupby(df[0]).mean()
-        del expected[0]
         assert_frame_equal(result, expected)
 
     def test_cython_grouper_series_bug_noncontig(self):