From fef0f4a885908d52f427ded863693ae1efe22d8a Mon Sep 17 00:00:00 2001
From: behzad nouri
Date: Sun, 19 Oct 2014 12:23:29 -0400
Subject: [PATCH] BUG: column name conflict & as_index=False breaks groupby ops

---
 doc/source/whatsnew/v0.15.1.txt | 49 +++++++++++++++++++
 pandas/core/groupby.py          | 85 ++++++++++++++++++++-------------
 pandas/tests/test_groupby.py    | 19 +++++++-
 3 files changed, 118 insertions(+), 35 deletions(-)

diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt
index dc69bd9f55752..cc06797b3276e 100644
--- a/doc/source/whatsnew/v0.15.1.txt
+++ b/doc/source/whatsnew/v0.15.1.txt
@@ -19,7 +19,54 @@ users upgrade to this version.
 
 API changes
 ~~~~~~~~~~~
 
+- ``groupby`` with ``as_index=False`` will not add erroneous extra columns to
+  result (:issue:`8582`):
+
+  .. code-block:: python
+
+     In [1]: np.random.seed(2718281)
+
+     In [2]: df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
+        ...:                   columns=['jim', 'joe'])
+
+     In [3]: ts = pd.Series(5 * np.random.randint(0, 3, 10))
+
+     In [4]: df.groupby(ts, as_index=False).max()
+     Out[4]:
+        NaN  jim  joe
+     0    0   72   83
+     1    5   77   84
+     2   10   96   65
+
+  with the new release:
+
+  .. ipython:: python
+
+     np.random.seed(2718281)
+     df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
+                       columns=['jim', 'joe'])
+     df.head()
+
+     ts = pd.Series(5 * np.random.randint(0, 3, 10))
+     df.groupby(ts, as_index=False).max()
+
+- ``groupby`` will not erroneously exclude columns if the column name conflicts
+  with the grouper name (:issue:`8112`):
+
+  .. code-block:: python
+
+     In [1]: df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})
+
+     In [2]: gr = df.groupby(df['jim'] < 2)
+
+     In [3]: _ = gr.nth(0)  # invokes the code path which excludes the 1st column
+
+     In [4]: gr.apply(sum)  # excludes 1st column from output
+     Out[4]:
+            joe
+     jim
+     False   24
+     True    11
 
 .. _whatsnew_0151.enhancements:
@@ -51,3 +98,5 @@ Bug Fixes
 - Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
 - Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)
 - Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
+- Bug in ``GroupBy`` where a name conflict between the grouper and columns
+  would break ``groupby`` operations (:issue:`7115`, :issue:`8112`)
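As a quick illustration of the name-conflict fix noted in the Bug Fixes entry above, here is a minimal sketch (the frame and the `key` Series are made up for illustration, not taken from the patch): grouping by an external Series whose name happens to collide with a column should leave that column in the aggregated output.

    import pandas as pd

    df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})
    key = pd.Series([0, 0, 1, 1, 1], name='jim')  # name collides with df['jim']

    # with the fix, 'jim' is aggregated like any other column, because the
    # grouper is an external Series and not df['jim'] itself
    df.groupby(key).sum()

The diff below implements this by tracking, per grouping, whether the grouper actually lives in the frame's axis.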
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index a141d8cebfd8e..4b85da1b7b224 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -471,7 +471,9 @@ def _set_selection_from_grouper(self):
         grp = self.grouper
         if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
             ax = self.obj._info_axis
-            groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
+            groupers = [g.name for g in grp.groupings
+                        if g.level is None and g.in_axis]
+
             if len(groupers):
                 self._group_selection = ax.difference(Index(groupers)).tolist()
 
@@ -1844,6 +1846,8 @@ class Grouping(object):
     obj :
     name :
     level :
+    in_axis : if the Grouping is a column in self.obj and hence among
+        Groupby.exclusions list
 
     Returns
     -------
@@ -1857,7 +1861,7 @@ class Grouping(object):
     """
 
     def __init__(self, index, grouper=None, obj=None, name=None, level=None,
-                 sort=True):
+                 sort=True, in_axis=False):
 
         self.name = name
         self.level = level
@@ -1865,6 +1869,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
         self.index = index
         self.sort = sort
         self.obj = obj
+        self.in_axis = in_axis
 
         # right place for this?
         if isinstance(grouper, (Series, Index)) and name is None:
@@ -2096,23 +2101,43 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
 
     groupings = []
     exclusions = []
-    for i, (gpr, level) in enumerate(zip(keys, levels)):
-        name = None
+
+    # if the actual grouper should be obj[key]
+    def is_in_axis(key):
+        if not _is_label_like(key):
+            try:
+                obj._data.items.get_loc(key)
+            except Exception:
+                return False
+
+        return True
+
+    # if the grouper is obj[name]
+    def is_in_obj(gpr):
         try:
-            obj._data.items.get_loc(gpr)
-            in_axis = True
+            return id(gpr) == id(obj[gpr.name])
         except Exception:
-            in_axis = False
+            return False
+
+    for i, (gpr, level) in enumerate(zip(keys, levels)):
 
-        if _is_label_like(gpr) or in_axis:
-            exclusions.append(gpr)
-            name = gpr
-            gpr = obj[gpr]
+        if is_in_obj(gpr):  # df.groupby(df['name'])
+            in_axis, name = True, gpr.name
+            exclusions.append(name)
+
+        elif is_in_axis(gpr):  # df.groupby('name')
+            in_axis, name, gpr = True, gpr, obj[gpr]
+            exclusions.append(name)
+
+        else:
+            in_axis, name = False, None
 
         if isinstance(gpr, Categorical) and len(gpr) != len(obj):
             raise ValueError("Categorical grouper must have len(grouper) == len(data)")
 
-        ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
+        ping = Grouping(group_axis, gpr, obj=obj, name=name,
+                        level=level, sort=sort, in_axis=in_axis)
+
         groupings.append(ping)
 
     if len(groupings) == 0:
@@ -2647,18 +2672,7 @@ def aggregate(self, arg, *args, **kwargs):
             result = self._aggregate_generic(arg, *args, **kwargs)
 
         if not self.as_index:
-            if isinstance(result.index, MultiIndex):
-                zipped = zip(result.index.levels, result.index.labels,
-                             result.index.names)
-                for i, (lev, lab, name) in enumerate(zipped):
-                    result.insert(i, name,
-                                  com.take_nd(lev.values, lab,
-                                              allow_fill=False))
-                result = result.consolidate()
-            else:
-                values = result.index.values
-                name = self.grouper.groupings[0].name
-                result.insert(0, name, values)
+            self._insert_inaxis_grouper_inplace(result)
             result.index = np.arange(len(result))
 
         return result.convert_objects()
@@ -3180,6 +3194,17 @@ def _get_data_to_aggregate(self):
         else:
             return obj._data, 1
 
+    def _insert_inaxis_grouper_inplace(self, result):
+        # zip in reverse so we can always insert at loc 0
+        izip = zip(* map(reversed, (
+            self.grouper.names,
+            self.grouper.get_group_levels(),
+            [grp.in_axis for grp in self.grouper.groupings])))
+
+        for name, lev, in_axis in izip:
+            if in_axis:
+                result.insert(0, name, lev)
+
     def _wrap_aggregated_output(self, output, names=None):
         agg_axis = 0 if self.axis == 1 else 1
         agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
@@ -3188,11 +3213,7 @@ def _wrap_aggregated_output(self, output, names=None):
 
         if not self.as_index:
             result = DataFrame(output, columns=output_keys)
-            group_levels = self.grouper.get_group_levels()
-            zipped = zip(self.grouper.names, group_levels)
-
-            for i, (name, labels) in enumerate(zipped):
-                result.insert(i, name, labels)
+            self._insert_inaxis_grouper_inplace(result)
             result = result.consolidate()
         else:
             index = self.grouper.result_index
@@ -3209,11 +3230,7 @@ def _wrap_agged_blocks(self, items, blocks):
             mgr = BlockManager(blocks, [items, index])
             result = DataFrame(mgr)
 
-            group_levels = self.grouper.get_group_levels()
-            zipped = zip(self.grouper.names, group_levels)
-
-            for i, (name, labels) in enumerate(zipped):
-                result.insert(i, name, labels)
+            self._insert_inaxis_grouper_inplace(result)
             result = result.consolidate()
         else:
             index = self.grouper.result_index
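The `is_in_obj` / `is_in_axis` split above boils down to three flavors of grouper. A small sketch of the distinction (the frame is made up; behavior described is that of the patched 0.15.x code paths):

    import pandas as pd

    df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})

    # a label           -> is_in_axis: the column becomes the grouper and is excluded
    df.groupby('jim').sum()          # columns: ['joe']

    # the column itself -> is_in_obj (identical object via the item cache), also excluded
    df.groupby(df['jim']).sum()      # columns: ['joe']

    # a derived key that merely shares the name -> neither, so 'jim' stays in the output
    df.groupby(df['jim'] < 2).sum()  # columns: ['jim', 'joe']

The last case is exactly the GH8112 scenario from the whatsnew entry, and the tests below exercise it.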
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 7ead8b30e8671..303d7f99240af 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -1499,6 +1499,24 @@ def test_groupby_as_index_agg(self):
         result3 = grouped['C'].agg({'Q': np.sum})
         assert_frame_equal(result3, expected3)
 
+        # GH7115 & GH8112 & GH8582
+        df = DataFrame(np.random.randint(0, 100, (50, 3)),
+                       columns=['jim', 'joe', 'jolie'])
+        ts = Series(np.random.randint(5, 10, 50), name='jim')
+
+        gr = df.groupby(ts)
+        _ = gr.nth(0)  # invokes _set_selection_from_grouper internally
+        assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
+
+        for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
+            gr = df.groupby(ts, as_index=False)
+            left = getattr(gr, attr)()
+
+            gr = df.groupby(ts.values, as_index=True)
+            right = getattr(gr, attr)().reset_index(drop=True)
+
+            assert_frame_equal(left, right)
+
     def test_mulitindex_passthru(self):
 
         # GH 7997
@@ -2565,7 +2583,6 @@ def test_groupby_nonstring_columns(self):
         grouped = df.groupby(0)
         result = grouped.mean()
         expected = df.groupby(df[0]).mean()
-        del expected[0]
         assert_frame_equal(result, expected)
 
     def test_cython_grouper_series_bug_noncontig(self):
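For reference, the equivalence that the new `as_index` loop in `test_groupby_as_index_agg` asserts can be reproduced standalone, roughly as follows (the seed is arbitrary and the `pandas.util.testing` import path is the 0.15.x-era location, both assumptions for illustration):

    import numpy as np
    import pandas as pd
    from pandas.util.testing import assert_frame_equal  # 0.15.x location

    np.random.seed(0)
    df = pd.DataFrame(np.random.randint(0, 100, (50, 3)),
                      columns=['jim', 'joe', 'jolie'])
    ts = pd.Series(np.random.randint(5, 10, 50), name='jim')

    # grouping by an external Series with as_index=False should match grouping
    # by its values with as_index=True and then dropping the resulting index
    left = df.groupby(ts, as_index=False).max()
    right = df.groupby(ts.values, as_index=True).max().reset_index(drop=True)
    assert_frame_equal(left, right)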