Skip to content

Commit f2c9390

Browse files
committed
Merge pull request #8585 from behzadnouri/in-axis
BUG: column name conflict & as_index=False breaks groupby ops
2 parents 211c80c + fef0f4a commit f2c9390

File tree

3 files changed

+118
-35
lines changed

3 files changed

+118
-35
lines changed

doc/source/whatsnew/v0.15.1.txt

+49
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,54 @@ users upgrade to this version.
1919

2020
API changes
2121
~~~~~~~~~~~
22+
- ``groupby`` with ``as_index=False`` will not add erroneous extra columns to
23+
result (:issue:`8582`):
2224

25+
.. code-block:: python
26+
27+
In [1]: np.random.seed(2718281)
28+
29+
In [2]: df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
30+
...: columns=['jim', 'joe'])
31+
32+
In [3]: ts = pd.Series(5 * np.random.randint(0, 3, 10))
33+
34+
In [4]: df.groupby(ts, as_index=False).max()
35+
Out[4]:
36+
NaN jim joe
37+
0 0 72 83
38+
1 5 77 84
39+
2 10 96 65
40+
41+
with the new release:
42+
43+
.. ipython:: python
44+
45+
np.random.seed(2718281)
46+
df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
47+
columns=['jim', 'joe'])
48+
df.head()
49+
50+
ts = pd.Series(5 * np.random.randint(0, 3, 10))
51+
df.groupby(ts, as_index=False).max()
52+
53+
- ``groupby`` will not erroneously exclude columns if the column name conflicts
54+
with the grouper name (:issue:`8112`):
55+
56+
.. code-block:: python
57+
58+
In [1]: df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})
59+
60+
In [2]: gr = df.groupby(df['jim'] < 2)
61+
62+
In [3]: _ = gr.nth(0) # invokes the code path which excludes the 1st column
63+
64+
In [4]: gr.apply(sum) # excludes 1st column from output
65+
Out[4]:
66+
joe
67+
jim
68+
False 24
69+
True 11
2370

2471
.. _whatsnew_0151.enhancements:
2572

@@ -88,3 +135,5 @@ Bug Fixes
88135

89136

90137
- Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
138+
- Bug in ``GroupBy`` where a name conflict between the grouper and columns
139+
would break ``groupby`` operations (:issue:`7115`, :issue:`8112`)

pandas/core/groupby.py

+51-34
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,9 @@ def _set_selection_from_grouper(self):
471471
grp = self.grouper
472472
if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
473473
ax = self.obj._info_axis
474-
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
474+
groupers = [g.name for g in grp.groupings
475+
if g.level is None and g.in_axis]
476+
475477
if len(groupers):
476478
self._group_selection = ax.difference(Index(groupers)).tolist()
477479

@@ -1844,6 +1846,8 @@ class Grouping(object):
18441846
obj :
18451847
name :
18461848
level :
1849+
in_axis : if the Grouping is a column in self.obj and hence among
1850+
Groupby.exclusions list
18471851
18481852
Returns
18491853
-------
@@ -1857,14 +1861,15 @@ class Grouping(object):
18571861
"""
18581862

18591863
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
1860-
sort=True):
1864+
sort=True, in_axis=False):
18611865

18621866
self.name = name
18631867
self.level = level
18641868
self.grouper = _convert_grouper(index, grouper)
18651869
self.index = index
18661870
self.sort = sort
18671871
self.obj = obj
1872+
self.in_axis = in_axis
18681873

18691874
# right place for this?
18701875
if isinstance(grouper, (Series, Index)) and name is None:
@@ -2096,23 +2101,43 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
20962101

20972102
groupings = []
20982103
exclusions = []
2099-
for i, (gpr, level) in enumerate(zip(keys, levels)):
2100-
name = None
2104+
2105+
# if the actual grouper should be obj[key]
2106+
def is_in_axis(key):
2107+
if not _is_label_like(key):
2108+
try:
2109+
obj._data.items.get_loc(key)
2110+
except Exception:
2111+
return False
2112+
2113+
return True
2114+
2115+
# if the grouper is obj[name]
2116+
def is_in_obj(gpr):
21012117
try:
2102-
obj._data.items.get_loc(gpr)
2103-
in_axis = True
2118+
return id(gpr) == id(obj[gpr.name])
21042119
except Exception:
2105-
in_axis = False
2120+
return False
2121+
2122+
for i, (gpr, level) in enumerate(zip(keys, levels)):
21062123

2107-
if _is_label_like(gpr) or in_axis:
2108-
exclusions.append(gpr)
2109-
name = gpr
2110-
gpr = obj[gpr]
2124+
if is_in_obj(gpr): # df.groupby(df['name'])
2125+
in_axis, name = True, gpr.name
2126+
exclusions.append(name)
2127+
2128+
elif is_in_axis(gpr): # df.groupby('name')
2129+
in_axis, name, gpr = True, gpr, obj[gpr]
2130+
exclusions.append(name)
2131+
2132+
else:
2133+
in_axis, name = False, None
21112134

21122135
if isinstance(gpr, Categorical) and len(gpr) != len(obj):
21132136
raise ValueError("Categorical grouper must have len(grouper) == len(data)")
21142137

2115-
ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
2138+
ping = Grouping(group_axis, gpr, obj=obj, name=name,
2139+
level=level, sort=sort, in_axis=in_axis)
2140+
21162141
groupings.append(ping)
21172142

21182143
if len(groupings) == 0:
@@ -2647,18 +2672,7 @@ def aggregate(self, arg, *args, **kwargs):
26472672
result = self._aggregate_generic(arg, *args, **kwargs)
26482673

26492674
if not self.as_index:
2650-
if isinstance(result.index, MultiIndex):
2651-
zipped = zip(result.index.levels, result.index.labels,
2652-
result.index.names)
2653-
for i, (lev, lab, name) in enumerate(zipped):
2654-
result.insert(i, name,
2655-
com.take_nd(lev.values, lab,
2656-
allow_fill=False))
2657-
result = result.consolidate()
2658-
else:
2659-
values = result.index.values
2660-
name = self.grouper.groupings[0].name
2661-
result.insert(0, name, values)
2675+
self._insert_inaxis_grouper_inplace(result)
26622676
result.index = np.arange(len(result))
26632677

26642678
return result.convert_objects()
@@ -3180,6 +3194,17 @@ def _get_data_to_aggregate(self):
31803194
else:
31813195
return obj._data, 1
31823196

3197+
def _insert_inaxis_grouper_inplace(self, result):
3198+
# zip in reverse so we can always insert at loc 0
3199+
izip = zip(* map(reversed, (
3200+
self.grouper.names,
3201+
self.grouper.get_group_levels(),
3202+
[grp.in_axis for grp in self.grouper.groupings])))
3203+
3204+
for name, lev, in_axis in izip:
3205+
if in_axis:
3206+
result.insert(0, name, lev)
3207+
31833208
def _wrap_aggregated_output(self, output, names=None):
31843209
agg_axis = 0 if self.axis == 1 else 1
31853210
agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
@@ -3188,11 +3213,7 @@ def _wrap_aggregated_output(self, output, names=None):
31883213

31893214
if not self.as_index:
31903215
result = DataFrame(output, columns=output_keys)
3191-
group_levels = self.grouper.get_group_levels()
3192-
zipped = zip(self.grouper.names, group_levels)
3193-
3194-
for i, (name, labels) in enumerate(zipped):
3195-
result.insert(i, name, labels)
3216+
self._insert_inaxis_grouper_inplace(result)
31963217
result = result.consolidate()
31973218
else:
31983219
index = self.grouper.result_index
@@ -3209,11 +3230,7 @@ def _wrap_agged_blocks(self, items, blocks):
32093230
mgr = BlockManager(blocks, [items, index])
32103231
result = DataFrame(mgr)
32113232

3212-
group_levels = self.grouper.get_group_levels()
3213-
zipped = zip(self.grouper.names, group_levels)
3214-
3215-
for i, (name, labels) in enumerate(zipped):
3216-
result.insert(i, name, labels)
3233+
self._insert_inaxis_grouper_inplace(result)
32173234
result = result.consolidate()
32183235
else:
32193236
index = self.grouper.result_index

pandas/tests/test_groupby.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -1499,6 +1499,24 @@ def test_groupby_as_index_agg(self):
14991499
result3 = grouped['C'].agg({'Q': np.sum})
15001500
assert_frame_equal(result3, expected3)
15011501

1502+
# GH7115 & GH8112 & GH8582
1503+
df = DataFrame(np.random.randint(0, 100, (50, 3)),
1504+
columns=['jim', 'joe', 'jolie'])
1505+
ts = Series(np.random.randint(5, 10, 50), name='jim')
1506+
1507+
gr = df.groupby(ts)
1508+
_ = gr.nth(0) # invokes _set_selection_from_grouper internally
1509+
assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))
1510+
1511+
for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
1512+
gr = df.groupby(ts, as_index=False)
1513+
left = getattr(gr, attr)()
1514+
1515+
gr = df.groupby(ts.values, as_index=True)
1516+
right = getattr(gr, attr)().reset_index(drop=True)
1517+
1518+
assert_frame_equal(left, right)
1519+
15021520
def test_mulitindex_passthru(self):
15031521

15041522
# GH 7997
@@ -2565,7 +2583,6 @@ def test_groupby_nonstring_columns(self):
25652583
grouped = df.groupby(0)
25662584
result = grouped.mean()
25672585
expected = df.groupby(df[0]).mean()
2568-
del expected[0]
25692586
assert_frame_equal(result, expected)
25702587

25712588
def test_cython_grouper_series_bug_noncontig(self):

0 commit comments

Comments
 (0)