Skip to content

BUG: column name conflict & as_index=False breaks groupby ops #8585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 28, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions doc/source/whatsnew/v0.15.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,54 @@ users upgrade to this version.

API changes
~~~~~~~~~~~
- ``groupby`` with ``as_index=False`` no longer adds erroneous extra columns to
the result (:issue:`8582`):

.. code-block:: python

In [1]: np.random.seed(2718281)

In [2]: df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
...: columns=['jim', 'joe'])

In [3]: ts = pd.Series(5 * np.random.randint(0, 3, 10))

In [4]: df.groupby(ts, as_index=False).max()
Out[4]:
NaN jim joe
0 0 72 83
1 5 77 84
2 10 96 65

with the new release:

.. ipython:: python

np.random.seed(2718281)
df = pd.DataFrame(np.random.randint(0, 100, (10, 2)),
columns=['jim', 'joe'])
df.head()

ts = pd.Series(5 * np.random.randint(0, 3, 10))
df.groupby(ts, as_index=False).max()

- ``groupby`` will not erroneously exclude columns if the column name conflicts
with the grouper name (:issue:`8112`):

.. code-block:: python

In [1]: df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)})

In [2]: gr = df.groupby(df['jim'] < 2)

In [3]: _ = gr.nth(0) # invokes the code path which excludes the 1st column

In [4]: gr.apply(sum) # excludes 1st column from output
Out[4]:
joe
jim
False 24
True 11

.. _whatsnew_0151.enhancements:

Expand Down Expand Up @@ -51,3 +98,5 @@ Bug Fixes
- Bug in ``cut``/``qcut`` when using ``Series`` and ``retbins=True`` (:issue:`8589`)
- Bug in numeric index operations of add/sub with Float/Index Index with numpy arrays (:issue:`8608`)
- Fix ``shape`` attribute for ``MultiIndex`` (:issue:`8609`)
- Bug in ``GroupBy`` where a name conflict between the grouper and columns
would break ``groupby`` operations (:issue:`7115`, :issue:`8112`)
85 changes: 51 additions & 34 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,9 @@ def _set_selection_from_grouper(self):
grp = self.grouper
if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1:
ax = self.obj._info_axis
groupers = [ g.name for g in grp.groupings if g.level is None and g.name is not None and g.name in ax ]
groupers = [g.name for g in grp.groupings
if g.level is None and g.in_axis]

if len(groupers):
self._group_selection = ax.difference(Index(groupers)).tolist()

Expand Down Expand Up @@ -1844,6 +1846,8 @@ class Grouping(object):
obj :
name :
level :
in_axis : whether the Grouping is a column in self.obj and hence is in the
Groupby.exclusions list
Returns
-------
Expand All @@ -1857,14 +1861,15 @@ class Grouping(object):
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True):
sort=True, in_axis=False):

self.name = name
self.level = level
self.grouper = _convert_grouper(index, grouper)
self.index = index
self.sort = sort
self.obj = obj
self.in_axis = in_axis

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
Expand Down Expand Up @@ -2096,23 +2101,43 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):

groupings = []
exclusions = []
for i, (gpr, level) in enumerate(zip(keys, levels)):
name = None

# if the actual grouper should be obj[key]
def is_in_axis(key):
    # a label-like key is accepted as-is, without any lookup
    if _is_label_like(key):
        return True

    # any other key qualifies only if it can be located among obj's items;
    # a failed lookup of any kind means "not in axis"
    try:
        obj._data.items.get_loc(key)
    except Exception:
        return False
    return True

# if the grouper is obj[name]
def is_in_obj(gpr):
try:
obj._data.items.get_loc(gpr)
in_axis = True
return id(gpr) == id(obj[gpr.name])
except Exception:
in_axis = False
return False

for i, (gpr, level) in enumerate(zip(keys, levels)):

if _is_label_like(gpr) or in_axis:
exclusions.append(gpr)
name = gpr
gpr = obj[gpr]
if is_in_obj(gpr): # df.groupby(df['name'])
in_axis, name = True, gpr.name
exclusions.append(name)

elif is_in_axis(gpr): # df.groupby('name')
in_axis, name, gpr = True, gpr, obj[gpr]
exclusions.append(name)

else:
in_axis, name = False, None

if isinstance(gpr, Categorical) and len(gpr) != len(obj):
raise ValueError("Categorical grouper must have len(grouper) == len(data)")

ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort)
ping = Grouping(group_axis, gpr, obj=obj, name=name,
level=level, sort=sort, in_axis=in_axis)

groupings.append(ping)

if len(groupings) == 0:
Expand Down Expand Up @@ -2647,18 +2672,7 @@ def aggregate(self, arg, *args, **kwargs):
result = self._aggregate_generic(arg, *args, **kwargs)

if not self.as_index:
if isinstance(result.index, MultiIndex):
zipped = zip(result.index.levels, result.index.labels,
result.index.names)
for i, (lev, lab, name) in enumerate(zipped):
result.insert(i, name,
com.take_nd(lev.values, lab,
allow_fill=False))
result = result.consolidate()
else:
values = result.index.values
name = self.grouper.groupings[0].name
result.insert(0, name, values)
self._insert_inaxis_grouper_inplace(result)
result.index = np.arange(len(result))

return result.convert_objects()
Expand Down Expand Up @@ -3180,6 +3194,17 @@ def _get_data_to_aggregate(self):
else:
return obj._data, 1

def _insert_inaxis_grouper_inplace(self, result):
# zip in reverse so we can always insert at loc 0
izip = zip(* map(reversed, (
self.grouper.names,
self.grouper.get_group_levels(),
[grp.in_axis for grp in self.grouper.groupings])))

for name, lev, in_axis in izip:
if in_axis:
result.insert(0, name, lev)

def _wrap_aggregated_output(self, output, names=None):
agg_axis = 0 if self.axis == 1 else 1
agg_labels = self._obj_with_exclusions._get_axis(agg_axis)
Expand All @@ -3188,11 +3213,7 @@ def _wrap_aggregated_output(self, output, names=None):

if not self.as_index:
result = DataFrame(output, columns=output_keys)
group_levels = self.grouper.get_group_levels()
zipped = zip(self.grouper.names, group_levels)

for i, (name, labels) in enumerate(zipped):
result.insert(i, name, labels)
self._insert_inaxis_grouper_inplace(result)
result = result.consolidate()
else:
index = self.grouper.result_index
Expand All @@ -3209,11 +3230,7 @@ def _wrap_agged_blocks(self, items, blocks):
mgr = BlockManager(blocks, [items, index])
result = DataFrame(mgr)

group_levels = self.grouper.get_group_levels()
zipped = zip(self.grouper.names, group_levels)

for i, (name, labels) in enumerate(zipped):
result.insert(i, name, labels)
self._insert_inaxis_grouper_inplace(result)
result = result.consolidate()
else:
index = self.grouper.result_index
Expand Down
19 changes: 18 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1499,6 +1499,24 @@ def test_groupby_as_index_agg(self):
result3 = grouped['C'].agg({'Q': np.sum})
assert_frame_equal(result3, expected3)

# GH7115 & GH8112 & GH8582
df = DataFrame(np.random.randint(0, 100, (50, 3)),
columns=['jim', 'joe', 'jolie'])
ts = Series(np.random.randint(5, 10, 50), name='jim')

gr = df.groupby(ts)
_ = gr.nth(0) # invokes _set_selection_from_grouper internally
assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
gr = df.groupby(ts, as_index=False)
left = getattr(gr, attr)()

gr = df.groupby(ts.values, as_index=True)
right = getattr(gr, attr)().reset_index(drop=True)

assert_frame_equal(left, right)

def test_mulitindex_passthru(self):

# GH 7997
Expand Down Expand Up @@ -2565,7 +2583,6 @@ def test_groupby_nonstring_columns(self):
grouped = df.groupby(0)
result = grouped.mean()
expected = df.groupby(df[0]).mean()
del expected[0]
assert_frame_equal(result, expected)

def test_cython_grouper_series_bug_noncontig(self):
Expand Down