-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Bug: Grouping by index and column fails on DataFrame with single index (GH14327) #14333
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
a421a52
ec9340f
848c9bb
0f95bca
75a0390
6b37bd4
897ec1c
05e6557
33eb725
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2201,36 +2201,45 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, | |
raise AssertionError('Level %s not in index' % str(level)) | ||
level = index.names.index(level) | ||
|
||
inds = index.labels[level] | ||
level_index = index.levels[level] | ||
|
||
if self.name is None: | ||
self.name = index.names[level] | ||
|
||
# XXX complete hack | ||
if isinstance(index, MultiIndex): | ||
inds = index.labels[level] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this should be a private method in an Index instead (and overridden in MultiIndex). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you mean everything inside the […] (inline code reference lost in extraction)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just take args and return a tuple of things. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Got it, thanks for the clarification. |
||
level_index = index.levels[level] | ||
|
||
if grouper is not None: | ||
level_values = index.levels[level].take(inds) | ||
self.grouper = level_values.map(self.grouper) | ||
else: | ||
# all levels may not be observed | ||
labels, uniques = algos.factorize(inds, sort=True) | ||
# XXX complete hack | ||
|
||
if grouper is not None: | ||
level_values = index.levels[level].take(inds) | ||
self.grouper = level_values.map(self.grouper) | ||
else: | ||
# all levels may not be observed | ||
labels, uniques = algos.factorize(inds, sort=True) | ||
|
||
if len(uniques) > 0 and uniques[0] == -1: | ||
# handle NAs | ||
mask = inds != -1 | ||
ok_labels, uniques = algos.factorize(inds[mask], sort=True) | ||
if len(uniques) > 0 and uniques[0] == -1: | ||
# handle NAs | ||
mask = inds != -1 | ||
ok_labels, uniques = algos.factorize(inds[mask], | ||
sort=True) | ||
|
||
labels = np.empty(len(inds), dtype=inds.dtype) | ||
labels[mask] = ok_labels | ||
labels[~mask] = -1 | ||
labels = np.empty(len(inds), dtype=inds.dtype) | ||
labels[mask] = ok_labels | ||
labels[~mask] = -1 | ||
|
||
if len(uniques) < len(level_index): | ||
level_index = level_index.take(uniques) | ||
if len(uniques) < len(level_index): | ||
level_index = level_index.take(uniques) | ||
|
||
self._labels = labels | ||
self._group_index = level_index | ||
self.grouper = level_index.take(labels) | ||
|
||
# Single level index passed | ||
else: | ||
# Use single level index as grouper if none passed | ||
if grouper is None: | ||
self.grouper = index | ||
|
||
self._labels = labels | ||
self._group_index = level_index | ||
self.grouper = level_index.take(labels) | ||
else: | ||
if isinstance(self.grouper, (list, tuple)): | ||
self.grouper = com._asarray_tuplesafe(self.grouper) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -458,6 +458,28 @@ def test_grouper_creation_bug(self): | |
expected = s.groupby(level='one').sum() | ||
assert_series_equal(result, expected) | ||
|
||
def test_grouper_column_and_index(self): | ||
# GH 14327 | ||
|
||
# Grouping a multi-index frame by a column and an index level should | ||
# be equivalent to resetting the index and grouping by two columns | ||
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), | ||
('b', 1), ('b', 2), ('b', 3)]) | ||
idx.names = ['outer', 'inner'] | ||
df_multi = pd.DataFrame({"A": np.arange(6), | ||
'B': ['one', 'one', 'two', | ||
'two', 'one', 'one']}, | ||
index=idx) | ||
result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you try with these reversed as well, e.g. […] (inline code example lost in extraction)? |
||
expected = df_multi.reset_index().groupby(['B', 'inner']).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
# Grouping a single-index frame by a column and the index should | ||
# be equivalent to resetting the index and grouping by two columns | ||
df_single = df_multi.reset_index('outer') | ||
result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
def test_grouper_getting_correct_binner(self): | ||
|
||
# GH 10063 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move to 0.20.0