-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Bug: Grouping by index and column fails on DataFrame with single index (GH14327) #14333
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a421a52
ec9340f
848c9bb
0f95bca
75a0390
6b37bd4
897ec1c
05e6557
33eb725
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -432,6 +432,35 @@ def _update_inplace(self, result, **kwargs): | |
# guard when called from IndexOpsMixin | ||
raise TypeError("Index can't be updated inplace") | ||
|
||
def _get_grouper_for_level(self, group_mapper, level): | ||
""" | ||
Get index grouper corresponding to an index level | ||
|
||
Parameters | ||
---------- | ||
group_mapper: Group mapping function or None | ||
Function mapping index values to groups | ||
level : int | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. just make this |
||
Index level (Only used by MultiIndex override) | ||
|
||
Returns | ||
------- | ||
grouper : Index | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It needs to always return a tuple (it doesn't matter whether the elements are used or not). |
||
Index of values to group on | ||
labels : None | ||
Array of locations in level_index | ||
(Only returned by MultiIndex override) | ||
level_index : None | ||
Index of unique values for level | ||
(Only returned by MultiIndex override) | ||
""" | ||
if group_mapper is None: | ||
grouper = self | ||
else: | ||
grouper = self.map(group_mapper) | ||
|
||
return grouper, None, None | ||
|
||
def is_(self, other): | ||
""" | ||
More flexible, faster check like ``is`` but that works through views | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -524,6 +524,55 @@ def _format_native_types(self, na_rep='nan', **kwargs): | |
|
||
return mi.values | ||
|
||
def _get_grouper_for_level(self, group_mapper, level): | ||
""" | ||
Get index grouper corresponding to an index level | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a docstring here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, same as above for styling. |
||
Parameters | ||
---------- | ||
group_mapper: Group mapping function or None | ||
Function mapping index values to groups | ||
level : int | ||
Index level | ||
|
||
Returns | ||
------- | ||
grouper : Index | ||
Index of values to group on | ||
labels : ndarray of int or None | ||
Array of locations in level_index | ||
level_index : Index or None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call this `uniques` (these are not actually used anywhere, but are descriptive). We use certain terms in the codebase; this will make it consistent. |
||
Index of unique values for level | ||
""" | ||
inds = self.labels[level] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Call this `indexer`. |
||
level_index = self.levels[level] | ||
|
||
if group_mapper is not None: | ||
# Handle group mapping function and return | ||
level_values = self.levels[level].take(inds) | ||
grouper = level_values.map(group_mapper) | ||
return grouper, None, None | ||
|
||
labels, uniques = algos.factorize(inds, sort=True) | ||
|
||
if len(uniques) > 0 and uniques[0] == -1: | ||
# Handle NAs | ||
mask = inds != -1 | ||
ok_labels, uniques = algos.factorize(inds[mask], | ||
sort=True) | ||
|
||
labels = np.empty(len(inds), dtype=inds.dtype) | ||
labels[mask] = ok_labels | ||
labels[~mask] = -1 | ||
|
||
if len(uniques) < len(level_index): | ||
# Remove unobserved levels from level_index | ||
level_index = level_index.take(uniques) | ||
|
||
grouper = level_index.take(labels) | ||
|
||
return grouper, labels, level_index | ||
|
||
@property | ||
def _constructor(self): | ||
return MultiIndex.from_tuples | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -458,6 +458,39 @@ def test_grouper_creation_bug(self): | |
expected = s.groupby(level='one').sum() | ||
assert_series_equal(result, expected) | ||
|
||
def test_grouper_column_and_index(self): | ||
# GH 14327 | ||
|
||
# Grouping a multi-index frame by a column and an index level should | ||
# be equivalent to resetting the index and grouping by two columns | ||
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), | ||
('b', 1), ('b', 2), ('b', 3)]) | ||
idx.names = ['outer', 'inner'] | ||
df_multi = pd.DataFrame({"A": np.arange(6), | ||
'B': ['one', 'one', 'two', | ||
'two', 'one', 'one']}, | ||
index=idx) | ||
result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you try with these reversed as well, e.g. |
||
expected = df_multi.reset_index().groupby(['B', 'inner']).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
# Test the reverse grouping order | ||
result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() | ||
expected = df_multi.reset_index().groupby(['inner', 'B']).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
# Grouping a single-index frame by a column and the index should | ||
# be equivalent to resetting the index and grouping by two columns | ||
df_single = df_multi.reset_index('outer') | ||
result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() | ||
expected = df_single.reset_index().groupby(['B', 'inner']).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
# Test the reverse grouping order | ||
result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() | ||
expected = df_single.reset_index().groupby(['inner', 'B']).mean() | ||
assert_frame_equal(result, expected) | ||
|
||
def test_grouper_getting_correct_binner(self): | ||
|
||
# GH 10063 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Call this `mapper`.