-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: groupby.nth should be a filter #49262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
15a3aa7
0e2e5ee
4d89459
59e883f
f9f1066
cd74b98
4b0a101
5c308e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3001,97 +3001,67 @@ def nth( | |
... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) | ||
>>> g = df.groupby('A') | ||
>>> g.nth(0) | ||
B | ||
A | ||
1 NaN | ||
2 3.0 | ||
A B | ||
0 1 NaN | ||
2 2 3.0 | ||
>>> g.nth(1) | ||
B | ||
A | ||
1 2.0 | ||
2 5.0 | ||
A B | ||
1 1 2.0 | ||
4 2 5.0 | ||
>>> g.nth(-1) | ||
B | ||
A | ||
1 4.0 | ||
2 5.0 | ||
A B | ||
3 1 4.0 | ||
4 2 5.0 | ||
>>> g.nth([0, 1]) | ||
B | ||
A | ||
1 NaN | ||
1 2.0 | ||
2 3.0 | ||
2 5.0 | ||
A B | ||
0 1 NaN | ||
1 1 2.0 | ||
2 2 3.0 | ||
4 2 5.0 | ||
>>> g.nth(slice(None, -1)) | ||
B | ||
A | ||
1 NaN | ||
1 2.0 | ||
2 3.0 | ||
A B | ||
0 1 NaN | ||
1 1 2.0 | ||
2 2 3.0 | ||
|
||
Index notation may also be used | ||
|
||
>>> g.nth[0, 1] | ||
B | ||
A | ||
1 NaN | ||
1 2.0 | ||
2 3.0 | ||
2 5.0 | ||
A B | ||
0 1 NaN | ||
1 1 2.0 | ||
2 2 3.0 | ||
4 2 5.0 | ||
>>> g.nth[:-1] | ||
B | ||
A | ||
1 NaN | ||
1 2.0 | ||
2 3.0 | ||
A B | ||
0 1 NaN | ||
1 1 2.0 | ||
2 2 3.0 | ||
|
||
Specifying `dropna` allows count ignoring ``NaN`` | ||
|
||
>>> g.nth(0, dropna='any') | ||
B | ||
A | ||
1 2.0 | ||
2 3.0 | ||
A B | ||
1 1 2.0 | ||
2 2 3.0 | ||
|
||
NaNs denote group exhausted when using dropna | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This description probably needs updating |
||
|
||
>>> g.nth(3, dropna='any') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think this example should be shown in the whatsnew? On the surface, this example appears to be quite different from before |
||
B | ||
A | ||
1 NaN | ||
2 NaN | ||
|
||
Specifying `as_index=False` in `groupby` keeps the original index. | ||
|
||
>>> df.groupby('A', as_index=False).nth(1) | ||
A B | ||
1 1 2.0 | ||
4 2 5.0 | ||
Empty DataFrame | ||
Columns: [A, B] | ||
Index: [] | ||
""" | ||
if not dropna: | ||
with self._group_selection_context(): | ||
mask = self._make_mask_from_positional_indexer(n) | ||
mask = self._make_mask_from_positional_indexer(n) | ||
|
||
ids, _, _ = self.grouper.group_info | ||
ids, _, _ = self.grouper.group_info | ||
|
||
# Drop NA values in grouping | ||
mask = mask & (ids != -1) | ||
# Drop NA values in grouping | ||
mask = mask & (ids != -1) | ||
|
||
out = self._mask_selected_obj(mask) | ||
if not self.as_index: | ||
return out | ||
|
||
result_index = self.grouper.result_index | ||
if self.axis == 0: | ||
out.index = result_index[ids[mask]] | ||
if not self.observed and isinstance(result_index, CategoricalIndex): | ||
out = out.reindex(result_index) | ||
|
||
out = self._reindex_output(out) | ||
else: | ||
out.columns = result_index[ids[mask]] | ||
|
||
return out.sort_index(axis=self.axis) if self.sort else out | ||
out = self._mask_selected_obj(mask) | ||
return out | ||
|
||
# dropna is truthy | ||
if not is_integer(n): | ||
|
@@ -3108,7 +3078,6 @@ def nth( | |
# old behaviour, but with all and any support for DataFrames. | ||
# modified in GH 7559 to have better perf | ||
n = cast(int, n) | ||
max_len = n if n >= 0 else -1 - n | ||
dropped = self.obj.dropna(how=dropna, axis=self.axis) | ||
|
||
# get a new grouper for our dropped obj | ||
|
@@ -3138,22 +3107,7 @@ def nth( | |
grb = dropped.groupby( | ||
grouper, as_index=self.as_index, sort=self.sort, axis=self.axis | ||
) | ||
sizes, result = grb.size(), grb.nth(n) | ||
mask = (sizes < max_len)._values | ||
|
||
# set the results which don't meet the criteria | ||
if len(result) and mask.any(): | ||
result.loc[mask] = np.nan | ||
|
||
# reset/reindex to the original groups | ||
if len(self.obj) == len(dropped) or len(result) == len( | ||
self.grouper.result_index | ||
): | ||
result.index = self.grouper.result_index | ||
else: | ||
result = result.reindex(self.grouper.result_index) | ||
|
||
return result | ||
return grb.nth(n) | ||
|
||
@final | ||
def quantile( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Generally like the consistency now that `nth` is a filter, but I think this should be called out in its own "notable bug fix" section