-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Groupby.nth includes group key inconsistently #12839 #13316
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,7 +95,7 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, | |
@Appender(_doc_template) | ||
@Appender(_local_template) | ||
def f(self): | ||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
try: | ||
return self._cython_agg_general(alias, numeric_only=numeric_only) | ||
except AssertionError as e: | ||
|
@@ -457,8 +457,21 @@ def _selected_obj(self): | |
else: | ||
return self.obj[self._selection] | ||
|
||
def _set_selection_from_grouper(self): | ||
""" we may need create a selection if we have non-level groupers """ | ||
def _clear_group_selection(self): | ||
""" | ||
Clear group based selection. Used for methods needing to return info on | ||
each group regardless of whether a group selection was previously set. | ||
""" | ||
if self._group_selection is not None: | ||
self._group_selection = None | ||
# GH12839 clear cached selection too when changing group selection | ||
self._reset_cache('_selected_obj') | ||
|
||
def _set_group_selection(self): | ||
""" | ||
Create group based selection. Used when selection is not passed | ||
directly but instead via a grouper. | ||
""" | ||
grp = self.grouper | ||
if self.as_index and getattr(grp, 'groupings', None) is not None and \ | ||
self.obj.ndim > 1: | ||
|
@@ -468,6 +481,8 @@ def _set_selection_from_grouper(self): | |
|
||
if len(groupers): | ||
self._group_selection = ax.difference(Index(groupers)).tolist() | ||
# GH12839 clear selected obj cache when group selection changes | ||
self._reset_cache('_selected_obj') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree I think this is sufficient There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm I think maybe move the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure, I think |
||
|
||
def _set_result_index_ordered(self, result): | ||
# set the result index on the passed values object and | ||
|
@@ -511,7 +526,7 @@ def _make_wrapper(self, name): | |
|
||
# need to setup the selection | ||
# as are not passed directly but in the grouper | ||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
|
||
f = getattr(self._selected_obj, name) | ||
if not isinstance(f, types.MethodType): | ||
|
@@ -979,7 +994,7 @@ def mean(self, *args, **kwargs): | |
except GroupByError: | ||
raise | ||
except Exception: # pragma: no cover | ||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
f = lambda x: x.mean(axis=self.axis) | ||
return self._python_agg_general(f) | ||
|
||
|
@@ -997,7 +1012,7 @@ def median(self): | |
raise | ||
except Exception: # pragma: no cover | ||
|
||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
|
||
def f(x): | ||
if isinstance(x, np.ndarray): | ||
|
@@ -1040,7 +1055,7 @@ def var(self, ddof=1, *args, **kwargs): | |
if ddof == 1: | ||
return self._cython_agg_general('var') | ||
else: | ||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
f = lambda x: x.var(ddof=ddof) | ||
return self._python_agg_general(f) | ||
|
||
|
@@ -1216,7 +1231,7 @@ def nth(self, n, dropna=None): | |
raise TypeError("n needs to be an int or a list/set/tuple of ints") | ||
|
||
nth_values = np.array(nth_values, dtype=np.intp) | ||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
|
||
if not dropna: | ||
mask = np.in1d(self._cumcount_array(), nth_values) | \ | ||
|
@@ -1324,7 +1339,7 @@ def cumcount(self, ascending=True): | |
dtype: int64 | ||
""" | ||
|
||
self._set_selection_from_grouper() | ||
self._set_group_selection() | ||
|
||
index = self._selected_obj.index | ||
cumcounts = self._cumcount_array(ascending=ascending) | ||
|
@@ -1402,6 +1417,7 @@ def head(self, n=5): | |
0 1 2 | ||
2 5 6 | ||
""" | ||
self._clear_group_selection() | ||
mask = self._cumcount_array() < n | ||
return self._selected_obj[mask] | ||
|
||
|
@@ -1428,6 +1444,7 @@ def tail(self, n=5): | |
0 a 1 | ||
2 b 1 | ||
""" | ||
self._clear_group_selection() | ||
mask = self._cumcount_array(ascending=False) < n | ||
return self._selected_obj[mask] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -354,6 +354,35 @@ def test_nth_multi_index_as_expected(self): | |
names=['A', 'B'])) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_group_selection_cache(self): | ||
# GH 12839 nth, head, and tail should return same result consistently | ||
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) | ||
expected = df.iloc[[0, 2]].set_index('A') | ||
|
||
g = df.groupby('A') | ||
result1 = g.head(n=2) | ||
result2 = g.nth(0) | ||
assert_frame_equal(result1, df) | ||
assert_frame_equal(result2, expected) | ||
|
||
g = df.groupby('A') | ||
result1 = g.tail(n=2) | ||
result2 = g.nth(0) | ||
assert_frame_equal(result1, df) | ||
assert_frame_equal(result2, expected) | ||
|
||
g = df.groupby('A') | ||
result1 = g.nth(0) | ||
result2 = g.head(n=2) | ||
assert_frame_equal(result1, expected) | ||
assert_frame_equal(result2, df) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you meant with doing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no I mean
e.g. it was 2 of the same calls in a row that was triggering yes? want to have an exact replica of the original failure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compare each result as well (e.g. don't just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, the issue was switching between two methods that either use a group selection ( |
||
g = df.groupby('A') | ||
result1 = g.nth(0) | ||
result2 = g.tail(n=2) | ||
assert_frame_equal(result1, expected) | ||
assert_frame_equal(result2, df) | ||
|
||
def test_grouper_index_types(self): | ||
# related GH5375 | ||
# groupby misbehaving when using a Floatlike index | ||
|
@@ -6107,7 +6136,7 @@ def test_cython_transform(self): | |
# bit a of hack to make sure the cythonized shift | ||
# is equivalent to pre 0.17.1 behavior | ||
if op == 'shift': | ||
gb._set_selection_from_grouper() | ||
gb._set_group_selection() | ||
|
||
for (op, args), targop in ops: | ||
if op != 'shift' and 'int' not in gb_target: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
call this
_reset_group_selection
. sorry to be a pain about the names, but naming is very important for consistency.