diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index dfb5ebc9379b1..ca73351475154 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -313,7 +313,7 @@ Bug Fixes - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - +- Bug in ``groupby(..).nth()`` where the group key is included inconsistently (:issue:`12839`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bea62e98e4a2a..0c3b53aa1419c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -95,7 +95,7 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, @Appender(_doc_template) @Appender(_local_template) def f(self): - self._set_selection_from_grouper() + self._set_group_selection() try: return self._cython_agg_general(alias, numeric_only=numeric_only) except AssertionError as e: @@ -457,8 +457,21 @@ def _selected_obj(self): else: return self.obj[self._selection] - def _set_selection_from_grouper(self): - """ we may need create a selection if we have non-level groupers """ + def _reset_group_selection(self): + """ + Clear group based selection. Used for methods needing to return info on + each group regardless of whether a group selection was previously set. + """ + if self._group_selection is not None: + self._group_selection = None + # GH12839 clear cached selection too when changing group selection + self._reset_cache('_selected_obj') + + def _set_group_selection(self): + """ + Create group based selection. Used when selection is not passed + directly but instead via a grouper. + """ grp = self.grouper if self.as_index and getattr(grp, 'groupings', None) is not None and \ self.obj.ndim > 1: @@ -468,6 +481,8 @@ def _set_selection_from_grouper(self): if len(groupers): self._group_selection = ax.difference(Index(groupers)).tolist() + # GH12839 clear selected obj cache when group selection changes + self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -511,7 +526,7 @@ def _make_wrapper(self, name): # need to setup the selection # as are not passed directly but in the grouper - self._set_selection_from_grouper() + self._set_group_selection() f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): @@ -979,7 +994,7 @@ def mean(self, *args, **kwargs): except GroupByError: raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.mean(axis=self.axis) return self._python_agg_general(f) @@ -997,7 +1012,7 @@ def median(self): raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() def f(x): if isinstance(x, np.ndarray): @@ -1040,7 +1055,7 @@ def var(self, ddof=1, *args, **kwargs): if ddof == 1: return self._cython_agg_general('var') else: - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.var(ddof=ddof) return self._python_agg_general(f) @@ -1216,7 +1231,7 @@ def nth(self, n, dropna=None): raise TypeError("n needs to be an int or a list/set/tuple of ints") nth_values = np.array(nth_values, dtype=np.intp) - self._set_selection_from_grouper() + self._set_group_selection() if not dropna: mask = np.in1d(self._cumcount_array(), nth_values) | \ @@ -1324,7 +1339,7 @@ def cumcount(self, ascending=True): dtype: int64 """ - self._set_selection_from_grouper() + self._set_group_selection() index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) @@ -1402,6 +1417,7 @@ def head(self, n=5): 0 1 2 2 5 6 """ + self._reset_group_selection() mask = self._cumcount_array() < n return self._selected_obj[mask] @@ -1428,6 +1444,7 @@ def tail(self, n=5): 0 a 1 2 b 1 """ + self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6659e6b106a67..139f0472f7480 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -354,6 +354,35 @@ def test_nth_multi_index_as_expected(self): names=['A', 'B'])) assert_frame_equal(result, expected) + def test_group_selection_cache(self): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index @@ -6107,7 +6136,7 @@ def test_cython_transform(self): # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior if op == 'shift': - gb._set_selection_from_grouper() + gb._set_group_selection() for (op, args), targop in ops: if op != 'shift' and 'int' not in gb_target: