diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index a88b7332d9b9e..76c4280d8b728 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -707,6 +707,38 @@ can be used as group keys. If so, the order of the levels will be preserved: data.groupby(factor).mean() + +Taking the first rows of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Just like for a DataFrame or Series you can call head and tail on a groupby: + +.. ipython:: python + + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df + + g = df.groupby('A') + g.head(1) + + g.tail(1) + +This shows the first or last n rows from each group. + +.. warning:: + + Before 0.14.0 this was implemented with a fall-through apply, + so the result would incorrectly respect the as_index flag: + + .. code-block:: python + + >>> g.head(1) # was equivalent to g.apply(lambda x: x.head(1)) + A B + A + 1 0 1 2 + 5 2 5 6 + + Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 4432e9e891e7d..a3d6d255db9a9 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -61,6 +61,24 @@ These are out-of-bounds selections s.year s.index.year +- More consistent behaviour for some groupby methods: + - groupby head and tail now act more like filter rather than an aggregation: + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.head(1) # filters DataFrame + + g.apply(lambda x: x.head(1)) # used to simply fall through + + - groupby head and tail respect column selection: + + .. ipython:: python + + g[['B']].head(1) + + - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` (:issue:`5987`). 
For the :class:`~pandas.DataFrame` methods, two things have diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f0588524e16eb..598df5507fa69 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -587,7 +587,8 @@ def head(self, n=5): """ Returns first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))`` + Essentially equivalent to ``.apply(lambda x: x.head(n))``, + except ignores as_index flag. Example ------- @@ -599,24 +600,23 @@ def head(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ + obj = self._selected_obj rng = np.arange(self.grouper._max_groupsize, dtype='int64') in_head = self._cumcount_array(rng) < n - head = self.obj[in_head] - if self.as_index: - head.index = self._index_with_as_index(in_head) + head = obj[in_head] return head def tail(self, n=5): """ Returns last n rows of each group - Essentially equivalent to ``.apply(lambda x: x.tail(n))`` + Essentially equivalent to ``.apply(lambda x: x.tail(n))``, + except ignores as_index flag. 
Example ------- @@ -628,17 +628,15 @@ def tail(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ + obj = self._selected_obj rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') in_tail = self._cumcount_array(rng, ascending=False) > -n - tail = self.obj[in_tail] - if self.as_index: - tail.index = self._index_with_as_index(in_tail) + tail = obj[in_tail] return tail def _cumcount_array(self, arr, **kwargs): @@ -654,6 +652,13 @@ def _cumcount_array(self, arr, **kwargs): cumcounts[v] = arr[len(v)-1::-1] return cumcounts + @cache_readonly + def _selected_obj(self): + if self._selection is None or isinstance(self.obj, Series): + return self.obj + else: + return self.obj[self._selection] + def _index_with_as_index(self, b): """ Take boolean mask of index to be returned from apply, if as_index=True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4eee1d3a212e0..8af11c8bf77e1 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1315,12 +1315,10 @@ def test_groupby_as_index_apply(self): g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index - exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)]) - assert_index_equal(res_as, exp_as) - res_not_as = g_not_as.head(2).index - exp_not_as = Index([0, 1, 2, 4]) - assert_index_equal(res_not_as, exp_not_as) + exp = Index([0, 1, 2, 4]) + assert_index_equal(res_as, exp) + assert_index_equal(res_not_as, exp) res_as_apply = g_as.apply(lambda x: x.head(2)).index res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index @@ -1355,11 +1353,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df, g_not_as.head(7)) # contains all assert_frame_equal(df, g_not_as.tail(7)) - # as_index=True, yuck - # prepend the A column as an index, in a roundabout way - df_as = df.copy() - df_as.index = df.set_index('A', append=True, - drop=False).index.swaplevel(0, 1) + # as_index=True, (used 
to be different) + df_as = df assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) @@ -1373,6 +1368,18 @@ def test_groupby_head_tail(self): assert_frame_equal(df_as, g_as.head(7)) # contains all assert_frame_equal(df_as, g_as.tail(7)) + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year,