Skip to content

Commit 1bab0a2

Browse files
committed
Merge pull request #6533 from hayd/groupby_head
API change in groupby head and tail
2 parents baa4c1d + 1fbc534 commit 1bab0a2

File tree

4 files changed

+88
-26
lines changed

4 files changed

+88
-26
lines changed

doc/source/groupby.rst

+32
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,38 @@ can be used as group keys. If so, the order of the levels will be preserved:
707707
708708
data.groupby(factor).mean()
709709
710+
711+
Taking the first rows of each group
712+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
713+
714+
Just like for a DataFrame or Series, you can call ``head`` and ``tail`` on a groupby:
715+
716+
.. ipython:: python
717+
718+
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
719+
df
720+
721+
g = df.groupby('A')
722+
g.head(1)
723+
724+
g.tail(1)
725+
726+
This shows the first or last n rows from each group.
727+
728+
.. warning::
729+
730+
Before 0.14.0 this was implemented with a fall-through apply,
731+
so the result would incorrectly respect the as_index flag:
732+
733+
.. code-block:: python
734+
735+
>>> g.head(1)  # was equivalent to g.apply(lambda x: x.head(1))
736+
A B
737+
A
738+
1 0 1 2
739+
5 2 5 6
740+
741+
710742
Enumerate group items
711743
~~~~~~~~~~~~~~~~~~~~~
712744

doc/source/v0.14.0.txt

+18
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,24 @@ These are out-of-bounds selections
6161
s.year
6262
s.index.year
6363

64+
- More consistent behaviour for some groupby methods:
65+
- groupby ``head`` and ``tail`` now act more like ``filter`` than like an aggregation:
66+
67+
.. ipython:: python
68+
69+
df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
70+
g = df.groupby('A')
71+
g.head(1) # filters DataFrame
72+
73+
g.apply(lambda x: x.head(1)) # used to simply fall-through
74+
75+
- groupby head and tail respect column selection:
76+
77+
.. ipython:: python
78+
79+
g[['B']].head(1)
80+
81+
6482
- Local variable usage has changed in
6583
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
6684
(:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have

pandas/core/groupby.py

+21-16
Original file line numberDiff line numberDiff line change
@@ -587,7 +587,8 @@ def head(self, n=5):
587587
"""
588588
Returns first n rows of each group.
589589
590-
Essentially equivalent to ``.apply(lambda x: x.head(n))``
590+
Essentially equivalent to ``.apply(lambda x: x.head(n))``,
591+
except ignores as_index flag.
591592
592593
Example
593594
-------
@@ -599,24 +600,23 @@ def head(self, n=5):
599600
0 1 2
600601
2 5 6
601602
>>> df.groupby('A').head(1)
602-
A B
603-
A
604-
1 0 1 2
605-
5 2 5 6
603+
A B
604+
0 1 2
605+
2 5 6
606606
607607
"""
608+
obj = self._selected_obj
608609
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
609610
in_head = self._cumcount_array(rng) < n
610-
head = self.obj[in_head]
611-
if self.as_index:
612-
head.index = self._index_with_as_index(in_head)
611+
head = obj[in_head]
613612
return head
614613

615614
def tail(self, n=5):
616615
"""
617616
Returns last n rows of each group
618617
619-
Essentially equivalent to ``.apply(lambda x: x.tail(n))``
618+
Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
619+
except ignores as_index flag.
620620
621621
Example
622622
-------
@@ -628,17 +628,15 @@ def tail(self, n=5):
628628
0 1 2
629629
2 5 6
630630
>>> df.groupby('A').head(1)
631-
A B
632-
A
633-
1 0 1 2
634-
5 2 5 6
631+
A B
632+
0 1 2
633+
2 5 6
635634
636635
"""
636+
obj = self._selected_obj
637637
rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
638638
in_tail = self._cumcount_array(rng, ascending=False) > -n
639-
tail = self.obj[in_tail]
640-
if self.as_index:
641-
tail.index = self._index_with_as_index(in_tail)
639+
tail = obj[in_tail]
642640
return tail
643641

644642
def _cumcount_array(self, arr, **kwargs):
@@ -654,6 +652,13 @@ def _cumcount_array(self, arr, **kwargs):
654652
cumcounts[v] = arr[len(v)-1::-1]
655653
return cumcounts
656654

655+
@cache_readonly
656+
def _selected_obj(self):
657+
if self._selection is None or isinstance(self.obj, Series):
658+
return self.obj
659+
else:
660+
return self.obj[self._selection]
661+
657662
def _index_with_as_index(self, b):
658663
"""
659664
Take boolean mask of index to be returned from apply, if as_index=True

pandas/tests/test_groupby.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -1315,12 +1315,10 @@ def test_groupby_as_index_apply(self):
13151315
g_not_as = df.groupby('user_id', as_index=False)
13161316

13171317
res_as = g_as.head(2).index
1318-
exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)])
1319-
assert_index_equal(res_as, exp_as)
1320-
13211318
res_not_as = g_not_as.head(2).index
1322-
exp_not_as = Index([0, 1, 2, 4])
1323-
assert_index_equal(res_not_as, exp_not_as)
1319+
exp = Index([0, 1, 2, 4])
1320+
assert_index_equal(res_as, exp)
1321+
assert_index_equal(res_not_as, exp)
13241322

13251323
res_as_apply = g_as.apply(lambda x: x.head(2)).index
13261324
res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
@@ -1355,11 +1353,8 @@ def test_groupby_head_tail(self):
13551353
assert_frame_equal(df, g_not_as.head(7)) # contains all
13561354
assert_frame_equal(df, g_not_as.tail(7))
13571355

1358-
# as_index=True, yuck
1359-
# prepend the A column as an index, in a roundabout way
1360-
df_as = df.copy()
1361-
df_as.index = df.set_index('A', append=True,
1362-
drop=False).index.swaplevel(0, 1)
1356+
# as_index=True, (used to be different)
1357+
df_as = df
13631358

13641359
assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
13651360
assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
@@ -1373,6 +1368,18 @@ def test_groupby_head_tail(self):
13731368
assert_frame_equal(df_as, g_as.head(7)) # contains all
13741369
assert_frame_equal(df_as, g_as.tail(7))
13751370

1371+
# test with selection
1372+
assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []])
1373+
assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']])
1374+
assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']])
1375+
assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]])
1376+
1377+
assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []])
1378+
assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']])
1379+
assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']])
1380+
assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]])
1381+
1382+
13761383
def test_groupby_multiple_key(self):
13771384
df = tm.makeTimeDataFrame()
13781385
grouped = df.groupby([lambda x: x.year,

0 commit comments

Comments
 (0)