Skip to content

Commit 6e758b7

Browse files
committed
Merge pull request #6569 from hayd/groupby_nth
ENH/BUG groupby nth now filters, works with DataFrames
2 parents fb1b4a9 + feaca40 commit 6e758b7

File tree

5 files changed

+157
-18
lines changed

5 files changed

+157
-18
lines changed

doc/source/groupby.rst

+28
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,34 @@ This shows the first or last n rows from each group.
738738
1 0 1 2
739739
5 2 5 6
740740
741+
Taking the nth row of each group
742+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
743+
744+
To select the nth item from a DataFrame or Series, use the ``nth`` method:
745+
746+
.. ipython:: python
747+
748+
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
749+
g = df.groupby('A')
750+
g.nth(0)
751+
752+
g.nth(1)
753+
754+
g.nth(-1)
755+
756+
If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'``, just like you would pass to ``dropna``; for a Series this just needs to be truthy.
757+
758+
.. ipython:: python
759+
760+
g.nth(0, dropna='any')
761+
762+
g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna
763+
764+
g.B.nth(0, dropna=True)
765+
766+
.. warning::
767+
768+
Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby.
741769

742770
Enumerate group items
743771
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.14.0.txt

+11-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ These are out-of-bounds selections
6262
s.index.year
6363

6464
- More consistent behaviour for some groupby methods:
65-
- groupby head and tail now act more like filter rather than an aggregation:
65+
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:
6666

6767
.. ipython:: python
6868

@@ -78,6 +78,16 @@ These are out-of-bounds selections
7878

7979
g[['B']].head(1)
8080

81+
- groupby ``nth`` now filters by default, with optional dropna argument to ignore
82+
NaN (to replicate the previous behaviour).
83+
84+
.. ipython:: python
85+
86+
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
87+
g = df.groupby('A')
88+
g.nth(0) # can also use negative ints
89+
90+
g.nth(0, dropna='any') # similar to old behaviour
8191

8292
- Local variable usage has changed in
8393
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`

pandas/core/groupby.py

+75-11
Original file line numberDiff line numberDiff line change
@@ -523,15 +523,75 @@ def ohlc(self):
523523
"""
524524
return self._cython_agg_general('ohlc')
525525

526-
def nth(self, n):
527-
def picker(arr):
528-
arr = arr[notnull(arr)]
529-
if len(arr) >= n + 1:
530-
return arr.iget(n)
526+
def nth(self, n, dropna=None):
527+
"""
528+
Take the nth row from each group.
529+
530+
If dropna, will take the nth non-null row, dropna is either
531+
Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent
532+
to calling dropna(how=dropna) before the groupby.
533+
534+
Examples
535+
--------
536+
>>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
537+
>>> g = df.groupby('A')
538+
>>> g.nth(0)
539+
A B
540+
0 1 NaN
541+
2 5 6
542+
>>> g.nth(1)
543+
A B
544+
1 1 4
545+
>>> g.nth(-1)
546+
A B
547+
1 1 4
548+
2 5 6
549+
>>> g.nth(0, dropna='any')
550+
B
551+
A
552+
1 4
553+
5 6
554+
>>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna
555+
B
556+
A
557+
1 NaN
558+
5 NaN
559+
560+
"""
561+
562+
if not dropna: # good choice
563+
m = self.grouper._max_groupsize
564+
if n >= m or n < -m:
565+
return self._selected_obj.loc[[]]
566+
rng = np.zeros(m, dtype=bool)
567+
if n >= 0:
568+
rng[n] = True
569+
is_nth = self._cumcount_array(rng)
531570
else:
571+
rng[- n - 1] = True
572+
is_nth = self._cumcount_array(rng, ascending=False)
573+
return self._selected_obj[is_nth]
574+
575+
if (isinstance(self._selected_obj, DataFrame)
576+
and dropna not in ['any', 'all']):
577+
# Note: when agg-ing picker doesn't raise this, just returns NaN
578+
raise ValueError("For a DataFrame groupby, dropna must be "
579+
"either None, 'any' or 'all', "
580+
"(was passed %s)." % (dropna),)
581+
582+
# old behaviour, but with all and any support for DataFrames.
583+
584+
max_len = n if n >= 0 else - 1 - n
585+
def picker(x):
586+
x = x.dropna(how=dropna) # Note: how is ignored if Series
587+
if len(x) <= max_len:
532588
return np.nan
589+
else:
590+
return x.iloc[n]
591+
533592
return self.agg(picker)
534593

594+
535595
def cumcount(self, **kwargs):
536596
"""
537597
Number each item in each group from 0 to the length of that group - 1.
@@ -579,8 +639,7 @@ def cumcount(self, **kwargs):
579639
ascending = kwargs.pop('ascending', True)
580640

581641
index = self.obj.index
582-
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
583-
cumcounts = self._cumcount_array(rng, ascending=ascending)
642+
cumcounts = self._cumcount_array(ascending=ascending)
584643
return Series(cumcounts, index)
585644

586645
def head(self, n=5):
@@ -606,8 +665,7 @@ def head(self, n=5):
606665
607666
"""
608667
obj = self._selected_obj
609-
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
610-
in_head = self._cumcount_array(rng) < n
668+
in_head = self._cumcount_array() < n
611669
head = obj[in_head]
612670
return head
613671

@@ -639,11 +697,17 @@ def tail(self, n=5):
639697
tail = obj[in_tail]
640698
return tail
641699

642-
def _cumcount_array(self, arr, **kwargs):
700+
def _cumcount_array(self, arr=None, **kwargs):
701+
"""
702+
arr is where cumcount gets its values from
703+
"""
643704
ascending = kwargs.pop('ascending', True)
644705

706+
if arr is None:
707+
arr = np.arange(self.grouper._max_groupsize, dtype='int64')
708+
645709
len_index = len(self.obj.index)
646-
cumcounts = np.zeros(len_index, dtype='int64')
710+
cumcounts = np.empty(len_index, dtype=arr.dtype)
647711
if ascending:
648712
for v in self.indices.values():
649713
cumcounts[v] = arr[:len(v)]

pandas/tests/test_groupby.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -156,19 +156,18 @@ def test_first_last_nth(self):
156156
assert_frame_equal(last, expected, check_names=False)
157157

158158
nth = grouped.nth(1)
159-
expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
160-
expected.index = ['bar', 'foo']
159+
expected = self.df.iloc[[2, 3]]
161160
assert_frame_equal(nth, expected, check_names=False)
162161

163162
# it works!
164163
grouped['B'].first()
165164
grouped['B'].last()
166165
grouped['B'].nth(0)
167166

168-
self.df['B'][self.df['A'] == 'foo'] = np.nan
167+
self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
169168
self.assert_(com.isnull(grouped['B'].first()['foo']))
170169
self.assert_(com.isnull(grouped['B'].last()['foo']))
171-
self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
170+
self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing
172171

173172
def test_first_last_nth_dtypes(self):
174173

@@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self):
189188
assert_frame_equal(last, expected, check_names=False)
190189

191190
nth = grouped.nth(1)
192-
expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
193-
expected.index = ['bar', 'foo']
191+
expected = df.iloc[[2, 3]]
194192
assert_frame_equal(nth, expected, check_names=False)
195193

196194
# GH 2763, first/last shifting dtypes
@@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self):
201199
f = s.groupby(level=0).first()
202200
self.assertEqual(f.dtype, 'int64')
203201

202+
def test_nth(self):
203+
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
204+
g = df.groupby('A')
205+
206+
assert_frame_equal(g.nth(0), df.iloc[[0, 2]])
207+
assert_frame_equal(g.nth(1), df.iloc[[1]])
208+
assert_frame_equal(g.nth(2), df.loc[[]])
209+
assert_frame_equal(g.nth(-1), df.iloc[[1, 2]])
210+
assert_frame_equal(g.nth(-2), df.iloc[[0]])
211+
assert_frame_equal(g.nth(-3), df.loc[[]])
212+
assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
213+
assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
214+
assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']])
215+
216+
exp = df.set_index('A')
217+
assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
218+
assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
219+
220+
exp['B'] = np.nan
221+
assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
222+
assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
223+
224+
204225
def test_grouper_index_types(self):
205226
# related GH5375
206227
# groupby misbehaving when using a Floatlike index

vb_suite/groupby.py

+16
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,22 @@ def f(g):
269269
groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup,
270270
start_date=datetime(2011, 10, 1))
271271

272+
273+
#----------------------------------------------------------------------
274+
# DataFrame nth
275+
276+
setup = common_setup + """
277+
df = pd.DataFrame(np.random.randint(1, 100, (10000, 2)))
278+
"""
279+
280+
# Not really a fair test as behaviour has changed!
281+
groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
282+
start_date=datetime(2014, 3, 1))
283+
284+
groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
285+
start_date=datetime(2014, 3, 1))
286+
287+
272288
#----------------------------------------------------------------------
273289
# Sum booleans #2692
274290

0 commit comments

Comments
 (0)