Skip to content

Commit a98035c

Browse files
committed
ENH: add Cython nth/last functions, vbenchmarks. close #1043
1 parent 59f0ee7 commit a98035c

File tree

5 files changed

+238
-18
lines changed

5 files changed

+238
-18
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ pandas 0.8.0
3939
sense indexing/selection functionality
4040
- Series/DataFrame.update methods, in-place variant of combine_first (#961)
4141
- Add ``match`` function to API (#502)
42+
- Add Cython-optimized first, last, min, max, prod functions to GroupBy (#994,
43+
#1043)
4244

4345
**Improvements to existing features**
4446

pandas/core/groupby.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,20 @@ def f(self):
3030

3131
return f
3232

33+
def _first_compat(x, axis=0):
    """
    Return the first non-null entry of ``x``, or NaN if there is none.

    ``x`` is converted with ``np.asarray`` and filtered through
    ``com.notnull`` before the leading element is taken.  ``axis`` is
    accepted in the signature but is not consulted.
    """
    values = np.asarray(x)
    valid = values[com.notnull(values)]
    return valid[0] if len(valid) > 0 else np.nan
39+
40+
def _last_compat(x, axis=0):
    """
    Return the last non-null entry of ``x``, or NaN if there is none.

    ``x`` is converted with ``np.asarray`` and filtered through
    ``com.notnull`` before the trailing element is taken.  ``axis`` is
    accepted in the signature but is not consulted.
    """
    values = np.asarray(x)
    valid = values[com.notnull(values)]
    return valid[-1] if len(valid) > 0 else np.nan
46+
3347

3448
class GroupBy(object):
3549
"""
@@ -314,6 +328,8 @@ def size(self):
314328
prod = _groupby_function('prod', 'prod', np.prod)
315329
min = _groupby_function('min', 'min', np.min)
316330
max = _groupby_function('max', 'max', np.max)
331+
first = _groupby_function('first', 'first', _first_compat)
332+
last = _groupby_function('last', 'last', _last_compat)
317333

318334
def ohlc(self):
319335
"""
@@ -323,11 +339,11 @@ def ohlc(self):
323339
"""
324340
return self._cython_agg_general('ohlc')
325341

326-
def last(self):
327-
return self.nth(-1)
342+
# def last(self):
343+
# return self.nth(-1)
328344

329-
def first(self):
330-
return self.nth(0)
345+
# def first(self):
346+
# return self.nth(0)
331347

332348
def nth(self, n):
333349
def picker(arr):
@@ -621,7 +637,9 @@ def get_group_levels(self):
621637
'max' : lib.group_max,
622638
'mean' : lib.group_mean,
623639
'var' : lib.group_var,
624-
'std' : lib.group_var
640+
'std' : lib.group_var,
641+
'first': lambda a, b, c, d: lib.group_nth(a, b, c, d, 1),
642+
'last': lib.group_last
625643
}
626644

627645
_cython_transforms = {
@@ -858,7 +876,9 @@ def names(self):
858876
'max' : lib.group_max_bin,
859877
'var' : lib.group_var_bin,
860878
'std' : lib.group_var_bin,
861-
'ohlc' : lib.group_ohlc
879+
'ohlc' : lib.group_ohlc,
880+
'first': lambda a, b, c, d: lib.group_nth_bin(a, b, c, d, 1),
881+
'last': lib.group_last_bin
862882
}
863883

864884
_name_functions = {

pandas/src/groupby.pyx

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,188 @@ def group_prod(ndarray[float64_t, ndim=2] out,
330330
else:
331331
out[i, j] = prodx[i, j]
332332

333+
#----------------------------------------------------------------------
334+
# first, nth, last
335+
336+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_nth(ndarray[float64_t, ndim=2] out,
              ndarray[int64_t] counts,
              ndarray[float64_t, ndim=2] values,
              ndarray[int64_t] labels, int64_t rank):
    '''
    Only aggregates on axis=0

    For every (group, column) pair, write to `out` the `rank`-th non-NaN
    value met while scanning `values` row by row (`rank` is 1-based, so
    rank=1 selects the first valid observation).  NaN is written when a
    group/column has fewer than `rank` valid observations.

    Parameters
    ----------
    out : float64 2-D array, filled in place with the result
    counts : int64 1-D array, incremented in place once per row assigned
        to a group (NaN rows included)
    values : float64 2-D array of observations
    labels : int64 1-D array giving the group of each row of `values`;
        rows with a negative label are skipped
    rank : 1-based position of the valid observation to select
    '''
    cdef:
        Py_ssize_t i, j, N, K, lab
        float64_t val
        ndarray[float64_t, ndim=2] resx
        ndarray[int64_t, ndim=2] nobs

    nobs = np.zeros((<object> out).shape, dtype=np.int64)
    # resx is deliberately left uninitialised; a cell is only meaningful
    # once the rank-th valid observation has been stored in it
    resx = np.empty_like(out)

    N, K = (<object> values).shape

    for i in range(N):
        lab = labels[i]
        if lab < 0:
            continue

        counts[lab] += 1
        for j in range(K):
            val = values[i, j]

            # not nan
            if val == val:
                nobs[lab, j] += 1
                if nobs[lab, j] == rank:
                    resx[lab, j] = val

    for i in range(len(counts)):
        for j in range(K):
            # resx[i, j] was written only if at least `rank` valid values
            # were seen; testing nobs == 0 alone would leak uninitialised
            # memory whenever 0 < nobs < rank (or rank < 1)
            if rank < 1 or nobs[i, j] < rank:
                out[i, j] = nan
            else:
                out[i, j] = resx[i, j]
377+
378+
379+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_nth_bin(ndarray[float64_t, ndim=2] out,
                  ndarray[int64_t] counts,
                  ndarray[float64_t, ndim=2] values,
                  ndarray[int64_t] bins, int64_t rank):
    '''
    Only aggregates on axis=0

    Bin-edge variant of the nth aggregation: rows of `values` are walked
    in order and row `i` moves on to the next group each time
    `i >= bins[b]`.  For every (group, column) pair the `rank`-th non-NaN
    value (1-based) is written to `out`; NaN is written when fewer than
    `rank` valid observations exist.

    Parameters
    ----------
    out : float64 2-D array, filled in place with the result
    counts : int64 1-D array, incremented in place once per row
    values : float64 2-D array of observations
    bins : int64 1-D array of bin edges (row positions); if the last edge
        differs from len(values) one extra trailing group is used
    rank : 1-based position of the valid observation to select
    '''
    cdef:
        Py_ssize_t i, j, N, K, ngroups, b
        float64_t val
        ndarray[float64_t, ndim=2] resx
        ndarray[int64_t, ndim=2] nobs

    # integer observation counter, matching the label-based variant
    nobs = np.zeros((<object> out).shape, dtype=np.int64)
    # resx is left uninitialised; a cell is only meaningful once the
    # rank-th valid observation has been stored in it
    resx = np.empty_like(out)

    if bins[len(bins) - 1] == len(values):
        ngroups = len(bins)
    else:
        ngroups = len(bins) + 1

    N, K = (<object> values).shape

    b = 0
    for i in range(N):
        while b < ngroups - 1 and i >= bins[b]:
            b += 1

        counts[b] += 1
        for j in range(K):
            val = values[i, j]

            # not nan
            if val == val:
                nobs[b, j] += 1
                if nobs[b, j] == rank:
                    resx[b, j] = val

    for i in range(ngroups):
        for j in range(K):
            # resx[i, j] was written only if at least `rank` valid values
            # were seen; testing nobs == 0 alone would leak uninitialised
            # memory whenever 0 < nobs < rank (or rank < 1)
            if rank < 1 or nobs[i, j] < rank:
                out[i, j] = nan
            else:
                out[i, j] = resx[i, j]
424+
425+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_last(ndarray[float64_t, ndim=2] out,
               ndarray[int64_t] counts,
               ndarray[float64_t, ndim=2] values,
               ndarray[int64_t] labels):
    '''
    Only aggregates on axis=0

    For every (group, column) pair, write to `out` the last non-NaN value
    met while scanning `values` row by row, or NaN when the pair has no
    valid observation.  `counts` is incremented in place once per row
    whose label is non-negative; rows with a negative label are skipped.
    '''
    cdef:
        Py_ssize_t row, col, nrows, ncols, ngroups, grp
        float64_t val
        ndarray[float64_t, ndim=2] latest
        ndarray[int64_t, ndim=2] seen

    latest = np.empty_like(out)
    seen = np.zeros((<object> out).shape, dtype=np.int64)

    nrows, ncols = (<object> values).shape

    for row in range(nrows):
        grp = labels[row]
        if grp >= 0:
            counts[grp] += 1
            for col in range(ncols):
                val = values[row, col]

                # NaN is the only value that differs from itself
                if val != val:
                    continue

                # later valid observations overwrite earlier ones
                seen[grp, col] += 1
                latest[grp, col] = val

    ngroups = len(counts)
    for grp in range(ngroups):
        for col in range(ncols):
            if seen[grp, col] != 0:
                out[grp, col] = latest[grp, col]
            else:
                out[grp, col] = nan
465+
466+
467+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_last_bin(ndarray[float64_t, ndim=2] out,
                   ndarray[int64_t] counts,
                   ndarray[float64_t, ndim=2] values,
                   ndarray[int64_t] bins):
    '''
    Only aggregates on axis=0

    Bin-edge variant of the last aggregation: rows of `values` are walked
    in order and row `i` moves on to the next group each time
    `i >= bins[b]`.  For every (group, column) pair the last non-NaN
    value is written to `out`, or NaN when the pair has no valid
    observation.  `counts` is incremented in place once per row.

    If the last entry of `bins` differs from len(values), one extra
    trailing group is used.
    '''
    cdef:
        Py_ssize_t i, j, N, K, ngroups, b
        float64_t val
        ndarray[float64_t, ndim=2] resx
        ndarray[int64_t, ndim=2] nobs

    # integer observation counter, matching the label-based group_last
    nobs = np.zeros((<object> out).shape, dtype=np.int64)
    # resx is only read for cells where nobs > 0, i.e. cells that were
    # assigned at least once below
    resx = np.empty_like(out)

    if bins[len(bins) - 1] == len(values):
        ngroups = len(bins)
    else:
        ngroups = len(bins) + 1

    N, K = (<object> values).shape

    b = 0
    for i in range(N):
        while b < ngroups - 1 and i >= bins[b]:
            b += 1

        counts[b] += 1
        for j in range(K):
            val = values[i, j]

            # not nan
            if val == val:
                nobs[b, j] += 1
                resx[b, j] = val

    for i in range(ngroups):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = resx[i, j]
511+
512+
#----------------------------------------------------------------------
513+
# group_min, group_max
514+
333515

334516
@cython.boundscheck(False)
335517
@cython.wraparound(False)
@@ -787,6 +969,7 @@ def group_min_bin(ndarray[float64_t, ndim=2] out,
787969
else:
788970
out[i, j] = minx[i, j]
789971

972+
790973
@cython.boundscheck(False)
791974
@cython.wraparound(False)
792975
def group_max_bin(ndarray[float64_t, ndim=2] out,

pandas/tests/test_groupby.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -121,25 +121,23 @@ def test_basic(self):
121121
# corner cases
122122
self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
123123

124+
def test_first_last_nth(self):
124125
# tests for first / last / nth
125126
grouped = self.df.groupby('A')
126127
first = grouped.first()
127-
expected = grouped.get_group('bar')
128-
expected = expected.xs(expected.index[0])[1:]
129-
expected.name ='bar'
130-
assert_series_equal(first.xs('bar'), expected)
128+
expected = self.df.ix[[1, 0], ['C', 'D']]
129+
expected.index = ['bar', 'foo']
130+
assert_frame_equal(first, expected)
131131

132132
last = grouped.last()
133-
expected = grouped.get_group('bar')
134-
expected = expected.xs(expected.index[-1])[1:]
135-
expected.name ='bar'
136-
assert_series_equal(last.xs('bar'), expected)
133+
expected = self.df.ix[[5, 7], ['C', 'D']]
134+
expected.index = ['bar', 'foo']
135+
assert_frame_equal(last, expected)
137136

138137
nth = grouped.nth(1)
139-
expected = grouped.get_group('foo')
140-
expected = expected.xs(expected.index[1])[1:]
141-
expected.name ='foo'
142-
assert_series_equal(nth.xs('foo'), expected)
138+
expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
139+
expected.index = ['bar', 'foo']
140+
assert_frame_equal(nth, expected)
143141

144142
def test_empty_groups(self):
145143
# GH # 1048

vb_suite/groupby.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,20 @@ def f():
155155

156156
groupby_apply_dict_return = Benchmark('data.groupby(labels).apply(f)',
157157
setup, start_date=datetime(2011, 12, 15))
158+
159+
#----------------------------------------------------------------------
160+
# First / last functions
161+
162+
# Fixture shared by both benchmarks below: 10000 distinct labels, each
# repeated 10 times and then shuffled; positions 0, 1 (mod 3) of the data
# are set to NaN before grouping.
setup = common_setup + """
labels = np.arange(10000).repeat(10)
data = Series(randn(len(labels)))
data[::3] = np.nan
data[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
"""

groupby_first = Benchmark(
    'data.groupby(labels).first()',
    setup,
    start_date=datetime(2012, 5, 1))

groupby_last = Benchmark(
    'data.groupby(labels).last()',
    setup,
    start_date=datetime(2012, 5, 1))

0 commit comments

Comments
 (0)