Skip to content

Commit 2fa4ba8

Browse files
committed
ENH: implement group_prod, group_prod_bin cython aggregators, close #1018
1 parent 992b1ff commit 2fa4ba8

File tree

6 files changed

+146
-21
lines changed

6 files changed

+146
-21
lines changed

RELEASE.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ Where to get it
2525
pandas 0.8.0
2626
============
2727

28+
**New features**
29+
30+
- Add GroupBy.prod optimized aggregation function and 'prod' fast time series
31+
conversion method (#1018)
32+
2833
**API Changes**
2934

3035
- Change BDay (business day) to not normalize dates by default

pandas/core/groupby.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,17 @@ def sum(self):
309309
except Exception:
310310
return self.aggregate(lambda x: np.sum(x, axis=self.axis))
311311

312+
def prod(self):
    """
    Compute product of values, excluding missing values

    The Cython 'prod' aggregator is attempted first; if it raises, the
    result is produced by a generic aggregate that applies np.prod along
    self.axis instead.

    For multiple groupings, the result index will be a MultiIndex
    """
    try:
        return self._cython_agg_general('prod')
    except Exception:
        # Generic fallback; self.axis is looked up when the helper is called.
        def _axis_prod(x):
            return np.prod(x, axis=self.axis)

        return self.aggregate(_axis_prod)
322+
312323
def ohlc(self):
313324
"""
314325
Compute sum of values, excluding missing values
@@ -592,6 +603,7 @@ def get_group_levels(self):
592603

593604
_cython_functions = {
594605
'add' : lib.group_add,
606+
'prod' : lib.group_prod,
595607
'mean' : lib.group_mean,
596608
'var' : lib.group_var,
597609
'std' : lib.group_var
@@ -822,6 +834,7 @@ def ngroups(self):
822834

823835
_cython_functions = {
824836
'add' : lib.group_add_bin,
837+
'prod' : lib.group_prod_bin,
825838
'mean' : lib.group_mean_bin,
826839
'var' : lib.group_var_bin,
827840
'std' : lib.group_var_bin,

pandas/src/groupby.pyx

Lines changed: 109 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,61 @@ def group_add(ndarray[float64_t, ndim=2] out,
276276
else:
277277
out[i, j] = sumx[i, j]
278278

279+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_prod(ndarray[float64_t, ndim=2] out,
               ndarray[int32_t] counts,
               ndarray[float64_t, ndim=2] values,
               ndarray[int32_t] labels):
    '''
    Only aggregates on axis=0

    Multiply together, per group label and per column, the non-NaN entries
    of `values`, writing the result into `out` in place.

    Parameters
    ----------
    out : 2-D float64 array, one row per group; overwritten with the products.
        A cell is set to nan when its group/column saw no non-NaN value.
    counts : int32 array, one entry per group; incremented once for every row
        whose label is non-negative (rows holding NaN are still counted).
    values : 2-D float64 array of observations (rows) by columns.
    labels : int32 array giving the group of each row; negative labels are
        skipped entirely.

    Notes
    -----
    Bounds checking is disabled, so labels must index valid rows of `out`.
    '''
    cdef:
        Py_ssize_t i, j, N, K, lab
        float64_t val
        ndarray[float64_t, ndim=2] prodx, nobs

    # nobs counts non-NaN observations; prodx starts at the multiplicative
    # identity so the first observed value is kept as-is.
    nobs = np.zeros_like(out)
    prodx = np.ones_like(out)

    N, K = (<object> values).shape

    if K == 1:
        # Single-column fast path. Guarded by K == 1 (not "K <= 1") so that a
        # zero-column input never touches column 0 with bounds checks off.
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[lab, 0] += 1
                prodx[lab, 0] *= val
    else:
        # General path; the inner loop simply does nothing when K == 0.
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[lab, j] += 1
                    prodx[lab, j] *= val

    for i in range(len(counts)):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = prodx[i, j]
332+
333+
279334
@cython.boundscheck(False)
280335
@cython.wraparound(False)
281336
def group_mean(ndarray[float64_t, ndim=2] out,
@@ -456,26 +511,6 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
456511

457512
return bins, labels
458513

459-
#@cython.boundscheck(False)
460-
#@cython.wraparound(False)
461-
#cdef ndarray[int32_t] counts_by_bins(ndarray[int32_t] bins,
462-
# Py_ssize_t datalen):
463-
# cdef:
464-
# Py_ssize_t ngroups = len(bins)
465-
# i = 0
466-
467-
# counts = np.zeros(ngroups, dtype='i4')
468-
469-
# if ngroups > 0:
470-
# counts[0] = bins[0]
471-
# for i in range(1, ngroups):
472-
# if i == ngroups - 1:
473-
# counts[i] = datalen - bins[i-1]
474-
# else:
475-
# counts[i] = bins[i] - bins[i-1]
476-
477-
# return counts
478-
479514
# add passing bin edges, instead of labels
480515

481516
@cython.boundscheck(False)
@@ -532,6 +567,60 @@ def group_add_bin(ndarray[float64_t, ndim=2] out,
532567
else:
533568
out[i, j] = sumx[i, j]
534569

570+
@cython.boundscheck(False)
@cython.wraparound(False)
def group_prod_bin(ndarray[float64_t, ndim=2] out,
                   ndarray[int32_t] counts,
                   ndarray[float64_t, ndim=2] values,
                   ndarray[int32_t] bins):
    '''
    Only aggregates on axis=0

    Multiply together, per bin and per column, the non-NaN entries of
    `values`, writing the result into `out` in place. Rows are assigned to
    bins by position: row i belongs to the first bin b with i < bins[b], and
    rows at or past the last edge fall into a final extra bin.

    Parameters
    ----------
    out : 2-D float64 array with len(bins) + 1 rows; overwritten with the
        products. A cell is set to nan when its bin/column saw no non-NaN
        value.
    counts : int32 array with len(bins) + 1 entries; incremented once per row
        assigned to the bin (rows holding NaN are still counted).
    values : 2-D float64 array of observations (rows) by columns.
    bins : int32 array of right-open row-position edges.
        NOTE(review): edges are assumed to be non-decreasing -- confirm
        against the caller that generates them.

    Notes
    -----
    Bounds checking is disabled, so `out` and `counts` must really have
    len(bins) + 1 rows/entries.
    '''
    cdef:
        Py_ssize_t i, j, N, K, ngroups, b
        float64_t val
        ndarray[float64_t, ndim=2] prodx, nobs

    # nobs counts non-NaN observations; prodx starts at the multiplicative
    # identity so the first observed value is kept as-is.
    nobs = np.zeros_like(out)
    prodx = np.ones_like(out)

    ngroups = len(bins) + 1
    N, K = (<object> values).shape

    b = 0
    if K == 1:
        # Single-column fast path. Guarded by K == 1 (not "K <= 1") so that a
        # zero-column input never touches column 0 with bounds checks off.
        for i in range(N):
            # "while", not "if": an empty bin (repeated edge) means several
            # edges can be crossed at the same row.
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            val = values[i, 0]

            # not nan
            if val == val:
                nobs[b, 0] += 1
                prodx[b, 0] *= val
    else:
        # General path; the inner loop simply does nothing when K == 0.
        for i in range(N):
            while b < ngroups - 1 and i >= bins[b]:
                b += 1

            counts[b] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    nobs[b, j] += 1
                    prodx[b, j] *= val

    for i in range(ngroups):
        for j in range(K):
            if nobs[i, j] == 0:
                out[i, j] = nan
            else:
                out[i, j] = prodx[i, j]
623+
535624

536625
@cython.boundscheck(False)
537626
@cython.wraparound(False)

pandas/tests/test_groupby.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,7 @@ def _testit(op):
989989

990990
_testit(lambda x: x.sum())
991991
_testit(lambda x: x.mean())
992+
_testit(lambda x: x.prod())
992993

993994
def test_cython_agg_boolean(self):
994995
frame = DataFrame({'a': np.random.randint(0, 5, 50),

pandas/tests/test_timeseries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,7 @@ def test_apply(self):
976976
assert_series_equal(applied, expected)
977977

978978
def test_numpy_reduction(self):
979-
result = self.ts.convert('A', how=np.prod, closed='right')
979+
result = self.ts.convert('A', how='prod', closed='right')
980980

981981
expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
982982
expected.index = result.index

pandas/tests/test_tseries.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,23 @@ def test_group_mean_bin():
339339

340340
assert_almost_equal(out, exp)
341341

342+
def test_group_prod_bin():
    """Check that bin-based group_prod_bin agrees with label-based group_prod."""
    obj = np.random.randn(10, 1)
    expected_counts = [3, 3, 4]

    # original group_prod
    lab = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int32)
    # counts are incremented in place, so they must start at zero
    cts = np.zeros(3, dtype=np.int32)
    exp = np.zeros((3, 1), np.float64)
    lib.group_prod(exp, cts, obj, lab)

    # bin-based group_prod
    bins = np.array([3, 6], dtype=np.int32)
    out = np.zeros((3, 1), np.float64)
    counts = np.zeros(len(out), dtype=np.int32)
    lib.group_prod_bin(out, counts, obj, bins)

    assert_almost_equal(out, exp)
    # both aggregators must report the same number of rows per group
    assert cts.tolist() == expected_counts
    assert counts.tolist() == expected_counts
358+
342359
def test_group_var_bin():
343360
# original group_var
344361
obj = np.random.randn(10, 1)

0 commit comments

Comments
 (0)