ENH: implement group_prod, group_prod_bin cython aggregators, close #1018

wesm · wesm · commit 2fa4ba87bb18 · 2012-04-14T17:36:10.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -25,6 +25,11 @@ Where to get it
 pandas 0.8.0
 ============
 
+**New features**
+
+  - Add GroupBy.prod optimized aggregation function and 'prod' fast time series
+    conversion method (#1018)
+
 **API Changes**
 
  - Change BDay (business day) to not normalize dates by default
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -309,6 +309,17 @@ def sum(self):
         except Exception:
             return self.aggregate(lambda x: np.sum(x, axis=self.axis))
 
+    def prod(self):
+        """
+        Compute product of values, excluding missing values

+        For multiple groupings, the result index will be a MultiIndex
+        """
+        try:
+            return self._cython_agg_general('prod')  # fast path; presumably resolved via _cython_functions['prod']
+        except Exception:  # any failure there: fall back to aggregate() with np.prod
+            return self.aggregate(lambda x: np.prod(x, axis=self.axis))
+
     def ohlc(self):
         """
         Compute sum of values, excluding missing values
@@ -592,6 +603,7 @@ def get_group_levels(self):
 
     _cython_functions = {
         'add' : lib.group_add,
+        'prod' : lib.group_prod,
         'mean' : lib.group_mean,
         'var' : lib.group_var,
         'std' : lib.group_var
@@ -822,6 +834,7 @@ def ngroups(self):
 
     _cython_functions = {
         'add' : lib.group_add_bin,
+        'prod' : lib.group_prod_bin,
         'mean' : lib.group_mean_bin,
         'var' : lib.group_var_bin,
         'std' : lib.group_var_bin,
diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx
@@ -276,6 +276,61 @@ def group_add(ndarray[float64_t, ndim=2] out,
             else:
                 out[i, j] = sumx[i, j]
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod(ndarray[float64_t, ndim=2] out,
+              ndarray[int32_t] counts,
+              ndarray[float64_t, ndim=2] values,
+              ndarray[int32_t] labels):
+    '''Product of the non-NaN `values` for each label, down axis=0 only.
+    Fills `out` (NaN where a group/column has no valid value) and adds one
+    to `counts[label]` per row; rows with a negative label are ignored.'''
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)    # valid (non-NaN) observations per cell
+    prodx = np.ones_like(out)    # running products, seeded with 1
+
+    N, K = (<object> values).shape
+
+    if K > 1:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # NaN != NaN, so this holds only for non-missing values
+                if val == val:
+                    nobs[lab, j] += 1
+                    prodx[lab, j] *= val
+    else:  # single column: same update without the inner loop
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            val = values[i, 0]
+
+            # NaN != NaN, so this holds only for non-missing values
+            if val == val:
+                nobs[lab, 0] += 1
+                prodx[lab, 0] *= val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:  # nothing valid seen: NaN, not the seed 1
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_mean(ndarray[float64_t, ndim=2] out,
@@ -456,26 +511,6 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
 
     return bins, labels
 
-#@cython.boundscheck(False)
-#@cython.wraparound(False)
-#cdef ndarray[int32_t] counts_by_bins(ndarray[int32_t] bins,
-#                                     Py_ssize_t datalen):
-#    cdef:
-#        Py_ssize_t ngroups = len(bins)
-#        i = 0
-
-#    counts = np.zeros(ngroups, dtype='i4')
-
-#    if ngroups > 0:
-#        counts[0] = bins[0]
-#        for i in range(1, ngroups):
-#            if i == ngroups - 1:
-#                counts[i] = datalen - bins[i-1]
-#            else:
-#                counts[i] = bins[i] - bins[i-1]
-
-#    return counts
-
 # add passing bin edges, instead of labels
 
 @cython.boundscheck(False)
@@ -532,6 +567,60 @@ def group_add_bin(ndarray[float64_t, ndim=2] out,
             else:
                 out[i, j] = sumx[i, j]
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_prod_bin(ndarray[float64_t, ndim=2] out,
+                  ndarray[int32_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int32_t] bins):
+    '''Product of the non-NaN `values` in each bin, down axis=0 only.
+    Rows i >= bins[k] fall past bin k, giving len(bins) + 1 bins in total.
+    Fills `out` (NaN where nothing valid) and adds one to `counts` per row.'''
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, b
+        float64_t val
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    nobs = np.zeros_like(out)    # valid (non-NaN) observations per cell
+    prodx = np.ones_like(out)    # running products, seeded with 1
+
+    ngroups = len(bins) + 1
+    N, K = (<object> values).shape
+
+    b = 0
+    if K > 1:
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:  # `while`: step over empty bins too
+                b += 1
+
+            counts[b] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # NaN != NaN, so this holds only for non-missing values
+                if val == val:
+                    nobs[b, j] += 1
+                    prodx[b, j] *= val
+    else:  # single column: same update without the inner loop
+        for i in range(N):
+            while b < ngroups - 1 and i >= bins[b]:  # `while`: step over empty bins too
+                b += 1
+
+            counts[b] += 1
+            val = values[i, 0]
+
+            # NaN != NaN, so this holds only for non-missing values
+            if val == val:
+                nobs[b, 0] += 1
+                prodx[b, 0] *= val
+
+    for i in range(ngroups):
+        for j in range(K):
+            if nobs[i, j] == 0:  # nothing valid seen: NaN, not the seed 1
+                out[i, j] = nan
+            else:
+                out[i, j] = prodx[i, j]
+
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -989,6 +989,7 @@ def _testit(op):
 
         _testit(lambda x: x.sum())
         _testit(lambda x: x.mean())
+        _testit(lambda x: x.prod())
 
     def test_cython_agg_boolean(self):
         frame = DataFrame({'a': np.random.randint(0, 5, 50),
diff --git a/pandas/tests/test_timeseries.py b/pandas/tests/test_timeseries.py
@@ -976,7 +976,7 @@ def test_apply(self):
         assert_series_equal(applied, expected)
 
     def test_numpy_reduction(self):
-        result = self.ts.convert('A', how=np.prod, closed='right')
+        result = self.ts.convert('A', how='prod', closed='right')
 
         expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
         expected.index = result.index
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
@@ -339,6 +339,23 @@ def test_group_mean_bin():
 
     assert_almost_equal(out, exp)
 
+def test_group_prod_bin():
+    # reference result: label-based group_prod
+    obj = np.random.randn(10, 1)
+
+    lab = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int32)
+    cts = np.array([3, 3, 4], dtype=np.int32)  # counts buffer; not asserted on
+    exp = np.zeros((3, 1), np.float64)
+    lib.group_prod(exp, cts, obj, lab)
+
+    # bin-based group_prod: edges [3, 6] describe the same three groups
+    bins = np.array([3, 6], dtype=np.int32)
+    out  = np.zeros((3, 1), np.float64)
+    counts = np.zeros(len(out), dtype=np.int32)
+    lib.group_prod_bin(out, counts, obj, bins)
+
+    assert_almost_equal(out, exp)
+
 def test_group_var_bin():
     # original group_var
     obj = np.random.randn(10, 1)