Merge pull request #10472 from jvkersch/fix/var-welford-algorithm

jreback · jreback · commit bbec57d6f881 · 2015-07-08T09:19:41.000-05:00
ENH: Make group_var_ use Welford's algorithm.
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -135,8 +135,11 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
 
 - Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
+
 - Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
 
 - Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']``  (:issue:`10413`, :issue:`10467`)
 
 - Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
+
+- Bug in ``groupby.var`` which caused variance to be inaccurate for small float values (:issue:`10448`)
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -1147,67 +1147,52 @@ def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
 
 group_var_template = """@cython.wraparound(False)
 @cython.boundscheck(False)
+@cython.cdivision(True)
 def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
               ndarray[int64_t] counts,
               ndarray[%(dest_type2)s, ndim=2] values,
               ndarray[int64_t] labels):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, ct
-        ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx
+        %(dest_type2)s val, ct, oldmean
+        ndarray[%(dest_type2)s, ndim=2] nobs, mean
 
     if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-    sumxx = np.zeros_like(out)
+    mean = np.zeros_like(out)
 
     N, K = (<object> values).shape
 
-    with nogil:
-        if K > 1:
-            for i in range(N):
-
-                lab = labels[i]
-                if lab < 0:
-                    continue
+    out[:, :] = 0.0
 
-                counts[lab] += 1
-
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-                        sumxx[lab, j] += val * val
-        else:
-            for i in range(N):
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
 
-                lab = labels[i]
-                if lab < 0:
-                    continue
+            counts[lab] += 1
 
-                counts[lab] += 1
-                val = values[i, 0]
+            for j in range(K):
+                val = values[i, j]
 
                 # not nan
                 if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-                    sumxx[lab, 0] += val * val
-
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
 
         for i in range(ncounts):
             for j in range(K):
                 ct = nobs[i, j]
                 if ct < 2:
                     out[i, j] = NAN
                 else:
-                    out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
-                                 (ct * ct - ct))
+                    out[i, j] /= (ct - 1)
+
 """
 
 group_var_bin_template = """@cython.wraparound(False)
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
@@ -7232,131 +7232,101 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
+@cython.cdivision(True)
 def group_var_float64(ndarray[float64_t, ndim=2] out,
               ndarray[int64_t] counts,
               ndarray[float64_t, ndim=2] values,
               ndarray[int64_t] labels):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, ct
-        ndarray[float64_t, ndim=2] nobs, sumx, sumxx
+        float64_t val, ct, oldmean
+        ndarray[float64_t, ndim=2] nobs, mean
 
     if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-    sumxx = np.zeros_like(out)
+    mean = np.zeros_like(out)
 
     N, K = (<object> values).shape
 
-    with nogil:
-        if K > 1:
-            for i in range(N):
-
-                lab = labels[i]
-                if lab < 0:
-                    continue
+    out[:, :] = 0.0
 
-                counts[lab] += 1
-
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-                        sumxx[lab, j] += val * val
-        else:
-            for i in range(N):
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
 
-                lab = labels[i]
-                if lab < 0:
-                    continue
+            counts[lab] += 1
 
-                counts[lab] += 1
-                val = values[i, 0]
+            for j in range(K):
+                val = values[i, j]
 
                 # not nan
                 if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-                    sumxx[lab, 0] += val * val
-
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
 
         for i in range(ncounts):
             for j in range(K):
                 ct = nobs[i, j]
                 if ct < 2:
                     out[i, j] = NAN
                 else:
-                    out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
-                                 (ct * ct - ct))
+                    out[i, j] /= (ct - 1)
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
+@cython.cdivision(True)
 def group_var_float32(ndarray[float32_t, ndim=2] out,
               ndarray[int64_t] counts,
               ndarray[float32_t, ndim=2] values,
               ndarray[int64_t] labels):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, ct
-        ndarray[float32_t, ndim=2] nobs, sumx, sumxx
+        float32_t val, ct, oldmean
+        ndarray[float32_t, ndim=2] nobs, mean
 
     if not len(values) == len(labels):
        raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-    sumxx = np.zeros_like(out)
+    mean = np.zeros_like(out)
 
     N, K = (<object> values).shape
 
-    with nogil:
-        if K > 1:
-            for i in range(N):
-
-                lab = labels[i]
-                if lab < 0:
-                    continue
+    out[:, :] = 0.0
 
-                counts[lab] += 1
-
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-                        sumxx[lab, j] += val * val
-        else:
-            for i in range(N):
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
 
-                lab = labels[i]
-                if lab < 0:
-                    continue
+            counts[lab] += 1
 
-                counts[lab] += 1
-                val = values[i, 0]
+            for j in range(K):
+                val = values[i, j]
 
                 # not nan
                 if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-                    sumxx[lab, 0] += val * val
-
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
 
         for i in range(ncounts):
             for j in range(K):
                 ct = nobs[i, j]
                 if ct < 2:
                     out[i, j] = NAN
                 else:
-                    out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) /
-                                 (ct * ct - ct))
+                    out[i, j] /= (ct - 1)
+
 
 
 @cython.wraparound(False)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py