pandas-dev · jreback · Jan 4, 2021 · Jan 4, 2021 · Jan 4, 2021 · Jan 4, 2021
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -294,6 +294,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
+- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`)
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -246,12 +246,13 @@ def group_cumsum(numeric[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        numeric val
-        numeric[:, :] accum
+        numeric val, y, t
+        numeric[:, :] accum, compensation
         int64_t lab
 
     N, K = (<object>values).shape
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
+    compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
     with nogil:
         for i in range(N):
@@ -264,15 +265,21 @@ def group_cumsum(numeric[:, :] out,
 
                 if numeric == float32_t or numeric == float64_t:
                     if val == val:
-                        accum[lab, j] += val
+                        y = val - compensation[lab, j]
+                        t = accum[lab, j] + y
+                        compensation[lab, j] = t - accum[lab, j] - y
+                        accum[lab, j] = t
                         out[i, j] = accum[lab, j]
                     else:
                         out[i, j] = NaN
                         if not skipna:
                             accum[lab, j] = NaN
                             break
                 else:
-                    accum[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = accum[lab, j] + y
+                    compensation[lab, j] = t - accum[lab, j] - y
+                    accum[lab, j] = t
                     out[i, j] = accum[lab, j]
 
 
@@ -637,8 +644,8 @@ def _group_mean(floating[:, :] out,
                 Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        floating val, count
-        floating[:, :] sumx
+        floating val, count, y, t
+        floating[:, :] sumx, compensation
         int64_t[:, :] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
@@ -649,6 +656,7 @@ def _group_mean(floating[:, :] out,
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
 
     N, K = (<object>values).shape
 
@@ -664,7 +672,10 @@ def _group_mean(floating[:, :] out,
                 # not nan
                 if val == val:
                     nobs[lab, j] += 1
-                    sumx[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = sumx[lab, j] + y
+                    compensation[lab, j] = t - sumx[lab, j] - y
+                    sumx[lab, j] = t
 
         for i in range(ncounts):
             for j in range(K):

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2178,12 +2178,26 @@ def test_groupby_series_with_tuple_name():
 
 
 @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
-def test_groupby_numerical_stability_sum():
+@pytest.mark.parametrize(
+    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
+)
+def test_groupby_numerical_stability_sum_mean(func, values):
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
-    result = df.groupby("group").sum()
-    expected = DataFrame(
-        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
-    )
+    result = getattr(df.groupby("group"), func)()
+    expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
+def test_groupby_numerical_stability_cumsum():
+    # GH#38934
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = df.groupby("group").cumsum()
+    exp_data = (
+        [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
+    )
+    expected = DataFrame({"a": exp_data, "b": exp_data})
+    tm.assert_frame_equal(result, expected, check_exact=True)