From e85dc45aa4d48e5077c778b6d9934bcc135b70bd Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 04:09:08 +0100
Subject: [PATCH 1/5] Fix groupby stability issues

---
 pandas/_libs/groupby.pyx             | 25 ++++++++++++++++++-------
 pandas/tests/groupby/test_groupby.py | 17 ++++++++++++++---
 2 files changed, 32 insertions(+), 10 deletions(-)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index ac8f22263f787..553ecbc58e745 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -246,12 +246,13 @@ def group_cumsum(numeric[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        numeric val
-        numeric[:, :] accum
+        numeric val, y, t
+        numeric[:, :] accum, compensation
         int64_t lab
 
     N, K = (<object>values).shape
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
+    compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
     with nogil:
         for i in range(N):
@@ -264,7 +265,10 @@ def group_cumsum(numeric[:, :] out,
 
                 if numeric == float32_t or numeric == float64_t:
                     if val == val:
-                        accum[lab, j] += val
+                        y = val - compensation[lab, j]
+                        t = accum[lab, j] + y
+                        compensation[lab, j] = t - accum[lab, j] - y
+                        accum[lab, j] = t
                         out[i, j] = accum[lab, j]
                     else:
                         out[i, j] = NaN
@@ -272,7 +276,10 @@ def group_cumsum(numeric[:, :] out,
                             accum[lab, j] = NaN
                             break
                 else:
-                    accum[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = accum[lab, j] + y
+                    compensation[lab, j] = t - accum[lab, j] - y
+                    accum[lab, j] = t
                     out[i, j] = accum[lab, j]
 
 
@@ -637,8 +644,8 @@ def _group_mean(floating[:, :] out,
                 Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        floating val, count
-        floating[:, :] sumx
+        floating val, count, y, t
+        floating[:, :] sumx, compensation
         int64_t[:, :] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
@@ -649,6 +656,7 @@ def _group_mean(floating[:, :] out,
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
 
     N, K = (<object>values).shape
 
@@ -664,7 +672,10 @@ def _group_mean(floating[:, :] out,
                 # not nan
                 if val == val:
                     nobs[lab, j] += 1
-                    sumx[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = sumx[lab, j] + y
+                    compensation[lab, j] = t - sumx[lab, j] - y
+                    sumx[lab, j] = t
 
         for i in range(ncounts):
             for j in range(K):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e1c63448a2d22..1f9e6f90fe387 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2178,12 +2178,23 @@ def test_groupby_series_with_tuple_name():
 
 
 @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
-def test_groupby_numerical_stability_sum():
+@pytest.mark.parametrize("func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])])
+def test_groupby_numerical_stability_sum_mean(func, values):
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
-    result = df.groupby("group").sum()
+    result = getattr(df.groupby("group"), func)()
     expected = DataFrame(
-        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
+        {"a": values, "b": values}, index=Index([1, 2], name="group")
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_numerical_stability_cumsum():
+    # GH#38778
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = df.groupby("group").cumsum()
+    exp_data = [1e16] * 4 + [5e15] * 2 + [97.0, 98.0]
+    expected = DataFrame({"a": exp_data, "b": exp_data})
+    tm.assert_frame_equal(result, expected)

From 966cb119c7648b77b1b13875137ee901f0a06e9d Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 04:32:25 +0100
Subject: [PATCH 2/5] Add whatsnew

---
 doc/source/whatsnew/v1.3.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index af11b6543a74b..d69cf3288fc6e 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -294,6 +294,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
+- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`x`)
 
 Reshaping
 ^^^^^^^^^

From 03e8d7ee68ef9addf4cefeea10238d955ee03e69 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 04:34:14 +0100
Subject: [PATCH 3/5] Run Black

---
 pandas/tests/groupby/test_groupby.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 1f9e6f90fe387..86107dfdc9c4a 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2178,15 +2178,15 @@ def test_groupby_series_with_tuple_name():
 
 
 @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
-@pytest.mark.parametrize("func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])])
+@pytest.mark.parametrize(
+    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
+)
 def test_groupby_numerical_stability_sum_mean(func, values):
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
     result = getattr(df.groupby("group"), func)()
-    expected = DataFrame(
-        {"a": values, "b": values}, index=Index([1, 2], name="group")
-    )
+    expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
     tm.assert_frame_equal(result, expected)
 
 

From a4f5af54c618d05b260f254dc14d6d3ef9552c28 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 04:35:21 +0100
Subject: [PATCH 4/5] Adjust gh pr reference

---
 doc/source/whatsnew/v1.3.0.rst       | 2 +-
 pandas/tests/groupby/test_groupby.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index d69cf3288fc6e..b4b98ec0403a8 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -294,7 +294,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
-- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`x`)
+- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 86107dfdc9c4a..81cb6444bb647 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2191,7 +2191,7 @@ def test_groupby_numerical_stability_sum_mean(func, values):
 
 
 def test_groupby_numerical_stability_cumsum():
-    # GH#38778
+    # GH#38934
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
     result = df.groupby("group").cumsum()

From 0ac16f7edb6493d32cff70fe40ffff74a913de16 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Mon, 4 Jan 2021 13:20:54 +0100
Subject: [PATCH 5/5] Add exact check

---
 pandas/tests/groupby/test_groupby.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 81cb6444bb647..5735f895e33b6 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2190,11 +2190,14 @@ def test_groupby_numerical_stability_sum_mean(func, values):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
 def test_groupby_numerical_stability_cumsum():
     # GH#38934
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
     result = df.groupby("group").cumsum()
-    exp_data = [1e16] * 4 + [5e15] * 2 + [97.0, 98.0]
+    exp_data = (
+        [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
+    )
     expected = DataFrame({"a": exp_data, "b": exp_data})
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_exact=True)