diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index af11b6543a74b..b4b98ec0403a8 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -294,6 +294,7 @@ Groupby/resample/rolling
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
+- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index ac8f22263f787..553ecbc58e745 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -246,12 +246,13 @@ def group_cumsum(numeric[:, :] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, size
-        numeric val
-        numeric[:, :] accum
+        numeric val, y, t
+        numeric[:, :] accum, compensation
         int64_t lab
 
     N, K = (<object>values).shape
     accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
+    compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
 
     with nogil:
         for i in range(N):
@@ -264,7 +265,10 @@ def group_cumsum(numeric[:, :] out,
 
                 if numeric == float32_t or numeric == float64_t:
                     if val == val:
-                        accum[lab, j] += val
+                        y = val - compensation[lab, j]
+                        t = accum[lab, j] + y
+                        compensation[lab, j] = t - accum[lab, j] - y
+                        accum[lab, j] = t
                         out[i, j] = accum[lab, j]
                     else:
                         out[i, j] = NaN
@@ -272,7 +276,10 @@ def group_cumsum(numeric[:, :] out,
                             accum[lab, j] = NaN
                             break
                 else:
-                    accum[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = accum[lab, j] + y
+                    compensation[lab, j] = t - accum[lab, j] - y
+                    accum[lab, j] = t
                     out[i, j] = accum[lab, j]
 
 
@@ -637,8 +644,8 @@ def _group_mean(floating[:, :] out,
                 Py_ssize_t min_count=-1):
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        floating val, count
-        floating[:, :] sumx
+        floating val, count, y, t
+        floating[:, :] sumx, compensation
        int64_t[:, :] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)
 
@@ -649,6 +656,7 @@ def _group_mean(floating[:, :] out,
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
 
     N, K = (<object>values).shape
 
@@ -664,7 +672,10 @@ def _group_mean(floating[:, :] out,
                 # not nan
                 if val == val:
                     nobs[lab, j] += 1
-                    sumx[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = sumx[lab, j] + y
+                    compensation[lab, j] = t - sumx[lab, j] - y
+                    sumx[lab, j] = t
 
     for i in range(ncounts):
         for j in range(K):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e1c63448a2d22..5735f895e33b6 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2178,12 +2178,26 @@ def test_groupby_series_with_tuple_name():
 
 
 @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
-def test_groupby_numerical_stability_sum():
+@pytest.mark.parametrize(
+    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
+)
+def test_groupby_numerical_stability_sum_mean(func, values):
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
-    result = df.groupby("group").sum()
-    expected = DataFrame(
-        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
-    )
+    result = getattr(df.groupby("group"), func)()
+    expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
+def test_groupby_numerical_stability_cumsum():
+    # GH#38934
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = df.groupby("group").cumsum()
+    exp_data = (
+        [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
+    )
+    expected = DataFrame({"a": exp_data, "b": exp_data})
+    tm.assert_frame_equal(result, expected, check_exact=True)
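
Note on the change above: the edited loops replace the plain accum += val / sumx += val
accumulation with a Kahan (compensated) update, which carries the rounding error of each
addition forward so it is not lost when a small value is added to a very large running
total. The sketch below is a plain-Python illustration of that update using the same data
as the new tests; the helper name kahan_cumsum is made up for this note and is not a
pandas API.

    def kahan_cumsum(values):
        """Cumulative sum using the same compensated update as the Cython diff."""
        accum = 0.0
        compensation = 0.0                  # running rounding error of accum
        out = []
        for val in values:
            y = val - compensation          # re-inject the error lost on the previous step
            t = accum + y                   # low-order bits of y may be rounded away here
            compensation = t - accum - y    # zero in exact arithmetic; in floats, the rounding error
            accum = t
            out.append(accum)
        return out

    # One group's values from the tests: 1e16 + 97 - 5e15 - 5e15 should be 97.
    data = [1e16, 97.0, -5e15, -5e15]
    print(sum(data))               # 96.0 -- naive left-to-right float addition drifts
    print(kahan_cumsum(data)[-1])  # 97.0 -- the compensated sum recovers the result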