Skip to content

Commit c2c5eba

Browse files
authored
ENH: Improve numerical stability for groupby.mean and groupby.cumsum (#38934)
1 parent 8b2ebdb commit c2c5eba

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ Groupby/resample/rolling
294294
- Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
295295
- Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
296296
- Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
297+
- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`)
297298

298299
Reshaping
299300
^^^^^^^^^

pandas/_libs/groupby.pyx

+18-7
Original file line numberDiff line numberDiff line change
@@ -246,12 +246,13 @@ def group_cumsum(numeric[:, :] out,
246246
"""
247247
cdef:
248248
Py_ssize_t i, j, N, K, size
249-
numeric val
250-
numeric[:, :] accum
249+
numeric val, y, t
250+
numeric[:, :] accum, compensation
251251
int64_t lab
252252

253253
N, K = (<object>values).shape
254254
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
255+
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
255256

256257
with nogil:
257258
for i in range(N):
@@ -264,15 +265,21 @@ def group_cumsum(numeric[:, :] out,
264265

265266
if numeric == float32_t or numeric == float64_t:
266267
if val == val:
267-
accum[lab, j] += val
268+
y = val - compensation[lab, j]
269+
t = accum[lab, j] + y
270+
compensation[lab, j] = t - accum[lab, j] - y
271+
accum[lab, j] = t
268272
out[i, j] = accum[lab, j]
269273
else:
270274
out[i, j] = NaN
271275
if not skipna:
272276
accum[lab, j] = NaN
273277
break
274278
else:
275-
accum[lab, j] += val
279+
y = val - compensation[lab, j]
280+
t = accum[lab, j] + y
281+
compensation[lab, j] = t - accum[lab, j] - y
282+
accum[lab, j] = t
276283
out[i, j] = accum[lab, j]
277284

278285

@@ -637,8 +644,8 @@ def _group_mean(floating[:, :] out,
637644
Py_ssize_t min_count=-1):
638645
cdef:
639646
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
640-
floating val, count
641-
floating[:, :] sumx
647+
floating val, count, y, t
648+
floating[:, :] sumx, compensation
642649
int64_t[:, :] nobs
643650
Py_ssize_t len_values = len(values), len_labels = len(labels)
644651

@@ -649,6 +656,7 @@ def _group_mean(floating[:, :] out,
649656

650657
nobs = np.zeros((<object>out).shape, dtype=np.int64)
651658
sumx = np.zeros_like(out)
659+
compensation = np.zeros_like(out)
652660

653661
N, K = (<object>values).shape
654662

@@ -664,7 +672,10 @@ def _group_mean(floating[:, :] out,
664672
# not nan
665673
if val == val:
666674
nobs[lab, j] += 1
667-
sumx[lab, j] += val
675+
y = val - compensation[lab, j]
676+
t = sumx[lab, j] + y
677+
compensation[lab, j] = t - sumx[lab, j] - y
678+
sumx[lab, j] = t
668679

669680
for i in range(ncounts):
670681
for j in range(K):

pandas/tests/groupby/test_groupby.py

+19-5
Original file line numberDiff line numberDiff line change
@@ -2178,12 +2178,26 @@ def test_groupby_series_with_tuple_name():
21782178

21792179

21802180
@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
2181-
def test_groupby_numerical_stability_sum():
2181+
@pytest.mark.parametrize(
2182+
"func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
2183+
)
2184+
def test_groupby_numerical_stability_sum_mean(func, values):
21822185
# GH#38778
21832186
data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
21842187
df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
2185-
result = df.groupby("group").sum()
2186-
expected = DataFrame(
2187-
{"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
2188-
)
2188+
result = getattr(df.groupby("group"), func)()
2189+
expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
21892190
tm.assert_frame_equal(result, expected)
2191+
2192+
2193+
@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
2194+
def test_groupby_numerical_stability_cumsum():
2195+
# GH#38934
2196+
data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
2197+
df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
2198+
result = df.groupby("group").cumsum()
2199+
exp_data = (
2200+
[1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
2201+
)
2202+
expected = DataFrame({"a": exp_data, "b": exp_data})
2203+
tm.assert_frame_equal(result, expected, check_exact=True)

0 commit comments

Comments
 (0)