Skip to content

Commit 5cc24c2

Browse files
authored
ENH: Use Kahan summation to calculate groupby.sum() (#38903)
1 parent 87542f0 commit 5cc24c2

File tree

3 files changed

+22
-10
lines changed

3 files changed

+22
-10
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ Groupby/resample/rolling
291291

292292
- Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
293293
- Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
294-
-
294+
- Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision; the sums are now computed using Kahan summation (:issue:`38778`)
295295

296296
Reshaping
297297
^^^^^^^^^

pandas/_libs/groupby.pyx

+8-9
Original file line numberDiff line numberDiff line change
@@ -467,12 +467,12 @@ def _group_add(complexfloating_t[:, :] out,
467467
const int64_t[:] labels,
468468
Py_ssize_t min_count=0):
469469
"""
470-
Only aggregates on axis=0
470+
Only aggregates on axis=0 using Kahan summation
471471
"""
472472
cdef:
473473
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
474-
complexfloating_t val, count
475-
complexfloating_t[:, :] sumx
474+
complexfloating_t val, count, t, y
475+
complexfloating_t[:, :] sumx, compensation
476476
int64_t[:, :] nobs
477477
Py_ssize_t len_values = len(values), len_labels = len(labels)
478478

@@ -481,6 +481,7 @@ def _group_add(complexfloating_t[:, :] out,
481481

482482
nobs = np.zeros((<object>out).shape, dtype=np.int64)
483483
sumx = np.zeros_like(out)
484+
compensation = np.zeros_like(out)
484485

485486
N, K = (<object>values).shape
486487

@@ -497,12 +498,10 @@ def _group_add(complexfloating_t[:, :] out,
497498
# not nan
498499
if val == val:
499500
nobs[lab, j] += 1
500-
if (complexfloating_t is complex64_t or
501-
complexfloating_t is complex128_t):
502-
# clang errors if we use += with these dtypes
503-
sumx[lab, j] = sumx[lab, j] + val
504-
else:
505-
sumx[lab, j] += val
501+
y = val - compensation[lab, j]
502+
t = sumx[lab, j] + y
503+
compensation[lab, j] = t - sumx[lab, j] - y
504+
sumx[lab, j] = t
506505

507506
for i in range(ncounts):
508507
for j in range(K):

pandas/tests/groupby/test_groupby.py

+13
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
import pytest
77

8+
from pandas.compat import IS64
89
from pandas.errors import PerformanceWarning
910

1011
import pandas as pd
@@ -2174,3 +2175,15 @@ def test_groupby_series_with_tuple_name():
21742175
expected = Series([2, 4], index=[1, 2], name=("a", "a"))
21752176
expected.index.name = ("b", "b")
21762177
tm.assert_series_equal(result, expected)
2178+
2179+
2180+
@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
2181+
def test_groupby_numerical_stability_sum():
2182+
# GH#38778
2183+
data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
2184+
df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
2185+
result = df.groupby("group").sum()
2186+
expected = DataFrame(
2187+
{"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
2188+
)
2189+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)