diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9d1b3eaebdf8b..94a12fec6adcb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -289,7 +289,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) -- +- Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision; sums are now computed using Kahan summation (:issue:`38778`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ffb75401013dc..ac8f22263f787 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -467,12 +467,12 @@ def _group_add(complexfloating_t[:, :] out, const int64_t[:] labels, Py_ssize_t min_count=0): """ - Only aggregates on axis=0 + Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - complexfloating_t val, count - complexfloating_t[:, :] sumx + complexfloating_t val, count, t, y + complexfloating_t[:, :] sumx, compensation int64_t[:, :] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -481,6 +481,7 @@ def _group_add(complexfloating_t[:, :] out, nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros_like(out) + compensation = np.zeros_like(out) N, K = (values).shape @@ -497,12 +498,10 @@ def _group_add(complexfloating_t[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - if (complexfloating_t is complex64_t or - complexfloating_t is complex128_t): - # clang errors if we use += with these dtypes - sumx[lab, j] = sumx[lab, j] + val - else: - sumx[lab, j] += val + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + sumx[lab, j] = t for i in range(ncounts): for j in range(K): diff 
--git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e5021b7b4dd5f..4e085a7608e31 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import PerformanceWarning import pandas as pd @@ -2174,3 +2175,15 @@ def test_groupby_series_with_tuple_name(): expected = Series([2, 4], index=[1, 2], name=("a", "a")) expected.index.name = ("b", "b") tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system") +def test_groupby_numerical_stability_sum(): + # GH#38778 + data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] + df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) + result = df.groupby("group").sum() + expected = DataFrame( + {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group") + ) + tm.assert_frame_equal(result, expected)