From 4e037a638689173347e62980ddb705071c5032e2 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sat, 2 Jan 2021 16:53:22 +0100
Subject: [PATCH 1/6] ENH: Use Kahan summation to calculate groupby.sum()

---
 doc/source/whatsnew/v1.3.0.rst       |  1 +
 pandas/_libs/groupby.pyx             | 17 ++++++++---------
 pandas/tests/groupby/test_groupby.py | 11 +++++++++++
 3 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 9d1b3eaebdf8b..21e67fc222cfe 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -51,6 +51,7 @@ Other enhancements
 - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
 - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
+- Improve numerical stability for :meth:`DataFrameGroupBy.sum()` and :meth:`SeriesGroupBy.sum()` through using Kahan summation (:issue:`38778`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index ffb75401013dc..ac8f22263f787 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -467,12 +467,12 @@ def _group_add(complexfloating_t[:, :] out,
                const int64_t[:] labels,
                Py_ssize_t min_count=0):
     """
-    Only aggregates on axis=0
+    Only aggregates on axis=0 using Kahan summation
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        complexfloating_t val, count
-        complexfloating_t[:, :] sumx
+        complexfloating_t val, count, t, y
+        complexfloating_t[:, :] sumx, compensation
         int64_t[:, :] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
@@ -481,6 +481,7 @@ def _group_add(complexfloating_t[:, :] out,
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
     sumx = np.zeros_like(out)
+    compensation = np.zeros_like(out)
 
     N, K = (<object>values).shape
 
@@ -497,12 +498,10 @@ def _group_add(complexfloating_t[:, :] out,
                 # not nan
                 if val == val:
                     nobs[lab, j] += 1
-                    if (complexfloating_t is complex64_t or
-                            complexfloating_t is complex128_t):
-                        # clang errors if we use += with these dtypes
-                        sumx[lab, j] = sumx[lab, j] + val
-                    else:
-                        sumx[lab, j] += val
+                    y = val - compensation[lab, j]
+                    t = sumx[lab, j] + y
+                    compensation[lab, j] = t - sumx[lab, j] - y
+                    sumx[lab, j] = t
 
         for i in range(ncounts):
             for j in range(K):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e5021b7b4dd5f..13e5f88d7a26b 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2174,3 +2174,14 @@ def test_groupby_series_with_tuple_name():
     expected = Series([2, 4], index=[1, 2], name=("a", "a"))
     expected.index.name = ("b", "b")
     tm.assert_series_equal(result, expected)
+
+
+def test_groupby_numerical_stability_sum():
+    # GH#38778
+    data = [1e16, 1e16, 99, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = df.groupby("group").sum()
+    expected = DataFrame(
+        {"a": [99.0, 98.0], "b": [99.0, 98.0]}, index=Index([1, 2], name="group")
+    )
+    tm.assert_frame_equal(result, expected)

From 769e66ffb36905749dfedd92e46ba14a6b76e4a1 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sat, 2 Jan 2021 20:13:43 +0100
Subject: [PATCH 2/6] Change test

---
 pandas/tests/groupby/test_groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 13e5f88d7a26b..a7a3088982361 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2178,10 +2178,10 @@ def test_groupby_series_with_tuple_name():
 
 def test_groupby_numerical_stability_sum():
     # GH#38778
-    data = [1e16, 1e16, 99, 98, -5e15, -5e15, -5e15, -5e15]
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
     df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
     result = df.groupby("group").sum()
     expected = DataFrame(
-        {"a": [99.0, 98.0], "b": [99.0, 98.0]}, index=Index([1, 2], name="group")
+        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
     )
     tm.assert_frame_equal(result, expected)

From c1f92ed161d56111446b8cc4f519436a9307eef2 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sat, 2 Jan 2021 20:40:27 +0100
Subject: [PATCH 3/6] Pin test to float64 dtype

---
 pandas/tests/groupby/test_groupby.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index a7a3088982361..365eca2346025 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2179,9 +2179,9 @@ def test_groupby_series_with_tuple_name():
 def test_groupby_numerical_stability_sum():
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
-    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}, dtype="float64")
     result = df.groupby("group").sum()
     expected = DataFrame(
-        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
+        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1.0, 2.0], name="group")
     )
     tm.assert_frame_equal(result, expected)

From a236445c9823cde96c7d4ba0448ac5497de443fc Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sat, 2 Jan 2021 23:01:17 +0100
Subject: [PATCH 4/6] Xfail on 32 bit

---
 pandas/tests/groupby/test_groupby.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 365eca2346025..4e085a7608e31 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from pandas.compat import IS64
 from pandas.errors import PerformanceWarning
 
 import pandas as pd
@@ -2176,12 +2177,13 @@ def test_groupby_series_with_tuple_name():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
 def test_groupby_numerical_stability_sum():
     # GH#38778
     data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
-    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}, dtype="float64")
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
     result = df.groupby("group").sum()
     expected = DataFrame(
-        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1.0, 2.0], name="group")
+        {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group")
     )
     tm.assert_frame_equal(result, expected)

From 892ece446eb5e86aaddd1906bd437ad00eb47e46 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sun, 3 Jan 2021 00:44:02 +0100
Subject: [PATCH 5/6] Remove brackets

---
 doc/source/whatsnew/v1.3.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 21e67fc222cfe..47f808d59dfe4 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -51,7 +51,7 @@ Other enhancements
 - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
 - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
-- Improve numerical stability for :meth:`DataFrameGroupBy.sum()` and :meth:`SeriesGroupBy.sum()` through using Kahan summation (:issue:`38778`)
+- Improve numerical stability for :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` through using Kahan summation (:issue:`38778`)
 
 .. ---------------------------------------------------------------------------
 

From dc91579eeea9c48baa4fca3d20d7200483dbdf2a Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Sun, 3 Jan 2021 18:59:38 +0100
Subject: [PATCH 6/6] Move whatsnew

---
 doc/source/whatsnew/v1.3.0.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 47f808d59dfe4..94a12fec6adcb 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -51,7 +51,6 @@ Other enhancements
 - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
 - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
-- Improve numerical stability for :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` through using Kahan summation (:issue:`38778`)
 
 .. ---------------------------------------------------------------------------
 
@@ -290,7 +289,7 @@ Groupby/resample/rolling
 
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
--
+- Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
 
 Reshaping
 ^^^^^^^^^