From 4e037a638689173347e62980ddb705071c5032e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 2 Jan 2021 16:53:22 +0100 Subject: [PATCH 1/6] ENH: Use Kahan summation to calculate groupby.sum() --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/groupby.pyx | 17 ++++++++--------- pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9d1b3eaebdf8b..21e67fc222cfe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -51,6 +51,7 @@ Other enhancements - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) +- Improve numerical stability for :meth:`DataFrameGroupBy.sum()` and :meth:`SeriesGroupBy.sum()` through using Kahan summation (:issue:`38778`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ffb75401013dc..ac8f22263f787 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -467,12 +467,12 @@ def _group_add(complexfloating_t[:, :] out, const int64_t[:] labels, Py_ssize_t min_count=0): """ - Only aggregates on axis=0 + Only aggregates on axis=0 using Kahan summation """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - complexfloating_t val, count - complexfloating_t[:, :] sumx + complexfloating_t val, count, t, y + complexfloating_t[:, :] sumx, compensation int64_t[:, :] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -481,6 +481,7 @@ def _group_add(complexfloating_t[:, :] out, nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros_like(out) + compensation = np.zeros_like(out) N, K = (values).shape @@ -497,12 +498,10 @@ def _group_add(complexfloating_t[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - if (complexfloating_t is complex64_t or - complexfloating_t is complex128_t): - # clang errors if we use += with these dtypes - sumx[lab, j] = sumx[lab, j] + val - else: - sumx[lab, j] += val + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + sumx[lab, j] = t for i in range(ncounts): for j in range(K): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e5021b7b4dd5f..13e5f88d7a26b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2174,3 +2174,14 @@ def test_groupby_series_with_tuple_name(): expected = Series([2, 4], index=[1, 2], name=("a", "a")) expected.index.name = ("b", "b") tm.assert_series_equal(result, expected) + + +def test_groupby_numerical_stability_sum(): + # GH#38778 + data = [1e16, 1e16, 99, 98, -5e15, -5e15, -5e15, -5e15] + df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) + result = df.groupby("group").sum() + expected = DataFrame( + {"a": [99.0, 98.0], "b": [99.0, 98.0]}, index=Index([1, 2], name="group") + ) + tm.assert_frame_equal(result, expected) From 769e66ffb36905749dfedd92e46ba14a6b76e4a1 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 2 Jan 2021 20:13:43 +0100 Subject: [PATCH 2/6] Change test --- pandas/tests/groupby/test_groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 13e5f88d7a26b..a7a3088982361 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2178,10 +2178,10 @@ def test_groupby_series_with_tuple_name(): def test_groupby_numerical_stability_sum(): # GH#38778 - data = [1e16, 1e16, 99, 98, -5e15, -5e15, -5e15, -5e15] + data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) result = df.groupby("group").sum() expected = DataFrame( - {"a": [99.0, 98.0], "b": [99.0, 98.0]}, index=Index([1, 2], name="group") + {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group") ) tm.assert_frame_equal(result, expected) From c1f92ed161d56111446b8cc4f519436a9307eef2 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 2 Jan 2021 20:40:27 +0100 Subject: [PATCH 3/6] Pin test to float64 dtype --- pandas/tests/groupby/test_groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a7a3088982361..365eca2346025 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2179,9 +2179,9 @@ def test_groupby_series_with_tuple_name(): def test_groupby_numerical_stability_sum(): # GH#38778 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] - df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) + df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}, dtype="float64") result = df.groupby("group").sum() expected = DataFrame( - {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group") + {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1.0, 2.0], name="group") ) tm.assert_frame_equal(result, expected) From a236445c9823cde96c7d4ba0448ac5497de443fc Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 2 Jan 2021 23:01:17 +0100 Subject: [PATCH 4/6] Xfail on 32 bit --- pandas/tests/groupby/test_groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 365eca2346025..4e085a7608e31 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import PerformanceWarning import pandas as pd @@ -2176,12 +2177,13 @@ def test_groupby_series_with_tuple_name(): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system") def test_groupby_numerical_stability_sum(): # GH#38778 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] - df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}, dtype="float64") + df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) result = df.groupby("group").sum() expected = DataFrame( - {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1.0, 2.0], name="group") + {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group") ) tm.assert_frame_equal(result, expected) From 892ece446eb5e86aaddd1906bd437ad00eb47e46 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 3 Jan 2021 00:44:02 +0100 Subject: [PATCH 5/6] Remove brackets --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 21e67fc222cfe..47f808d59dfe4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -51,7 +51,7 @@ Other enhancements - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) -- Improve numerical stability for :meth:`DataFrameGroupBy.sum()` and :meth:`SeriesGroupBy.sum()` through using Kahan summation (:issue:`38778`) +- Improve numerical stability for :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` through using Kahan summation (:issue:`38778`) .. --------------------------------------------------------------------------- From dc91579eeea9c48baa4fca3d20d7200483dbdf2a Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 3 Jan 2021 18:59:38 +0100 Subject: [PATCH 6/6] Move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 47f808d59dfe4..94a12fec6adcb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -51,7 +51,6 @@ Other enhancements - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) -- Improve numerical stability for :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` through using Kahan summation (:issue:`38778`) .. --------------------------------------------------------------------------- @@ -290,7 +289,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) -- +- Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) Reshaping ^^^^^^^^^