From e85dc45aa4d48e5077c778b6d9934bcc135b70bd Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 04:09:08 +0100 Subject: [PATCH 1/5] Fix groupby stability issues --- pandas/_libs/groupby.pyx | 25 ++++++++++++++++++------- pandas/tests/groupby/test_groupby.py | 17 ++++++++++++++--- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ac8f22263f787..553ecbc58e745 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -246,12 +246,13 @@ def group_cumsum(numeric[:, :] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val - numeric[:, :] accum + numeric val, y, t + numeric[:, :] accum, compensation int64_t lab N, K = (values).shape accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) + compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) with nogil: for i in range(N): @@ -264,7 +265,10 @@ def group_cumsum(numeric[:, :] out, if numeric == float32_t or numeric == float64_t: if val == val: - accum[lab, j] += val + y = val - compensation[lab, j] + t = accum[lab, j] + y + compensation[lab, j] = t - accum[lab, j] - y + accum[lab, j] = t out[i, j] = accum[lab, j] else: out[i, j] = NaN @@ -272,7 +276,10 @@ def group_cumsum(numeric[:, :] out, accum[lab, j] = NaN break else: - accum[lab, j] += val + y = val - compensation[lab, j] + t = accum[lab, j] + y + compensation[lab, j] = t - accum[lab, j] - y + accum[lab, j] = t out[i, j] = accum[lab, j] @@ -637,8 +644,8 @@ def _group_mean(floating[:, :] out, Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, :] sumx + floating val, count, y, t + floating[:, :] sumx, compensation int64_t[:, :] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) @@ -649,6 +656,7 @@ def _group_mean(floating[:, :] out, nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros_like(out) + compensation = np.zeros_like(out) N, K = (values).shape @@ -664,7 +672,10 @@ def _group_mean(floating[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + sumx[lab, j] = t for i in range(ncounts): for j in range(K): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e1c63448a2d22..1f9e6f90fe387 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2178,12 +2178,23 @@ def test_groupby_series_with_tuple_name(): @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system") -def test_groupby_numerical_stability_sum(): +@pytest.mark.parametrize("func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]) +def test_groupby_numerical_stability_sum_mean(func, values): # GH#38778 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) - result = df.groupby("group").sum() + result = getattr(df.groupby("group"), func)() expected = DataFrame( - {"a": [97.0, 98.0], "b": [97.0, 98.0]}, index=Index([1, 2], name="group") + {"a": values, "b": values}, index=Index([1, 2], name="group") ) tm.assert_frame_equal(result, expected) + + +def test_groupby_numerical_stability_cumsum(): + # GH#38778 + data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] + df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) + result = df.groupby("group").cumsum() + exp_data = [1e16] * 4 + [5e15] * 2 + [97.0, 98.0] + expected = DataFrame({"a": exp_data, "b": exp_data}) + tm.assert_frame_equal(result, expected) From 966cb119c7648b77b1b13875137ee901f0a06e9d Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 04:32:25 +0100 Subject: [PATCH 2/5] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index af11b6543a74b..d69cf3288fc6e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -294,6 +294,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) +- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`x`) Reshaping ^^^^^^^^^ From 03e8d7ee68ef9addf4cefeea10238d955ee03e69 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 04:34:14 +0100 Subject: [PATCH 3/5] Run Black --- pandas/tests/groupby/test_groupby.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1f9e6f90fe387..86107dfdc9c4a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2178,15 +2178,15 @@ def test_groupby_series_with_tuple_name(): @pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system") -@pytest.mark.parametrize("func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]) +@pytest.mark.parametrize( + "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])] +) def test_groupby_numerical_stability_sum_mean(func, values): # GH#38778 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) result = getattr(df.groupby("group"), func)() - expected = DataFrame( - {"a": values, "b": values}, index=Index([1, 2], name="group") - ) + expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group")) tm.assert_frame_equal(result, expected) From a4f5af54c618d05b260f254dc14d6d3ef9552c28 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 04:35:21 +0100 Subject: [PATCH 4/5] Adjust gh pr reference --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d69cf3288fc6e..b4b98ec0403a8 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -294,7 +294,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) -- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`x`) +- Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 86107dfdc9c4a..81cb6444bb647 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2191,7 +2191,7 @@ def test_groupby_numerical_stability_sum_mean(func, values): def test_groupby_numerical_stability_cumsum(): - # GH#38778 + # GH#38934 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) result = df.groupby("group").cumsum() From 0ac16f7edb6493d32cff70fe40ffff74a913de16 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 13:20:54 +0100 Subject: [PATCH 5/5] Add exact check --- pandas/tests/groupby/test_groupby.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 81cb6444bb647..5735f895e33b6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2190,11 +2190,14 @@ def test_groupby_numerical_stability_sum_mean(func, values): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system") def test_groupby_numerical_stability_cumsum(): # GH#38934 data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data}) result = df.groupby("group").cumsum() - exp_data = [1e16] * 4 + [5e15] * 2 + [97.0, 98.0] + exp_data = ( + [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0] + ) expected = DataFrame({"a": exp_data, "b": exp_data}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_exact=True)