From e2fb9acfa003195ea25e3fb82f6d4ca5649df98c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 11:25:46 -0400 Subject: [PATCH 1/6] PERF: group_cumsum ints/datetimelike --- pandas/_libs/groupby.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b72b927b3c2a8..a2c97f85e15f2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -260,10 +260,7 @@ def group_cumsum(numeric[:, ::1] out, accum[lab, j] = NaN break else: - y = val - compensation[lab, j] - t = accum[lab, j] + y - compensation[lab, j] = t - accum[lab, j] - y - accum[lab, j] = t + accum[lab, j] = val + accum[lab, j] out[i, j] = accum[lab, j] From 246d829068d7091f8ad9abe808a55fb508d44dd4 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 11:40:33 -0400 Subject: [PATCH 2/6] WIP --- pandas/_libs/groupby.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a2c97f85e15f2..8b24164659ec0 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -230,7 +230,7 @@ def group_cumsum(numeric[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val, y, t + numeric val, next_val, y, t numeric[:, ::1] accum, compensation intp_t lab @@ -260,8 +260,9 @@ def group_cumsum(numeric[:, ::1] out, accum[lab, j] = NaN break else: - accum[lab, j] = val + accum[lab, j] - out[i, j] = accum[lab, j] + next_val = val + accum[lab, j] + accum[lab, j] = next_val + out[i, j] = next_val @cython.boundscheck(False) From 6c1b9ca66ab9e33037e00abc15bca3d1f0df61f7 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 12:00:25 -0400 Subject: [PATCH 3/6] PERF/CLN: no need for kahan for int group_cumsum --- pandas/_libs/groupby.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8b24164659ec0..f5ae1c9b42c36 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -230,7 +230,7 @@ def group_cumsum(numeric[:, ::1] out, """ cdef: Py_ssize_t i, j, N, K, size - numeric val, next_val, y, t + numeric val, y, t numeric[:, ::1] accum, compensation intp_t lab @@ -253,16 +253,16 @@ def group_cumsum(numeric[:, ::1] out, t = accum[lab, j] + y compensation[lab, j] = t - accum[lab, j] - y accum[lab, j] = t - out[i, j] = accum[lab, j] + out[i, j] = t else: out[i, j] = NaN if not skipna: accum[lab, j] = NaN break else: - next_val = val + accum[lab, j] - accum[lab, j] = next_val - out[i, j] = next_val + t = val + accum[lab, j] + accum[lab, j] = t + out[i, j] = t @cython.boundscheck(False) From 8b8a832088456108d3a553200cf6fbce73b3c617 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 12:02:11 -0400 Subject: [PATCH 4/6] Add benchmark --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 27761ccd0d917..1d349fe31ef0e 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -509,7 +509,7 @@ class CumminMax: param_names = ["dtype", "method"] params = [ ["float64", "int64", "Float64", "Int64"], - ["cummin", "cummax"], + ["cummin", "cummax", "cumsum"], ] def setup(self, dtype, method): From 06e98ab239da9934f98a35f8b386ac294a080062 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 12:02:42 -0400 Subject: [PATCH 5/6] Change benchmark name --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 1d349fe31ef0e..8138ba2ced046 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -505,7 +505,7 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) -class CumminMax: +class Cumulative: param_names = ["dtype", "method"] params = [ ["float64", "int64", "Float64", "Int64"], From 208e6edcb07ae91f03d38c5b55a77d0a170d3121 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 8 Jun 2021 19:41:54 -0400 Subject: [PATCH 6/6] Add kahan comment --- pandas/_libs/groupby.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f5ae1c9b42c36..c05dbf5e3c8ec 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -247,6 +247,8 @@ def group_cumsum(numeric[:, ::1] out, for j in range(K): val = values[i, j] + # For floats, use Kahan summation to reduce floating-point + # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) if numeric == float32_t or numeric == float64_t: if val == val: y = val - compensation[lab, j]