PERF: Improve performance in rolling.mean(engine=numba) (#44176)

mroeschke · web-flow · commit 5d520c64aed6 · 2021-10-27T22:07:11.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -426,7 +426,7 @@ Performance improvements
 - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
 - Indexing into a :class:`SparseArray` with a ``slice`` with ``step=1`` no longer requires converting to a dense array (:issue:`43777`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
-- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)
+- Performance improvement in :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.Rolling.sum` and :meth:`.Expanding.sum` with ``engine="numba"`` (:issue:`43612`, :issue:`44176`)
 - Improved performance of :meth:`pandas.read_csv` with ``memory_map=True`` when file encoding is UTF-8 (:issue:`43787`)
 - Performance improvement in :meth:`RangeIndex.sort_values` overriding :meth:`Index.sort_values` (:issue:`43666`)
 - Performance improvement in :meth:`RangeIndex.insert` (:issue:`43988`)
diff --git a/pandas/core/_numba/kernels/__init__.py b/pandas/core/_numba/kernels/__init__.py
@@ -1,3 +1,4 @@
 from pandas.core._numba.kernels.mean_ import sliding_mean
+from pandas.core._numba.kernels.sum_ import sliding_sum
 
-__all__ = ["sliding_mean"]
+__all__ = ["sliding_mean", "sliding_sum"]
diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py
@@ -1,5 +1,5 @@
 """
-Numba 1D aggregation kernels that can be shared by
+Numba 1D mean kernels that can be shared by
 * Dataframe / Series
 * groupby
 * rolling / expanding
@@ -11,20 +11,7 @@
 import numba
 import numpy as np
 
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def is_monotonic_increasing(bounds: np.ndarray) -> bool:
-    """Check if int64 values are monotonically increasing."""
-    n = len(bounds)
-    if n < 2:
-        return True
-    prev = bounds[0]
-    for i in range(1, n):
-        cur = bounds[i]
-        if cur < prev:
-            return False
-        prev = cur
-    return True
+from pandas.core._numba.kernels.shared import is_monotonic_increasing
 
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
diff --git a/pandas/core/_numba/kernels/shared.py b/pandas/core/_numba/kernels/shared.py
@@ -0,0 +1,17 @@
+import numba
+import numpy as np
+
+
+@numba.jit(numba.boolean(numba.int64[:]), nopython=True, nogil=True, parallel=False)
+def is_monotonic_increasing(bounds: np.ndarray) -> bool:
+    """Check if int64 values are monotonically increasing."""
+    n = len(bounds)
+    if n < 2:  # empty and single-element inputs are trivially monotonic
+        return True
+    prev = bounds[0]
+    for i in range(1, n):
+        cur = bounds[i]
+        if cur < prev:  # only a strict decrease fails; equal neighbours pass
+            return False
+        prev = cur
+    return True
diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py
@@ -0,0 +1,98 @@
+"""
+Numba 1D sum kernels that can be shared by
+* Dataframe / Series
+* groupby
+* rolling / expanding
+
+Mirrors pandas/_libs/window/aggregation.pyx
+"""
+from __future__ import annotations
+
+import numba
+import numpy as np
+
+from pandas.core._numba.kernels.shared import is_monotonic_increasing
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def add_sum(
+    val: float, nobs: int, sum_x: float, compensation: float
+) -> tuple[int, float, float]:
+    if not np.isnan(val):  # NaN leaves all three accumulators unchanged
+        nobs += 1
+        y = val - compensation  # compensated (Kahan) add: apply carried error
+        t = sum_x + y
+        compensation = t - sum_x - y  # rounding error made by this addition
+        sum_x = t
+    return nobs, sum_x, compensation
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def remove_sum(
+    val: float, nobs: int, sum_x: float, compensation: float
+) -> tuple[int, float, float]:
+    if not np.isnan(val):  # NaN was never counted, so nothing to undo
+        nobs -= 1
+        y = -val - compensation  # same compensated add, applied to -val
+        t = sum_x + y
+        compensation = t - sum_x - y  # rounding error made by this subtraction
+        sum_x = t
+    return nobs, sum_x, compensation
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def sliding_sum(
+    values: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    min_periods: int,
+) -> np.ndarray:
+    N = len(start)  # one output value per (start[i], end[i]) window
+    nobs = 0
+    sum_x = 0.0
+    compensation_add = 0.0
+    compensation_remove = 0.0
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing(
+        start
+    ) and is_monotonic_increasing(end)
+
+    output = np.empty(N, dtype=np.float64)
+
+    for i in range(N):
+        s = start[i]
+        e = end[i]
+        if i == 0 or not is_monotonic_increasing_bounds:  # sum window from scratch
+            for j in range(s, e):
+                val = values[j]
+                nobs, sum_x, compensation_add = add_sum(
+                    val, nobs, sum_x, compensation_add
+                )
+        else:  # slide: drop values that left the window, add those that entered
+            for j in range(start[i - 1], s):
+                val = values[j]
+                nobs, sum_x, compensation_remove = remove_sum(
+                    val, nobs, sum_x, compensation_remove
+                )
+
+            for j in range(end[i - 1], e):
+                val = values[j]
+                nobs, sum_x, compensation_add = add_sum(
+                    val, nobs, sum_x, compensation_add
+                )
+
+        if nobs == 0 == min_periods:  # empty window is 0.0 only if min_periods == 0
+            result = 0.0
+        elif nobs >= min_periods:
+            result = sum_x
+        else:  # too few non-NaN observations, including empty with min_periods > 0
+            result = np.nan
+
+        output[i] = result
+
+        if not is_monotonic_increasing_bounds:  # next window is rebuilt from zero
+            nobs = 0
+            sum_x = 0.0
+            compensation_remove = 0.0
+
+    return output
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -1345,15 +1345,16 @@ def sum(
         if maybe_use_numba(engine):
             if self.method == "table":
                 func = generate_manual_numpy_nan_agg_with_axis(np.nansum)
+                return self.apply(
+                    func,
+                    raw=True,
+                    engine=engine,
+                    engine_kwargs=engine_kwargs,
+                )
             else:
-                func = np.nansum
+                from pandas.core._numba.kernels import sliding_sum
 
-            return self.apply(
-                func,
-                raw=True,
-                engine=engine,
-                engine_kwargs=engine_kwargs,
-            )
+                return self._numba_apply(sliding_sum, "rolling_sum", engine_kwargs)
         window_func = window_aggregations.roll_sum
         return self._apply(window_func, name="sum", **kwargs)
 
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -59,15 +59,17 @@ def test_numba_vs_cython_rolling_methods(
         expected = getattr(roll, method)(engine="cython")
 
         # Check the cache
-        if method != "mean":
+        if method not in ("mean", "sum"):
             assert (
                 getattr(np, f"nan{method}"),
                 "Rolling_apply_single",
             ) in NUMBA_FUNC_CACHE
 
         tm.assert_equal(result, expected)
 
-    @pytest.mark.parametrize("data", [DataFrame(np.eye(5)), Series(range(5))])
+    @pytest.mark.parametrize(
+        "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
+    )
     def test_numba_vs_cython_expanding_methods(
         self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators
     ):
@@ -82,7 +84,7 @@ def test_numba_vs_cython_expanding_methods(
         expected = getattr(expand, method)(engine="cython")
 
         # Check the cache
-        if method != "mean":
+        if method not in ("mean", "sum"):
             assert (
                 getattr(np, f"nan{method}"),
                 "Expanding_apply_single",