Rolling min/max: added brief comments; renamed a couple of local variables (Q -> candidates, Dominators -> dominators); decoupled the numba and cython unit tests.

viable-alternative · viable-alternative · commit fe47c7474b5e · 2025-04-23T17:24:29.000-04:00
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -989,15 +989,14 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
 
 # ----------------------------------------------------------------------
 
-# Moving maximum / minimum code taken from Bottleneck
-# Licence at LICENSES/BOTTLENECK_LICENCE
-
 cdef int64_t bisect_left(
     deque[int64_t]& a,
     int64_t x,
     int64_t lo=0,
     int64_t hi=-1
 ) nogil:
+    """Same as https://docs.python.org/3/library/bisect.html."""
+
     cdef int64_t mid
     if hi == -1:
         hi = a.size()
@@ -1011,6 +1010,9 @@ cdef int64_t bisect_left(
 
 from libc.math cimport isnan
 
+# Prior version of moving maximum / minimum code taken from Bottleneck
+# Licence at LICENSES/BOTTLENECK_LICENCE
+
 
 def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
@@ -1066,16 +1068,19 @@ def _roll_min_max(
 ):
     cdef:
         Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start)
-        deque Q[int64_t]
-        stack Dominators[int64_t]
+        # Indices of bounded extrema in `values`. `candidates[i]` is always increasing.
+        # `values[candidates[i]]` is decreasing for max and increasing for min.
+        deque candidates[int64_t]
+        # Indices of largest windows that "cover" preceding windows.
+        stack dominators[int64_t]
         ndarray[float64_t, ndim=1] output
 
         Py_ssize_t this_start, this_end, stash_start
         int64_t q_idx
 
     output = np.empty(N, dtype=np.float64)
-    Q = deque[int64_t]()
-    Dominators = stack[int64_t]()
+    candidates = deque[int64_t]()
+    dominators = stack[int64_t]()
 
     # This function was "ported" / translated from sliding_min_max()
     # in /pandas/core/_numba/kernels/min_max_.py.
@@ -1100,12 +1105,13 @@ def _roll_min_max(
             for i in range(N - 2, -1, -1):
                 if start[i_next] < start[i] \
                     and (
-                           Dominators.empty()
-                        or start[Dominators.top()] > start[i_next]
+                           dominators.empty()
+                        or start[dominators.top()] > start[i_next]
                 ):
-                    Dominators.push(i_next)
+                    dominators.push(i_next)
                 i_next = i
 
+        # NaN tracking to guarantee minp
         valid_start = -minp
 
         last_end = 0
@@ -1115,21 +1121,21 @@ def _roll_min_max(
             this_start = start[i]
             this_end = end[i]
 
-            if (not Dominators.empty() and Dominators.top() == i):
-                Dominators.pop()
+            if (not dominators.empty() and dominators.top() == i):
+                dominators.pop()
 
             if not (this_end > last_end
                     or (this_end == last_end and this_start >= last_start)):
                 raise ValueError(
                     "Start/End ordering requirement is violated at index {}".format(i))
 
-            if Dominators.empty():
+            if dominators.empty():
                 stash_start = this_start
             else:
-                stash_start = min(this_start, start[Dominators.top()])
+                stash_start = min(this_start, start[dominators.top()])
 
-            while not Q.empty() and Q.front() < stash_start:
-                Q.pop_front()
+            while not candidates.empty() and candidates.front() < stash_start:
+                candidates.pop_front()
 
             for k in range(last_end, this_end):
                 if not isnan(values[k]):
@@ -1138,20 +1144,23 @@ def _roll_min_max(
                         valid_start += 1
 
                     if is_max:
-                        while not Q.empty() and values[k] >= values[Q.back()]:
-                            Q.pop_back()
+                        while (not candidates.empty()
+                                and values[k] >= values[candidates.back()]):
+                            candidates.pop_back()
                     else:
-                        while not Q.empty() and values[k] <= values[Q.back()]:
-                            Q.pop_back()
-                    Q.push_back(k)
+                        while (not candidates.empty()
+                                and values[k] <= values[candidates.back()]):
+                            candidates.pop_back()
+                    candidates.push_back(k)
 
-            if Q.empty() or this_start > valid_start:
+            if candidates.empty() or this_start > valid_start:
                 output[i] = NaN
-            elif Q.front() >= this_start:
-                output[i] = values[Q.front()]
+            elif candidates.front() >= this_start:
+                # ^^ This is here to avoid costly bisection for fixed window sizes.
+                output[i] = values[candidates.front()]
             else:
-                q_idx = bisect_left(Q, this_start, lo=1)
-                output[i] = values[Q[q_idx]]
+                q_idx = bisect_left(candidates, this_start, lo=1)
+                output[i] = values[candidates[q_idx]]
             last_end = this_end
             last_start = this_start
 
diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py
@@ -23,6 +23,7 @@
 
 @numba.njit(nogil=True, parallel=False)
 def bisect_left(a: list[Any], x: Any, lo: int = 0, hi: int = -1) -> int:
+    """Same as https://docs.python.org/3/library/bisect.html; not in numba yet!"""
     if hi == -1:
         hi = len(a)
     while lo < hi:
@@ -57,8 +58,11 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
         else:
             return a <= b
 
-    Q: list = []  # this is a queue
-    Dominators: list = []  # this is a stack
+    # Indices of bounded extrema in `values`. `candidates[i]` is always increasing.
+    # `values[candidates[i]]` is decreasing for max and increasing for min.
+    candidates: list[int] = []  # this is a queue
+    # Indices of largest windows that "cover" preceding windows.
+    dominators: list[int] = []  # this is a stack
 
     if min_periods < 1:
         min_periods = 1
@@ -68,11 +72,12 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
         for i in range(N - 2, -1, -1):
             next_dominates = start[i_next] < start[i]
             if next_dominates and (
-                not Dominators or start[Dominators[-1]] > start[i_next]
+                not dominators or start[dominators[-1]] > start[i_next]
             ):
-                Dominators.append(i_next)
+                dominators.append(i_next)
             i_next = i
 
+    # NaN tracking to guarantee min_periods
     valid_start = -min_periods
 
     last_end = 0
@@ -82,8 +87,8 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
         this_start = start[i].item()
         this_end = end[i].item()
 
-        if Dominators and Dominators[-1] == i:
-            Dominators.pop()
+        if dominators and dominators[-1] == i:
+            dominators.pop()
 
         if not (
             this_end > last_end or (this_end == last_end and this_start >= last_start)
@@ -93,30 +98,31 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
             )
 
         stash_start = (
-            this_start if not Dominators else min(this_start, start[Dominators[-1]])
+            this_start if not dominators else min(this_start, start[dominators[-1]])
         )
-        while Q and Q[0] < stash_start:
-            Q.pop(0)
+        while candidates and candidates[0] < stash_start:
+            candidates.pop(0)
 
         for k in range(last_end, this_end):
             if not np.isnan(values[k]):
                 valid_start += 1
                 while valid_start >= 0 and np.isnan(values[valid_start]):
                     valid_start += 1
-                while Q and cmp(values[k], values[Q[-1]], is_max):
-                    Q.pop()  # Q.pop_back()
-                Q.append(k)  # Q.push_back(k)
+                while candidates and cmp(values[k], values[candidates[-1]], is_max):
+                    candidates.pop()  # candidates.pop_back()
+                candidates.append(k)  # candidates.push_back(k)
 
-        if not Q or (this_start > valid_start):
+        if not candidates or (this_start > valid_start):
             if values.dtype.kind != "i":
                 output[i] = np.nan
             else:
                 na_pos.append(i)
-        elif Q[0] >= this_start:
-            output[i] = values[Q[0]]
+        elif candidates[0] >= this_start:
+            # ^^ This is here to avoid costly bisection for fixed window sizes.
+            output[i] = values[candidates[0]]
         else:
-            q_idx = bisect_left(Q, this_start, lo=1)
-            output[i] = values[Q[q_idx]]
+            q_idx = bisect_left(candidates, this_start, lo=1)
+            output[i] = values[candidates[q_idx]]
         last_end = this_end
         last_start = this_start
 
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -12,6 +12,7 @@
     to_datetime,
 )
 import pandas._testing as tm
+from pandas.api.indexers import BaseIndexer
 from pandas.util.version import Version
 
 pytestmark = [pytest.mark.single_cpu]
@@ -583,16 +584,65 @@ def test_npfunc_no_warnings():
         df.col1.rolling(2).apply(np.prod, raw=True, engine="numba")
 
 
-from .test_rolling import TestMinMax
+class PrescribedWindowIndexer(BaseIndexer):
+    def __init__(self, start, end):
+        self._start = start
+        self._end = end
+        super().__init__()
+
+    def get_window_bounds(
+        self, num_values=None, min_periods=None, center=None, closed=None, step=None
+    ):
+        if num_values is None:
+            num_values = len(self._start)
+        start = np.clip(self._start, 0, num_values)
+        end = np.clip(self._end, 0, num_values)
+        return start, end
 
 
 @td.skip_if_no("numba")
 class TestMinMaxNumba:
-    parent = TestMinMax()
-
-    @pytest.mark.parametrize("is_max, has_nan, exp_list", TestMinMax.TestData)
+    @pytest.mark.parametrize(
+        "is_max, has_nan, exp_list",
+        [
+            (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
+            (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]),
+            (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]),
+            (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]),
+        ],
+    )
     def test_minmax(self, is_max, has_nan, exp_list):
-        TestMinMaxNumba.parent.test_minmax(is_max, has_nan, exp_list, "numba")
+        nan_idx = [0, 5, 8]
+        df = DataFrame(
+            {
+                "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0],
+                "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3],
+                "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10],
+            }
+        )
+        if has_nan:
+            df.loc[nan_idx, "data"] = np.nan
+        expected = Series(exp_list, name="data")
+        r = df.data.rolling(
+            PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy())
+        )
+        if is_max:
+            result = r.max(engine="numba")
+        else:
+            result = r.min(engine="numba")
+
+        tm.assert_series_equal(result, expected)
 
     def test_wrong_order(self):
-        TestMinMaxNumba.parent.test_wrong_order("numba")
+        start = np.array(range(5), dtype=np.int64)
+        end = start + 1
+        end[3] = end[2]
+        start[3] = start[2] - 1
+
+        df = DataFrame({"data": start * 1.0, "start": start, "end": end})
+
+        r = df.data.rolling(PrescribedWindowIndexer(start, end))
+        with pytest.raises(
+            ValueError, match="Start/End ordering requirement is violated at index 3"
+        ):
+            r.max(engine="numba")
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1965,15 +1965,16 @@ def get_window_bounds(
 
 
 class TestMinMax:
-    TestData = [
-        (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
-        (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]),
-        (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]),
-        (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]),
-    ]
-
-    @pytest.mark.parametrize("is_max, has_nan, exp_list", TestData)
-    def test_minmax(self, is_max, has_nan, exp_list, engine=None):
+    @pytest.mark.parametrize(
+        "is_max, has_nan, exp_list",
+        [
+            (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
+            (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]),
+            (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]),
+            (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]),
+        ],
+    )
+    def test_minmax(self, is_max, has_nan, exp_list):
         nan_idx = [0, 5, 8]
         df = DataFrame(
             {
@@ -1989,13 +1990,13 @@ def test_minmax(self, is_max, has_nan, exp_list, engine=None):
             PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy())
         )
         if is_max:
-            result = r.max(engine=engine)
+            result = r.max()
         else:
-            result = r.min(engine=engine)
+            result = r.min()
 
         tm.assert_series_equal(result, expected)
 
-    def test_wrong_order(self, engine=None):
+    def test_wrong_order(self):
         start = np.array(range(5), dtype=np.int64)
         end = start + 1
         end[3] = end[2]
@@ -2007,4 +2008,4 @@ def test_wrong_order(self, engine=None):
         with pytest.raises(
             ValueError, match="Start/End ordering requirement is violated at index 3"
         ):
-            r.max(engine=engine)
+            r.max()