In-code comments replaced with minimal comment at the top of the

viable-alternative · viable-alternative · commit 3a709732eb4e · 2025-04-21T10:35:22.000-04:00
function, as requested by @mroeschke
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -1070,7 +1070,6 @@ def _roll_min_max(
         stack Dominators[int64_t]
         ndarray[float64_t, ndim=1] output
 
-        # ideally want these in the i-loop scope
         Py_ssize_t this_start, this_end, stash_start
         int64_t q_idx
 
@@ -1079,8 +1078,8 @@ def _roll_min_max(
     Dominators = stack[int64_t]()
 
     # This function was "ported" / translated from sliding_min_max()
-    # in /pandas/core/_numba/kernels/min_max_.py. (See there for detailed
-    # comments and credits.)
+    # in /pandas/core/_numba/kernels/min_max_.py.
+    # (See there for credits and some comments.)
     # Code translation assumptions/rules:
     # - min_periods --> minp
     # - deque[0] --> front()
@@ -1138,26 +1137,6 @@ def _roll_min_max(
                     while valid_start >= 0 and isnan(values[valid_start]):
                         valid_start += 1
 
-                    # Sadly, this runs more than 15% faster than trying to use
-                    # generic comparison functions.
-                    # That is, I tried:
-                    #
-                    # | cdef inline bint le(float64_t a, float64_t b) nogil:
-                    # |     return a <= b
-                    # | cdef inline bint ge(float64_t a, float64_t b) nogil:
-                    # |     return a >= b
-                    # | ctypedef bint (*cmp_func_t) (float64_t a, float64_t b) nogil
-                    # | ...
-                    # | cmp_func_t cmp
-                    # |
-                    # | if is_max:
-                    # |     cmp = ge
-                    # | else:
-                    # |     cmp = le
-                    # and, finally
-                    # | while not Q.empty() and cmp(values[k], values[Q.back()]):
-                    # |     Q.pop_back()
-
                     if is_max:
                         while not Q.empty() and values[k] >= values[Q.back()]:
                             Q.pop_back()
diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py
@@ -43,7 +43,10 @@ def sliding_min_max(
     min_periods: int,
     is_max: bool,
 ) -> tuple[np.ndarray, list[int]]:
-    # numba-only init part
+    # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
+    # It was generalized to work with an arbitrary list of any window size and position
+    # by adding the Dominators stack.
+
     N = len(start)
     na_pos = []
     output = np.empty(N, dtype=result_dtype)
@@ -54,59 +57,24 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
         else:
             return a <= b
 
-    # All comments below are for the case of a maximum, that is, is_max = True.
-    # I will call this Q a "stash": preliminary calculations minimally necessary to
-    # finish the job. Q will always be in ascending order regardless of min/max:
-    # these are indices into the "values" array. The values[Q[i]], however, will
-    # be in non-descending order for max, and non-ascending for min.
-    # Think of this deque as indices of maximums found so far for varying window
-    # positions. That is, there is only one maximum in the source array, but it may
-    # not fit each window. So there are many "secondary maximums", and each of them
-    # may potentially fit multiple windows (unless, of course you get a very special
-    # case of an arary of strictly descending values and constant rolling window size).
-    # In that case Q will be the longest, so here is an idea for the worst case
-    # scenario testing.
-
-    # We have to pretend, given that Numba has neither queue nor stack.
     Q: list = []  # this is a queue
     Dominators: list = []  # this is a stack
-    # END-OF numba-only init part
-
-    # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
-    # It was generalized to work with an arbitrary list of any window size and position
 
-    # Zero is apparently passed here as a default.
-    # It is important to have this value precise for calculations.
     if min_periods < 1:
         min_periods = 1
 
-    # We will say that node j "dominates" node i if j comes after i, yet requires a
-    # deeper deque Q at the time node i is processed in order to be able to finish
-    # the job for node j. This precisely means the following two conditions:
-    # - j > i
-    # - start[j] < start[i].
-    # We keep track of such nodes in the Dominators queue.
-    # In addition, if it so happens that two nodes j1 and j2 dominate another node,
-    # and j2 > j1, yet start[j2] <= start[j1], then we only need to keep track of j2.
-    # (To understand why this works, note that the algorithm requires that
-    # the "end" array is sorted in non-descending order, among other things.)
     if N > 2:
         i_next = N - 1  # equivalent to i_next = i+1 inside the loop
         for i in range(N - 2, -1, -1):
             next_dominates = start[i_next] < start[i]
             if next_dominates and (
                 not Dominators or start[Dominators[-1]] > start[i_next]
             ):
-                # Both ">" and ">=" would have been (logically) equivalent, but we are
-                # shooting for the shortest size of the Dominators list, hence the
-                # usage of ">"
                 Dominators.append(i_next)
             i_next = i
 
     valid_start = -min_periods
 
-    # Having these relieves us from having "if i>0" on each iteration for special
-    # handling
     last_end = 0
     last_start = -1
 
@@ -117,26 +85,16 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
         if Dominators and Dominators[-1] == i:
             Dominators.pop()
 
-        # TODO: Arguably there are benefits to having this consistency check before
-        # this function is even called (e.g. in rolling.py).
-        # Given the current implementation, it will be rather tricky at the moment
-        # to have this check in rolling.py. Additionally, this is only required for
-        # min/max, and may only ever be violated if user-defined window indexer is
-        # used. Thus this is the best spot for it, given the circumstances.
         if not (
             this_end > last_end or (this_end == last_end and this_start >= last_start)
         ):
             raise ValueError(
                 "Start/End ordering requirement is violated at index " + str(i)
             )
 
-        # This is the least restrictive starting index that will take care of current
-        # item (i) and all remaining items
         stash_start = (
             this_start if not Dominators else min(this_start, start[Dominators[-1]])
         )
-        # Discard entries outside of the "needed" window. Do it first as to keep the
-        # stash small.
         while Q and Q[0] < stash_start:
             Q.pop(0)
 
@@ -150,29 +108,13 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
                 Q.append(k)  # Q.push_back(k)
 
         if not Q or (this_start > valid_start):
-            # The "not Q" condition means we have not seen anything but NaNs yet in
-            # values[:this_end-1]. The "this_start > valid_start" condition means we
-            # have not accumulated enough (min_periods or more) non-NaN values.
             if values.dtype.kind != "i":
                 output[i] = np.nan
             else:
                 na_pos.append(i)
         elif Q[0] >= this_start:
-            # This is the only read-from-the-stash scenario that ever happens when
-            # window size is constant across the set. This is also likely 99+% of
-            # all use cases, thus we want to make sure we do not go into bisection
-            # as to incur neither the *log(k) penalty nor the function call penalty
-            # for this very common case. If we are here, then our stash is as "deep"
-            # as what the current node ("job") requires. Thus take the front item.
             output[i] = values[Q[0]]
         else:
-            # In this case our stash is bigger than what is necessary to compute this
-            # node's output due to a wider search window at (one of) the nodes that
-            # follow. We have to locate our value in the middle of the stash.
-            # Since our stash is sorted, we can use binary search:
-            # here we need to output the item closest to the front (idx=0) of the
-            # stash that fits our window bounds. Item 0 has been looked at (and
-            # discarded) by now, so lo=1
             q_idx = bisect_left(Q, this_start, lo=1)
             output[i] = values[Q[q_idx]]
         last_end = this_end