@@ -43,7 +43,10 @@ def sliding_min_max(
43
43
min_periods : int ,
44
44
is_max : bool ,
45
45
) -> tuple [np .ndarray , list [int ]]:
46
- # numba-only init part
46
+ # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
47
+ # It was generalized to work with an arbitrary list of any window size and position
48
+ # by adding the Dominators stack.
49
+
47
50
N = len (start )
48
51
na_pos = []
49
52
output = np .empty (N , dtype = result_dtype )
@@ -54,59 +57,24 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
54
57
else :
55
58
return a <= b
56
59
57
- # All comments below are for the case of a maximum, that is, is_max = True.
58
- # I will call this Q a "stash": preliminary calculations minimally necessary to
59
- # finish the job. Q will always be in ascending order regardless of min/max:
60
- # these are indices into the "values" array. The values[Q[i]], however, will
61
- # be in non-descending order for max, and non-ascending for min.
62
- # Think of this deque as indices of maximums found so far for varying window
63
- # positions. That is, there is only one maximum in the source array, but it may
64
- # not fit each window. So there are many "secondary maximums", and each of them
65
- # may potentially fit multiple windows (unless, of course you get a very special
66
- # case of an array of strictly descending values and constant rolling window size).
67
- # In that case Q will be the longest, so here is an idea for the worst case
68
- # scenario testing.
69
-
70
- # We have to pretend, given that Numba has neither queue nor stack.
71
60
Q : list = [] # this is a queue
72
61
Dominators : list = [] # this is a stack
73
- # END-OF numba-only init part
74
-
75
- # Basic idea of the algorithm: https://stackoverflow.com/a/12239580
76
- # It was generalized to work with an arbitrary list of any window size and position
77
62
78
- # Zero is apparently passed here as a default.
79
- # It is important to have this value precise for calculations.
80
63
if min_periods < 1 :
81
64
min_periods = 1
82
65
83
- # We will say that node j "dominates" node i if j comes after i, yet requires a
84
- # deeper deque Q at the time node i is processed in order to be able to finish
85
- # the job for node j. This precisely means the following two conditions:
86
- # - j > i
87
- # - start[j] < start[i].
88
- # We keep track of such nodes in the Dominators stack.
89
- # In addition, if it so happens that two nodes j1 and j2 dominate another node,
90
- # and j2 > j1, yet start[j2] <= start[j1], then we only need to keep track of j2.
91
- # (To understand why this works, note that the algorithm requires that
92
- # the "end" array is sorted in non-descending order, among other things.)
93
66
if N > 2 :
94
67
i_next = N - 1 # equivalent to i_next = i+1 inside the loop
95
68
for i in range (N - 2 , - 1 , - 1 ):
96
69
next_dominates = start [i_next ] < start [i ]
97
70
if next_dominates and (
98
71
not Dominators or start [Dominators [- 1 ]] > start [i_next ]
99
72
):
100
- # Both ">" and ">=" would have been (logically) equivalent, but we are
101
- # shooting for the shortest size of the Dominators list, hence the
102
- # usage of ">"
103
73
Dominators .append (i_next )
104
74
i_next = i
105
75
106
76
valid_start = - min_periods
107
77
108
- # Having these relieves us from having "if i>0" on each iteration for special
109
- # handling
110
78
last_end = 0
111
79
last_start = - 1
112
80
@@ -117,26 +85,16 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
117
85
if Dominators and Dominators [- 1 ] == i :
118
86
Dominators .pop ()
119
87
120
- # TODO: Arguably there are benefits to having this consistency check before
121
- # this function is even called (e.g. in rolling.py).
122
- # Given the current implementation, it will be rather tricky at the moment
123
- # to have this check in rolling.py. Additionally, this is only required for
124
- # min/max, and may only ever be violated if user-defined window indexer is
125
- # used. Thus this is the best spot for it, given the circumstances.
126
88
if not (
127
89
this_end > last_end or (this_end == last_end and this_start >= last_start )
128
90
):
129
91
raise ValueError (
130
92
"Start/End ordering requirement is violated at index " + str (i )
131
93
)
132
94
133
- # This is the least restrictive starting index that will take care of current
134
- # item (i) and all remaining items
135
95
stash_start = (
136
96
this_start if not Dominators else min (this_start , start [Dominators [- 1 ]])
137
97
)
138
- # Discard entries outside of the "needed" window. Do it first as to keep the
139
- # stash small.
140
98
while Q and Q [0 ] < stash_start :
141
99
Q .pop (0 )
142
100
@@ -150,29 +108,13 @@ def cmp(a: Any, b: Any, is_max: bool) -> bool:
150
108
Q .append (k ) # Q.push_back(k)
151
109
152
110
if not Q or (this_start > valid_start ):
153
- # The "not Q" condition means we have not seen anything but NaNs yet in
154
- # values[:this_end-1]. The "this_start > valid_start" condition means we
155
- # have not accumulated enough (min_periods or more) non-NaN values.
156
111
if values .dtype .kind != "i" :
157
112
output [i ] = np .nan
158
113
else :
159
114
na_pos .append (i )
160
115
elif Q [0 ] >= this_start :
161
- # This is the only read-from-the-stash scenario that ever happens when
162
- # window size is constant across the set. This is also likely 99+% of
163
- # all use cases, thus we want to make sure we do not go into bisection
164
- # as to incur neither the *log(k) penalty nor the function call penalty
165
- # for this very common case. If we are here, then our stash is as "deep"
166
- # as what the current node ("job") requires. Thus take the front item.
167
116
output [i ] = values [Q [0 ]]
168
117
else :
169
- # In this case our stash is bigger than what is necessary to compute this
170
- # node's output due to a wider search window at (one of) the nodes that
171
- # follow. We have to locate our value in the middle of the stash.
172
- # Since our stash is sorted, we can use binary search:
173
- # here we need to output the item closest to the front (idx=0) of the
174
- # stash that fits our window bounds. Item 0 has been looked at (and
175
- # discarded) by now, so lo=1
176
118
q_idx = bisect_left (Q , this_start , lo = 1 )
177
119
output [i ] = values [Q [q_idx ]]
178
120
last_end = this_end
0 commit comments