Skip to content

Commit 63730ef

Browse files
BUG: Fix for bug pandas-dev#46726: wrong result with varying window size min/max rolling calc.
1 parent 184e84d commit 63730ef

File tree

10 files changed

+696
-189
lines changed

10 files changed

+696
-189
lines changed

pandas/_libs/window/aggregations.pyi

+7
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ def roll_median_c(
4848
end: np.ndarray, # np.ndarray[np.int64]
4949
minp: int, # int64_t
5050
) -> np.ndarray: ... # np.ndarray[float]
51+
def roll_min_max(
52+
values: np.ndarray, # np.ndarray[np.float64]
53+
start: np.ndarray, # np.ndarray[np.int64]
54+
end: np.ndarray, # np.ndarray[np.int64]
55+
minp: int, # int64_t
56+
is_max: bool,
57+
) -> np.ndarray: ... # np.ndarray[float]
5158
def roll_max(
5259
values: np.ndarray, # np.ndarray[np.float64]
5360
start: np.ndarray, # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

+155-101
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
66
sqrt,
77
)
88
from libcpp.deque cimport deque
9+
from libcpp.stack cimport stack
910
from libcpp.unordered_map cimport unordered_map
1011

1112
from pandas._libs.algos cimport TiebreakEnumType
@@ -991,36 +992,161 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
991992
# Moving maximum / minimum code taken from Bottleneck
992993
# Licence at LICENSES/BOTTLENECK_LICENCE
993994

995+
cdef int64_t bisect_left(
996+
deque[int64_t]& a,
997+
int64_t x,
998+
int64_t lo=0,
999+
int64_t hi=-1
1000+
) nogil:
1001+
cdef int64_t mid
1002+
if hi == -1:
1003+
hi = a.size()
1004+
while lo < hi:
1005+
mid = (lo + hi) // 2
1006+
if a.at(mid) < x:
1007+
lo = mid + 1
1008+
else:
1009+
hi = mid
1010+
return lo
9941011

995-
cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil:
1012+
from libc.math cimport isnan
9961013

997-
if ai == ai:
998-
nobs[0] = nobs[0] + 1
999-
elif is_max:
1000-
ai = MINfloat64
1001-
else:
1002-
ai = MAXfloat64
10031014

1004-
return ai
1015+
def roll_min_max(
1016+
ndarray[float64_t] values,
1017+
ndarray[int64_t] start,
1018+
ndarray[int64_t] end,
1019+
int64_t minp,
1020+
bint is_max
1021+
):
1022+
"""
1023+
Moving min/max of 1d array of any numeric type along axis=0, ignoring NaNs.
10051024
1025+
Parameters
1026+
----------
1027+
values : np.ndarray[np.float64], the target array
1028+
start : np.ndarray[np.float64], array of window low bounds
1029+
end : np.ndarray[np.float64], array of window high bounds
1030+
minp : if number of observations in window
1031+
is below this, output a NaN
1032+
is_max : bint, True for max, False for min
10061033
1007-
cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil:
1008-
""" remove a value from the mm calc """
1009-
if aold == aold:
1010-
nobs[0] = nobs[0] - 1
1034+
Returns
1035+
-------
1036+
np.ndarray[float]
1037+
"""
1038+
cdef:
1039+
Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start)
1040+
deque Q[int64_t]
1041+
stack Dominators[int64_t]
1042+
ndarray[float64_t, ndim=1] output
10111043

1044+
# ideally want these in the i-loop scope
1045+
Py_ssize_t this_start, this_end, stash_start
1046+
int64_t q_idx
10121047

1013-
cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
1014-
float64_t value) noexcept nogil:
1015-
cdef:
1016-
float64_t result
1048+
output = np.empty(N, dtype=np.float64)
1049+
Q = deque[int64_t]()
1050+
Dominators = stack[int64_t]()
1051+
1052+
# This function was "ported"/translated from python_sliding_min_max()
1053+
# in pandas/core/window/_min_max.py. (See there for detailed comments and credits.)
1054+
# Code translation assumptions/rules:
1055+
# - min_periods --> minp
1056+
# - deque[0] --> front()
1057+
# - deque[-1] --> back()
1058+
# - stack[-1] --> top()
1059+
# - bool(stack/deque) --> !empty()
1060+
# - deque.append() --> push_back()
1061+
# - stack.append() --> push()
1062+
# - deque.popleft --> pop_front()
1063+
# - deque.pop() --> pop_back()
10171064

1018-
if nobs >= minp:
1019-
result = value
1020-
else:
1021-
result = NaN
1065+
with nogil:
1066+
if minp < 1:
1067+
minp = 1
10221068

1023-
return result
1069+
if N>2:
1070+
i_next = N - 1
1071+
for i in range(N - 2, -1, -1):
1072+
if start[i_next] < start[i] \
1073+
and (
1074+
Dominators.empty()
1075+
or start[Dominators.top()] > start[i_next]
1076+
):
1077+
Dominators.push(i_next)
1078+
i_next = i
1079+
1080+
valid_start = -minp
1081+
1082+
last_end =0
1083+
last_start=-1
1084+
1085+
for i in range(N):
1086+
this_start = start[i]
1087+
this_end = end[i]
1088+
1089+
if (not Dominators.empty() and Dominators.top() == i):
1090+
Dominators.pop()
1091+
1092+
if not (this_end > last_end
1093+
or (this_end == last_end and this_start >= last_start)):
1094+
raise ValueError(
1095+
"Start/End ordering requirement is violated at index {}".format(i))
1096+
1097+
if Dominators.empty():
1098+
stash_start = this_start
1099+
else:
1100+
stash_start = min(this_start, start[Dominators.top()])
1101+
1102+
while not Q.empty() and Q.front() < stash_start:
1103+
Q.pop_front()
1104+
1105+
for k in range(last_end, this_end):
1106+
if not isnan(values[k]):
1107+
valid_start += 1
1108+
while valid_start>=0 and isnan(values[valid_start]):
1109+
valid_start += 1
1110+
1111+
# Sadly, this runs more than 15% faster than trying to use
1112+
# generic comparison functions.
1113+
# That is, I tried:
1114+
#
1115+
# | cdef inline bint le(float64_t a, float64_t b) nogil:
1116+
# | return a <= b
1117+
# | cdef inline bint ge(float64_t a, float64_t b) nogil:
1118+
# | return a >= b
1119+
# | ctypedef bint (*cmp_func_t) (float64_t a, float64_t b) nogil
1120+
# | ...
1121+
# | cmp_func_t cmp
1122+
# |
1123+
# | if is_max:
1124+
# | cmp = ge
1125+
# | else:
1126+
# | cmp = le
1127+
# and, finally
1128+
# | while not Q.empty() and cmp(values[k], values[Q.back()]):
1129+
# | Q.pop_back()
1130+
1131+
if is_max:
1132+
while not Q.empty() and values[k] >= values[Q.back()]:
1133+
Q.pop_back()
1134+
else:
1135+
while not Q.empty() and values[k] <= values[Q.back()]:
1136+
Q.pop_back()
1137+
Q.push_back(k)
1138+
1139+
if Q.empty() or this_start > valid_start:
1140+
output[i] = NaN
1141+
elif Q.front() >= this_start:
1142+
output[i] = values[Q.front()]
1143+
else:
1144+
q_idx = bisect_left(Q, this_start, lo=1)
1145+
output[i] = values[Q[q_idx]]
1146+
last_end = this_end
1147+
last_start = this_start
1148+
1149+
return output
10241150

10251151

10261152
def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1030,21 +1156,17 @@ def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
10301156

10311157
Parameters
10321158
----------
1033-
values : np.ndarray[np.float64]
1034-
window : int, size of rolling window
1159+
values : np.ndarray[np.float64], the target array
1160+
start : np.ndarray[np.float64], array of window low bounds
1161+
end : np.ndarray[np.float64], array of window high bounds
10351162
minp : if number of observations in window
10361163
is below this, output a NaN
1037-
index : ndarray, optional
1038-
index for window computation
1039-
closed : 'right', 'left', 'both', 'neither'
1040-
make the interval closed on the right, left,
1041-
both or neither endpoints
10421164

10431165
Returns
10441166
-------
10451167
np.ndarray[float]
10461168
"""
1047-
return _roll_min_max(values, start, end, minp, is_max=1)
1169+
return roll_min_max(values, start, end, minp, is_max=1)
10481170

10491171

10501172
def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1054,85 +1176,17 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
10541176

10551177
Parameters
10561178
----------
1057-
values : np.ndarray[np.float64]
1058-
window : int, size of rolling window
1179+
values : np.ndarray[np.float64], the target array
1180+
start : np.ndarray[np.float64], array of window low bounds
1181+
end : np.ndarray[np.float64], array of window high bounds
10591182
minp : if number of observations in window
10601183
is below this, output a NaN
1061-
index : ndarray, optional
1062-
index for window computation
10631184

10641185
Returns
10651186
-------
10661187
np.ndarray[float]
10671188
"""
1068-
return _roll_min_max(values, start, end, minp, is_max=0)
1069-
1070-
1071-
cdef _roll_min_max(ndarray[float64_t] values,
1072-
ndarray[int64_t] starti,
1073-
ndarray[int64_t] endi,
1074-
int64_t minp,
1075-
bint is_max):
1076-
cdef:
1077-
float64_t ai
1078-
int64_t curr_win_size, start
1079-
Py_ssize_t i, k, nobs = 0, N = len(starti)
1080-
deque Q[int64_t] # min/max always the front
1081-
deque W[int64_t] # track the whole window for nobs compute
1082-
ndarray[float64_t, ndim=1] output
1083-
1084-
output = np.empty(N, dtype=np.float64)
1085-
Q = deque[int64_t]()
1086-
W = deque[int64_t]()
1087-
1088-
with nogil:
1089-
1090-
# This is using a modified version of the C++ code in this
1091-
# SO post: https://stackoverflow.com/a/12239580
1092-
# The original impl didn't deal with variable window sizes
1093-
# So the code was optimized for that
1094-
1095-
# first window's size
1096-
curr_win_size = endi[0] - starti[0]
1097-
# GH 32865
1098-
# Anchor output index to values index to provide custom
1099-
# BaseIndexer support
1100-
for i in range(N):
1101-
1102-
curr_win_size = endi[i] - starti[i]
1103-
if i == 0:
1104-
start = starti[i]
1105-
else:
1106-
start = endi[i - 1]
1107-
1108-
for k in range(start, endi[i]):
1109-
ai = init_mm(values[k], &nobs, is_max)
1110-
# Discard previous entries if we find new min or max
1111-
if is_max:
1112-
while not Q.empty() and ((ai >= values[Q.back()]) or
1113-
values[Q.back()] != values[Q.back()]):
1114-
Q.pop_back()
1115-
else:
1116-
while not Q.empty() and ((ai <= values[Q.back()]) or
1117-
values[Q.back()] != values[Q.back()]):
1118-
Q.pop_back()
1119-
Q.push_back(k)
1120-
W.push_back(k)
1121-
1122-
# Discard entries outside and left of current window
1123-
while not Q.empty() and Q.front() <= starti[i] - 1:
1124-
Q.pop_front()
1125-
while not W.empty() and W.front() <= starti[i] - 1:
1126-
remove_mm(values[W.front()], &nobs)
1127-
W.pop_front()
1128-
1129-
# Save output based on index in input value array
1130-
if not Q.empty() and curr_win_size > 0:
1131-
output[i] = calc_mm(minp, nobs, values[Q.front()])
1132-
else:
1133-
output[i] = NaN
1134-
1135-
return output
1189+
return roll_min_max(values, start, end, minp, is_max=0)
11361190

11371191
# ----------------------------------------------------------------------
11381192
# Rolling first, last

pandas/bug_hunters/README.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Bug Hunters
2+
3+
## What Is It?
4+
Write your scripts that run a series of nondterministic tests "forever", and output any bugs found.
5+
6+
## But We Have Unit Tests for That (pytest)?
7+
Not quite!
8+
- Unit tests are only designed to run a handful of known tests, and are just a "sanity check" before letting your changes in. Pytest is also a deterministic set that is designed to, once passed, keep passing until a breaking change is made. Bug hunters, on the other hand, are designed to keep hunting for new bugs.
9+
- Unit tests are supposed to finish in finite time (to say the least!), and have to cover the entire codebase; therefore by design they cannot be very extensive or be able to cover all thinkable and untinkable inputs (which is what we expect from bug hunters).
10+
- You will simply not be able to commit a typical bug-hunter script into the "tests" directory: a typical bug hunter script has patterns (by design) that will violate pre-commit checks, namely random number generators without a seed, to promote nondeterminism.
11+
12+
## Good Tone Conventions
13+
This is a new directory, and is not covered by specialized pre-commit checks, so it is a good idea to follow these basic recommendations:
14+
- Have code in your bug hunter script that for a failed test will present (e.g. print or save to a file) all the inputs necessary to reproduce the behavior.
15+
- Try to reuse as much code as possible. Supposedly whatever code is written here can appear in unit-tests as well. One way to reuse it is to define your basic test structure in pytest (the "teats" directory), and reuse it here with imports.

0 commit comments

Comments
 (0)