Skip to content

Commit 2925b9e

Browse files
BUG: Fix for bug pandas-dev#46726: wrong result with varying window size min/max rolling calc.
1 parent 5230845 commit 2925b9e

File tree

8 files changed

+620
-145
lines changed

8 files changed

+620
-145
lines changed

Diff for: pandas/_libs/window/aggregations.pyx

+120-77
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
66
sqrt,
77
)
88
from libcpp.deque cimport deque
9+
from libcpp.stack cimport stack
910
from libcpp.unordered_map cimport unordered_map
1011

1112
from pandas._libs.algos cimport TiebreakEnumType
@@ -991,36 +992,24 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
991992
# Moving maximum / minimum code taken from Bottleneck
992993
# Licence at LICENSES/BOTTLENECK_LICENCE
993994

995+
cdef int64_t bisect_left(
996+
deque[int64_t]& a,
997+
int64_t x,
998+
int64_t lo=0,
999+
int64_t hi=-1
1000+
) nogil:
1001+
cdef int64_t mid
1002+
if hi == -1:
1003+
hi = a.size()
1004+
while lo < hi:
1005+
mid = (lo + hi) // 2
1006+
if a.at(mid) < x:
1007+
lo = mid + 1
1008+
else:
1009+
hi = mid
1010+
return lo
9941011

995-
cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil:
996-
997-
if ai == ai:
998-
nobs[0] = nobs[0] + 1
999-
elif is_max:
1000-
ai = MINfloat64
1001-
else:
1002-
ai = MAXfloat64
1003-
1004-
return ai
1005-
1006-
1007-
cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil:
1008-
""" remove a value from the mm calc """
1009-
if aold == aold:
1010-
nobs[0] = nobs[0] - 1
1011-
1012-
1013-
cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
1014-
float64_t value) noexcept nogil:
1015-
cdef:
1016-
float64_t result
1017-
1018-
if nobs >= minp:
1019-
result = value
1020-
else:
1021-
result = NaN
1022-
1023-
return result
1012+
from libc.math cimport isnan
10241013

10251014

10261015
def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1068,69 +1057,123 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
10681057
return _roll_min_max(values, start, end, minp, is_max=0)
10691058

10701059

1071-
cdef _roll_min_max(ndarray[float64_t] values,
1072-
ndarray[int64_t] starti,
1073-
ndarray[int64_t] endi,
1074-
int64_t minp,
1075-
bint is_max):
1060+
def _roll_min_max(
1061+
ndarray[float64_t] values,
1062+
ndarray[int64_t] start,
1063+
ndarray[int64_t] end,
1064+
int64_t minp,
1065+
bint is_max
1066+
):
10761067
cdef:
1077-
float64_t ai
1078-
int64_t curr_win_size, start
1079-
Py_ssize_t i, k, nobs = 0, N = len(starti)
1080-
deque Q[int64_t] # min/max always the front
1081-
deque W[int64_t] # track the whole window for nobs compute
1068+
Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start)
1069+
deque Q[int64_t]
1070+
stack Dominators[int64_t]
10821071
ndarray[float64_t, ndim=1] output
10831072

1073+
# ideally want these in the i-loop scope
1074+
Py_ssize_t this_start, this_end, stash_start
1075+
int64_t q_idx
1076+
10841077
output = np.empty(N, dtype=np.float64)
10851078
Q = deque[int64_t]()
1086-
W = deque[int64_t]()
1079+
Dominators = stack[int64_t]()
1080+
1081+
# This function was "ported"/translated from python_sliding_min_max()
1082+
# in pandas/core/window/_min_max.py. (See there for detailed comments and credits.)
1083+
# Code translation assumptions/rules:
1084+
# - min_periods --> minp
1085+
# - deque[0] --> front()
1086+
# - deque[-1] --> back()
1087+
# - stack[-1] --> top()
1088+
# - bool(stack/deque) --> !empty()
1089+
# - deque.append() --> push_back()
1090+
# - stack.append() --> push()
1091+
# - deque.popleft --> pop_front()
1092+
# - deque.pop() --> pop_back()
10871093

10881094
with nogil:
1095+
if minp < 1:
1096+
minp = 1
1097+
1098+
if N>2:
1099+
i_next = N - 1
1100+
for i in range(N - 2, -1, -1):
1101+
if start[i_next] < start[i] \
1102+
and (
1103+
Dominators.empty()
1104+
or start[Dominators.top()] > start[i_next]
1105+
):
1106+
Dominators.push(i_next)
1107+
i_next = i
10891108

1090-
# This is using a modified version of the C++ code in this
1091-
# SO post: https://stackoverflow.com/a/12239580
1092-
# The original impl didn't deal with variable window sizes
1093-
# So the code was optimized for that
1109+
valid_start = -minp
1110+
1111+
last_end =0
1112+
last_start=-1
10941113

1095-
# first window's size
1096-
curr_win_size = endi[0] - starti[0]
1097-
# GH 32865
1098-
# Anchor output index to values index to provide custom
1099-
# BaseIndexer support
11001114
for i in range(N):
1115+
this_start = start[i]
1116+
this_end = end[i]
1117+
1118+
if (not Dominators.empty() and Dominators.top() == i):
1119+
Dominators.pop()
11011120

1102-
curr_win_size = endi[i] - starti[i]
1103-
if i == 0:
1104-
start = starti[i]
1121+
if not (this_end > last_end
1122+
or (this_end == last_end and this_start >= last_start)):
1123+
raise ValueError(
1124+
"Start/End ordering requirement is violated at index {}".format(i))
1125+
1126+
if Dominators.empty():
1127+
stash_start = this_start
11051128
else:
1106-
start = endi[i - 1]
1107-
1108-
for k in range(start, endi[i]):
1109-
ai = init_mm(values[k], &nobs, is_max)
1110-
# Discard previous entries if we find new min or max
1111-
if is_max:
1112-
while not Q.empty() and ((ai >= values[Q.back()]) or
1113-
values[Q.back()] != values[Q.back()]):
1114-
Q.pop_back()
1115-
else:
1116-
while not Q.empty() and ((ai <= values[Q.back()]) or
1117-
values[Q.back()] != values[Q.back()]):
1118-
Q.pop_back()
1119-
Q.push_back(k)
1120-
W.push_back(k)
1121-
1122-
# Discard entries outside and left of current window
1123-
while not Q.empty() and Q.front() <= starti[i] - 1:
1129+
stash_start = min(this_start, start[Dominators.top()])
1130+
1131+
while not Q.empty() and Q.front() < stash_start:
11241132
Q.pop_front()
1125-
while not W.empty() and W.front() <= starti[i] - 1:
1126-
remove_mm(values[W.front()], &nobs)
1127-
W.pop_front()
11281133

1129-
# Save output based on index in input value array
1130-
if not Q.empty() and curr_win_size > 0:
1131-
output[i] = calc_mm(minp, nobs, values[Q.front()])
1132-
else:
1134+
for k in range(last_end, this_end):
1135+
if not isnan(values[k]):
1136+
valid_start += 1
1137+
while valid_start>=0 and isnan(values[valid_start]):
1138+
valid_start += 1
1139+
1140+
# Sadly, this runs more than 15% faster than trying to use
1141+
# generic comparison functions.
1142+
# That is, I tried:
1143+
#
1144+
# | cdef inline bint le(float64_t a, float64_t b) nogil:
1145+
# | return a <= b
1146+
# | cdef inline bint ge(float64_t a, float64_t b) nogil:
1147+
# | return a >= b
1148+
# | ctypedef bint (*cmp_func_t) (float64_t a, float64_t b) nogil
1149+
# | ...
1150+
# | cmp_func_t cmp
1151+
# |
1152+
# | if is_max:
1153+
# | cmp = ge
1154+
# | else:
1155+
# | cmp = le
1156+
# and, finally
1157+
# | while not Q.empty() and cmp(values[k], values[Q.back()]):
1158+
# | Q.pop_back()
1159+
1160+
if is_max:
1161+
while not Q.empty() and values[k] >= values[Q.back()]:
1162+
Q.pop_back()
1163+
else:
1164+
while not Q.empty() and values[k] <= values[Q.back()]:
1165+
Q.pop_back()
1166+
Q.push_back(k)
1167+
1168+
if Q.empty() or this_start > valid_start:
11331169
output[i] = NaN
1170+
elif Q.front() >= this_start:
1171+
output[i] = values[Q.front()]
1172+
else:
1173+
q_idx = bisect_left(Q, this_start, lo=1)
1174+
output[i] = values[Q[q_idx]]
1175+
last_end = this_end
1176+
last_start = this_start
11341177

11351178
return output
11361179

Diff for: pandas/bug_hunters/README.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Bug Hunters
2+
3+
## What Is It?
4+
Write your scripts that run a series of nondterministic tests "forever", and output any bugs found.
5+
6+
## But We Have Unit Tests for That (pytest)?
7+
Not quite!
8+
- Unit tests are only designed to run a handful of known tests, and are just a "sanity check" before letting your changes in. Pytest is also a deterministic set that is designed to, once passed, keep passing until a breaking change is made. Bug hunters, on the other hand, are designed to keep hunting for new bugs.
9+
- Unit tests are supposed to finish in finite time (to say the least!), and have to cover the entire codebase; therefore by design they cannot be very extensive or be able to cover all thinkable and untinkable inputs (which is what we expect from bug hunters).
10+
- You will simply not be able to commit a typical bug-hunter script into the "tests" directory: a typical bug hunter script has patterns (by design) that will violate pre-commit checks, namely random number generators without a seed, to promote nondeterminism.
11+
12+
## Good Tone Conventions
13+
This is a new directory, and is not covered by specialized pre-commit checks, so it is a good idea to follow these basic recommendations:
14+
- Have code in your bug hunter script that for a failed test will present (e.g. print or save to a file) all the inputs necessary to reproduce the behavior.
15+
- Try to reuse as much code as possible. Supposedly whatever code is written here can appear in unit-tests as well. One way to reuse it is to define your basic test structure in pytest (the "teats" directory), and reuse it here with imports.

Diff for: pandas/bug_hunters/window/bh_minmax.py

+116
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
from typing import (
2+
Any,
3+
)
4+
5+
import numpy as np
6+
7+
from pandas.tests.window.test_minmax import (
8+
ArbitraryWindowIndexer,
9+
CustomLengthWindowIndexer,
10+
TestMinMax,
11+
)
12+
13+
14+
def get_name(x: Any) -> str:
15+
return x.__name__ if x else "None"
16+
17+
18+
indexers = [None, CustomLengthWindowIndexer, ArbitraryWindowIndexer]
19+
20+
result_map = {
21+
True: "PASS",
22+
False: "FAIL",
23+
}
24+
engines = ["python", "cython", "numba"]
25+
func_sides = [True, False]
26+
27+
specific_test: tuple[Any, ...] | None = None
28+
rng_state = None
29+
# specific_test = (True, "python", 41659, 443, 389, 0.15632619537481263, None)
30+
# rng_state = {'bit_generator': 'PCG64', 'state':
31+
# {'state': 277391646186114035749321587091198453320,
32+
# 'inc': 59022448233807883636838765922236075093},
33+
# 'has_uint32': 0, 'uinteger': 0}
34+
tester_rng: np.random.Generator | None
35+
if rng_state:
36+
tester_rng = np.random.default_rng()
37+
tester_rng.bit_generator.state = rng_state
38+
else:
39+
tester_rng = None
40+
41+
rng = np.random.default_rng()
42+
43+
keep_running = True
44+
tester = TestMinMax()
45+
cache = TestMinMax.Cache()
46+
while keep_running:
47+
if specific_test:
48+
(fs, eng, n, wnd_len, min_obs, na_frac, indexer_class) = specific_test
49+
engines = [eng]
50+
func_sides = [fs]
51+
else:
52+
ind_idx = rng.integers(0, 3)
53+
indexer_class = indexers[ind_idx]
54+
n = rng.integers(2000, 100000)
55+
# n = rng.integers(15,20).item()
56+
wnd_len = rng.integers(1, n // 2)
57+
min_obs = rng.integers(0, wnd_len + 1)
58+
na_frac = rng.random(1)[0]
59+
60+
# Preserve state across the same run with different engines to economize
61+
# on computing the control
62+
tester_rng_this_run = tester_rng
63+
for func_side in func_sides:
64+
for engine in engines:
65+
try:
66+
tester.test_minmax(
67+
func_side,
68+
engine,
69+
tester_rng_this_run,
70+
n,
71+
wnd_len,
72+
min_obs,
73+
na_frac,
74+
indexer_class,
75+
cache,
76+
)
77+
passed = True
78+
except AssertionError:
79+
passed = False
80+
msg_str = "{} is_max={} {} N ={} wnd ={} min_obs ={} na ={:.2f}% {}".format(
81+
result_map[passed],
82+
func_side,
83+
engine,
84+
n,
85+
wnd_len,
86+
min_obs,
87+
na_frac * 100,
88+
get_name(indexer_class),
89+
)
90+
print(msg_str)
91+
if not passed:
92+
print("")
93+
print("Bug found!")
94+
msg = "*** To reproduce the bug, paste this into the "
95+
msg += "script (around line 29): ***"
96+
print(msg)
97+
msg_str = 'specific_test = ({}, "{}", {}, {}, {}, {}, {})'.format(
98+
func_side != 0,
99+
engine,
100+
n,
101+
wnd_len,
102+
min_obs,
103+
na_frac,
104+
get_name(indexer_class),
105+
)
106+
print(msg_str)
107+
print(f"rng_state = {tester.last_rng_state}")
108+
keep_running = False
109+
break
110+
# raise AssertionError("Looks like a bug!")
111+
tester_rng_this_run = np.random.default_rng()
112+
tester_rng_this_run.bit_generator.state = tester.last_rng_state
113+
cache.ctrl.clear() # we only need it across different engines for the same run
114+
keep_running = keep_running and not specific_test
115+
if not keep_running:
116+
break

0 commit comments

Comments
 (0)