Skip to content

Commit 0e0bafb

Browse files
BUG: Wrong result with varying window size min/max rolling calc. (#61288)
1 parent 0fce6b4 commit 0e0bafb

File tree

5 files changed

+336
-126
lines changed

5 files changed

+336
-126
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,7 @@ Groupby/resample/rolling
810810
- Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
811811
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
812812
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
813+
- Bug in :meth:`.Series.rolling` when used with a :class:`.BaseIndexer` subclass and computing min/max (:issue:`46726`)
813814
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
814815
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
815816
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)

pandas/_libs/window/aggregations.pyx

+115-83
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
66
sqrt,
77
)
88
from libcpp.deque cimport deque
9+
from libcpp.stack cimport stack
910
from libcpp.unordered_map cimport unordered_map
1011

1112
from pandas._libs.algos cimport TiebreakEnumType
@@ -988,39 +989,29 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
988989

989990
# ----------------------------------------------------------------------
990991

991-
# Moving maximum / minimum code taken from Bottleneck
992-
# Licence at LICENSES/BOTTLENECK_LICENCE
993-
994-
995-
cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil:
996-
997-
if ai == ai:
998-
nobs[0] = nobs[0] + 1
999-
elif is_max:
1000-
ai = MINfloat64
1001-
else:
1002-
ai = MAXfloat64
1003-
1004-
return ai
1005-
1006-
1007-
cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) noexcept nogil:
1008-
""" remove a value from the mm calc """
1009-
if aold == aold:
1010-
nobs[0] = nobs[0] - 1
1011-
1012-
1013-
cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
1014-
float64_t value) noexcept nogil:
1015-
cdef:
1016-
float64_t result
992+
cdef int64_t bisect_left(
993+
deque[int64_t]& a,
994+
int64_t x,
995+
int64_t lo=0,
996+
int64_t hi=-1
997+
) nogil:
998+
"""Same as https://docs.python.org/3/library/bisect.html."""
999+
1000+
cdef int64_t mid
1001+
if hi == -1:
1002+
hi = a.size()
1003+
while lo < hi:
1004+
mid = (lo + hi) // 2
1005+
if a.at(mid) < x:
1006+
lo = mid + 1
1007+
else:
1008+
hi = mid
1009+
return lo
10171010

1018-
if nobs >= minp:
1019-
result = value
1020-
else:
1021-
result = NaN
1011+
from libc.math cimport isnan
10221012

1023-
return result
1013+
# Prior version of moving maximum / minimum code taken from Bottleneck
1014+
# Licence at LICENSES/BOTTLENECK_LICENCE
10241015

10251016

10261017
def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
@@ -1068,69 +1059,110 @@ def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
10681059
return _roll_min_max(values, start, end, minp, is_max=0)
10691060

10701061

1071-
cdef _roll_min_max(ndarray[float64_t] values,
1072-
ndarray[int64_t] starti,
1073-
ndarray[int64_t] endi,
1074-
int64_t minp,
1075-
bint is_max):
1062+
def _roll_min_max(
1063+
ndarray[float64_t] values,
1064+
ndarray[int64_t] start,
1065+
ndarray[int64_t] end,
1066+
int64_t minp,
1067+
bint is_max
1068+
):
10761069
cdef:
1077-
float64_t ai
1078-
int64_t curr_win_size, start
1079-
Py_ssize_t i, k, nobs = 0, N = len(starti)
1080-
deque Q[int64_t] # min/max always the front
1081-
deque W[int64_t] # track the whole window for nobs compute
1070+
Py_ssize_t i, i_next, k, valid_start, last_end, last_start, N = len(start)
1071+
# Indices of bounded extrema in `values`. `candidates[i]` is always increasing.
1072+
# `values[candidates[i]]` is decreasing for max and increasing for min.
1073+
deque candidates[int64_t]
1074+
# Indices of largest windows that "cover" preceding windows.
1075+
stack dominators[int64_t]
10821076
ndarray[float64_t, ndim=1] output
10831077

1078+
Py_ssize_t this_start, this_end, stash_start
1079+
int64_t q_idx
1080+
10841081
output = np.empty(N, dtype=np.float64)
1085-
Q = deque[int64_t]()
1086-
W = deque[int64_t]()
1082+
candidates = deque[int64_t]()
1083+
dominators = stack[int64_t]()
1084+
1085+
# This function was "ported" / translated from sliding_min_max()
1086+
# in /pandas/core/_numba/kernels/min_max_.py.
1087+
# (See there for credits and some comments.)
1088+
# Code translation assumptions/rules:
1089+
# - min_periods --> minp
1090+
# - deque[0] --> front()
1091+
# - deque[-1] --> back()
1092+
# - stack[-1] --> top()
1093+
# - bool(stack/deque) --> !empty()
1094+
# - deque.append() --> push_back()
1095+
# - stack.append() --> push()
1096+
# - deque.popleft --> pop_front()
1097+
# - deque.pop() --> pop_back()
10871098

10881099
with nogil:
1100+
if minp < 1:
1101+
minp = 1
1102+
1103+
if N>2:
1104+
i_next = N - 1
1105+
for i in range(N - 2, -1, -1):
1106+
if start[i_next] < start[i] \
1107+
and (
1108+
dominators.empty()
1109+
or start[dominators.top()] > start[i_next]
1110+
):
1111+
dominators.push(i_next)
1112+
i_next = i
1113+
1114+
# NaN tracking to guarantee minp
1115+
valid_start = -minp
1116+
1117+
last_end = 0
1118+
last_start = -1
10891119

1090-
# This is using a modified version of the C++ code in this
1091-
# SO post: https://stackoverflow.com/a/12239580
1092-
# The original impl didn't deal with variable window sizes
1093-
# So the code was optimized for that
1094-
1095-
# first window's size
1096-
curr_win_size = endi[0] - starti[0]
1097-
# GH 32865
1098-
# Anchor output index to values index to provide custom
1099-
# BaseIndexer support
11001120
for i in range(N):
1121+
this_start = start[i]
1122+
this_end = end[i]
11011123

1102-
curr_win_size = endi[i] - starti[i]
1103-
if i == 0:
1104-
start = starti[i]
1105-
else:
1106-
start = endi[i - 1]
1107-
1108-
for k in range(start, endi[i]):
1109-
ai = init_mm(values[k], &nobs, is_max)
1110-
# Discard previous entries if we find new min or max
1111-
if is_max:
1112-
while not Q.empty() and ((ai >= values[Q.back()]) or
1113-
values[Q.back()] != values[Q.back()]):
1114-
Q.pop_back()
1115-
else:
1116-
while not Q.empty() and ((ai <= values[Q.back()]) or
1117-
values[Q.back()] != values[Q.back()]):
1118-
Q.pop_back()
1119-
Q.push_back(k)
1120-
W.push_back(k)
1121-
1122-
# Discard entries outside and left of current window
1123-
while not Q.empty() and Q.front() <= starti[i] - 1:
1124-
Q.pop_front()
1125-
while not W.empty() and W.front() <= starti[i] - 1:
1126-
remove_mm(values[W.front()], &nobs)
1127-
W.pop_front()
1128-
1129-
# Save output based on index in input value array
1130-
if not Q.empty() and curr_win_size > 0:
1131-
output[i] = calc_mm(minp, nobs, values[Q.front()])
1124+
if (not dominators.empty() and dominators.top() == i):
1125+
dominators.pop()
1126+
1127+
if not (this_end > last_end
1128+
or (this_end == last_end and this_start >= last_start)):
1129+
raise ValueError(
1130+
"Start/End ordering requirement is violated at index {}".format(i))
1131+
1132+
if dominators.empty():
1133+
stash_start = this_start
11321134
else:
1135+
stash_start = min(this_start, start[dominators.top()])
1136+
1137+
while not candidates.empty() and candidates.front() < stash_start:
1138+
candidates.pop_front()
1139+
1140+
for k in range(last_end, this_end):
1141+
if not isnan(values[k]):
1142+
valid_start += 1
1143+
while valid_start >= 0 and isnan(values[valid_start]):
1144+
valid_start += 1
1145+
1146+
if is_max:
1147+
while (not candidates.empty()
1148+
and values[k] >= values[candidates.back()]):
1149+
candidates.pop_back()
1150+
else:
1151+
while (not candidates.empty()
1152+
and values[k] <= values[candidates.back()]):
1153+
candidates.pop_back()
1154+
candidates.push_back(k)
1155+
1156+
if candidates.empty() or this_start > valid_start:
11331157
output[i] = NaN
1158+
elif candidates.front() >= this_start:
1159+
# ^^ This is here to avoid costly bisection for fixed window sizes.
1160+
output[i] = values[candidates.front()]
1161+
else:
1162+
q_idx = bisect_left(candidates, this_start, lo=1)
1163+
output[i] = values[candidates[q_idx]]
1164+
last_end = this_end
1165+
last_start = this_start
11341166

11351167
return output
11361168

0 commit comments

Comments
 (0)