Skip to content

Commit 181d671

Browse files
committed
BUG: datetime rolling min/max segfaults when closed=left (pandas-dev#21704)
User reported that `df.rolling(to_offset('3D'), closed='left').max()` segfaults when df has a datetime index. The bug was in PR pandas-dev#19549. In that PR, in https://github.com/pandas-dev/pandas/blame/master/pandas/_libs/window.pyx#L1268 `i` is initialized to `endi[0]`, which is 0 when `closed=left`. So in the next line when it tries to set `output[i-1]` it goes out of bounds. In addition, there are 2 more bugs in the `roll_min_max` code. The second bug is that for variable size windows, the `nobs` is never updated when elements leave the window. The third bug is at the end of the fixed window where all output elements up to `minp` are initialized to 0 if the input is not float. This PR fixes all three of the aforementioned bugs, at the cost of casting the output array to floating point even if the input is integer. This is less than ideal if the output has no NaNs, but is still consistent with roll_sum behavior.
1 parent 5d0daa0 commit 181d671

File tree

3 files changed

+88
-21
lines changed

3 files changed

+88
-21
lines changed

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,9 @@ Groupby/Resample/Rolling
398398
-
399399
-
400400

401+
- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'` and a
402+
datetime-like index leading to incorrect results and also segfault. (:issue:`21704`)
403+
401404
Sparse
402405
^^^^^^
403406

pandas/_libs/window.pyx

+42-21
Original file line numberDiff line numberDiff line change
@@ -1222,11 +1222,12 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
12221222
cdef:
12231223
numeric ai
12241224
bint is_variable, should_replace
1225-
int64_t N, i, removed, window_i
1225+
int64_t N, i, removed, window_i, close_offset
12261226
Py_ssize_t nobs = 0
12271227
deque Q[int64_t]
1228+
deque W[int64_t]
12281229
ndarray[int64_t] starti, endi
1229-
ndarray[numeric, ndim=1] output
1230+
ndarray[double_t, ndim=1] output
12301231
cdef:
12311232
int64_t* death
12321233
numeric* ring
@@ -1241,49 +1242,69 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
12411242
input, win,
12421243
minp, index, closed)
12431244

1244-
output = np.empty(N, dtype=input.dtype)
1245+
output = np.empty(N, dtype=float)
12451246

1247+
# The front should always be the minimum in the window
12461248
Q = deque[int64_t]()
12471249

12481250
if is_variable:
1249-
1251+
# need this to maintain the window for nobs calc
1252+
W = deque[int64_t]()
12501253
with nogil:
12511254

12521255
# This is using a modified version of the C++ code in this
12531256
# SO post: http://bit.ly/2nOoHlY
12541257
# The original impl didn't deal with variable window sizes
12551258
# So the code was optimized for that
12561259

1257-
for i from starti[0] <= i < endi[0]:
1258-
ai = init_mm(input[i], &nobs, is_max)
1259-
1260-
if is_max:
1261-
while not Q.empty() and ai >= input[Q.back()]:
1262-
Q.pop_back()
1263-
else:
1264-
while not Q.empty() and ai <= input[Q.back()]:
1265-
Q.pop_back()
1266-
Q.push_back(i)
1267-
1268-
for i from endi[0] <= i < N:
1269-
output[i-1] = calc_mm(minp, nobs, input[Q.front()])
1260+
if endi[0] > starti[0]:
1261+
# right is closed
1262+
for i from starti[0] <= i < endi[0]:
1263+
ai = init_mm(input[i], &nobs, is_max)
12701264

1265+
if is_max:
1266+
while not Q.empty() and ai >= input[Q.back()]:
1267+
Q.pop_back()
1268+
else:
1269+
while not Q.empty() and ai <= input[Q.back()]:
1270+
Q.pop_back()
1271+
Q.push_back(i)
1272+
W.push_back(i)
1273+
output[0] = calc_mm(minp, nobs, input[Q.front()])
1274+
close_offset = 0
1275+
else:
1276+
init_mm(input[endi[0]], &nobs, is_max)
1277+
Q.push_back(endi[0])
1278+
W.push_back(endi[0])
1279+
# if right is open then the first output is always NaN
1280+
output[0] = NaN
1281+
close_offset = 1
1282+
1283+
for i from endi[close_offset] <= i < N:
12711284
ai = init_mm(input[i], &nobs, is_max)
12721285

1286+
# Discard previous entries if we find new min or max
12731287
if is_max:
12741288
while not Q.empty() and ai >= input[Q.back()]:
12751289
Q.pop_back()
12761290
else:
12771291
while not Q.empty() and ai <= input[Q.back()]:
12781292
Q.pop_back()
12791293

1280-
while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]):
1294+
# Remove elements leaving the window
1295+
while not Q.empty() and Q.front() <= (i - (endi[i] - starti[i])
1296+
- close_offset):
12811297
Q.pop_front()
12821298

1283-
Q.push_back(i)
1284-
1285-
output[N-1] = calc_mm(minp, nobs, input[Q.front()])
1299+
# Update the nobs
1300+
while not W.empty() and W.front() <= (i - (endi[i] - starti[i])
1301+
- close_offset):
1302+
remove_mm(input[W.front()], &nobs)
1303+
W.pop_front()
12861304

1305+
Q.push_back(i)
1306+
W.push_back(i)
1307+
output[i] = calc_mm(minp, nobs, input[Q.front()])
12871308
else:
12881309
# setup the rings of death!
12891310
ring = <numeric *>malloc(win * sizeof(numeric))

pandas/tests/test_window.py

+43
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,49 @@ def test_closed(self):
464464
with pytest.raises(ValueError):
465465
df.rolling(window=3, closed='neither')
466466

467+
def test_closed_min_max_datetime(self):
468+
# see gh-21704
469+
ser = pd.Series(data=np.arange(10),
470+
index=pd.date_range('2000', periods=10))
471+
472+
result = ser.rolling('3D', closed='right').min()
473+
expected = pd.Series([0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
474+
index=ser.index)
475+
tm.assert_series_equal(result, expected)
476+
477+
result = ser.rolling('3D', closed='both').min()
478+
expected = pd.Series([0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6],
479+
index=ser.index)
480+
tm.assert_series_equal(result, expected)
481+
482+
result = ser.rolling('3D', closed='neither').min()
483+
expected = pd.Series([np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7],
484+
index=ser.index)
485+
tm.assert_series_equal(result, expected)
486+
487+
# shouldn't segfault
488+
result = ser.rolling('3D', closed='left').min()
489+
expected = pd.Series([np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6],
490+
index=ser.index)
491+
tm.assert_series_equal(result, expected)
492+
493+
# uneven
494+
ser = ser.drop(index=ser.index[[1, 5]])
495+
result = ser.rolling('3D', closed='left').min()
496+
expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6],
497+
index=ser.index)
498+
tm.assert_series_equal(result, expected)
499+
500+
def test_closed_min_max_minp(self):
501+
# see gh-21704
502+
ser = pd.Series(data=np.arange(10),
503+
index=pd.date_range('2000', periods=10))
504+
ser[ser.index[-3:]] = np.nan
505+
result = ser.rolling('3D', min_periods=2, closed='left').min()
506+
expected = pd.Series([np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan],
507+
index=ser.index)
508+
tm.assert_series_equal(result, expected)
509+
467510
@pytest.mark.parametrize('roller', ['1s', 1])
468511
def tests_empty_df_rolling(self, roller):
469512
# GH 15819 Verifies that datetime and integer rolling windows can be

0 commit comments

Comments
 (0)