Skip to content

Commit 084448d

Browse files
committed
BUG: datetime rolling min/max segfaults when closed=left (pandas-dev#21704)
User reported that `df.rolling(to_offset('3D'), closed='left').max()` segfaults when df has a datetime index. The bug was in PR pandas-dev#19549. In that PR, in https://github.com/pandas-dev/pandas/blame/master/pandas/_libs/window.pyx#L1268 `i` is initialized to `endi[0]`, which is 0 when `closed=left`. So in the next line when it tries to set `output[i-1]` it goes out of bounds. In addition, there are 2 more bugs in the `roll_min_max` code. The second bug is that for variable size windows, the `nobs` is never updated when elements leave the window. The third bug is at the end of the fixed window where all output elements up to `minp` are initialized to 0 if the input is not float. This PR fixes these three bugs, at the cost of casting the output array to floating point even if the input is integer. This is less than ideal if the output has no NaNs, but is consistent with roll_sum behavior.
1 parent 5d0daa0 commit 084448d

File tree

3 files changed

+201
-97
lines changed

3 files changed

+201
-97
lines changed

doc/source/whatsnew/v0.24.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,9 @@ Groupby/Resample/Rolling
398398
-
399399
-
400400

401+
- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'` and a
402+
datetime-like index leading to incorrect results and also segfault. (:issue:`21704`)
403+
401404
Sparse
402405
^^^^^^
403406

pandas/_libs/window.pyx

+144-97
Original file line numberDiff line numberDiff line change
@@ -1218,141 +1218,188 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp,
12181218
Moving min/max of 1d array of any numeric type along axis=0
12191219
ignoring NaNs.
12201220
"""
1221-
12221221
cdef:
1223-
numeric ai
1224-
bint is_variable, should_replace
1225-
int64_t N, i, removed, window_i
1226-
Py_ssize_t nobs = 0
1227-
deque Q[int64_t]
12281222
ndarray[int64_t] starti, endi
1229-
ndarray[numeric, ndim=1] output
1230-
cdef:
1231-
int64_t* death
1232-
numeric* ring
1233-
numeric* minvalue
1234-
numeric* end
1235-
numeric* last
1236-
1237-
cdef:
1238-
cdef numeric r
1223+
int64_t N
1224+
bint is_variable
12391225

12401226
starti, endi, N, win, minp, is_variable = get_window_indexer(
12411227
input, win,
12421228
minp, index, closed)
12431229

1244-
output = np.empty(N, dtype=input.dtype)
1230+
if is_variable:
1231+
return _roll_min_max_variable(input, starti, endi, N, win, minp,
1232+
is_max)
1233+
else:
1234+
return _roll_min_max_fixed(input, starti, endi, N, win, minp, is_max)
1235+
12451236

1237+
cdef _roll_min_max_variable(ndarray[numeric] input,
1238+
ndarray[int64_t] starti,
1239+
ndarray[int64_t] endi,
1240+
int64_t N,
1241+
int64_t win,
1242+
int64_t minp,
1243+
bint is_max):
1244+
cdef:
1245+
numeric ai
1246+
int64_t i, close_offset, curr_win_size
1247+
Py_ssize_t nobs = 0
1248+
deque Q[int64_t] # min/max always the front
1249+
deque W[int64_t] # track the whole window for nobs compute
1250+
ndarray[double_t, ndim=1] output
1251+
1252+
output = np.empty(N, dtype=float)
12461253
Q = deque[int64_t]()
1254+
W = deque[int64_t]()
12471255

1248-
if is_variable:
1256+
with nogil:
12491257

1250-
with nogil:
1258+
# This is using a modified version of the C++ code in this
1259+
# SO post: http://bit.ly/2nOoHlY
1260+
# The original impl didn't deal with variable window sizes
1261+
# So the code was optimized for that
12511262

1252-
# This is using a modified version of the C++ code in this
1253-
# SO post: http://bit.ly/2nOoHlY
1254-
# The original impl didn't deal with variable window sizes
1255-
# So the code was optimized for that
1263+
for i from starti[0] <= i < endi[0]:
1264+
ai = init_mm(input[i], &nobs, is_max)
12561265

1257-
for i from starti[0] <= i < endi[0]:
1258-
ai = init_mm(input[i], &nobs, is_max)
1266+
# Discard previous entries if we find new min or max
1267+
if is_max:
1268+
while not Q.empty() and ((ai >= input[Q.back()]) or
1269+
(input[Q.back()] != input[Q.back()])):
1270+
Q.pop_back()
1271+
else:
1272+
while not Q.empty() and ((ai <= input[Q.back()]) or
1273+
(input[Q.back()] != input[Q.back()])):
1274+
Q.pop_back()
1275+
Q.push_back(i)
1276+
W.push_back(i)
1277+
1278+
# if right is open then the first window is empty
1279+
close_offset = 0 if endi[0] > starti[0] else 1
1280+
1281+
for i in range(endi[0], endi[N-1]):
1282+
if not Q.empty():
1283+
output[i-1+close_offset] = calc_mm(
1284+
minp, nobs, input[Q.front()])
1285+
else:
1286+
output[i-1+close_offset] = NaN
12591287

1260-
if is_max:
1261-
while not Q.empty() and ai >= input[Q.back()]:
1262-
Q.pop_back()
1263-
else:
1264-
while not Q.empty() and ai <= input[Q.back()]:
1265-
Q.pop_back()
1266-
Q.push_back(i)
1288+
ai = init_mm(input[i], &nobs, is_max)
1289+
1290+
# Discard previous entries if we find new min or max
1291+
if is_max:
1292+
while not Q.empty() and ((ai >= input[Q.back()]) or
1293+
(input[Q.back()] != input[Q.back()])):
1294+
Q.pop_back()
1295+
else:
1296+
while not Q.empty() and ((ai <= input[Q.back()]) or
1297+
(input[Q.back()] != input[Q.back()])):
1298+
Q.pop_back()
12671299

1268-
for i from endi[0] <= i < N:
1269-
output[i-1] = calc_mm(minp, nobs, input[Q.front()])
1300+
# Maintain window/nobs retention
1301+
curr_win_size = endi[i + close_offset] - starti[i + close_offset]
1302+
while not Q.empty() and Q.front() <= i - curr_win_size:
1303+
Q.pop_front()
1304+
while not W.empty() and W.front() <= i - curr_win_size:
1305+
remove_mm(input[W.front()], &nobs)
1306+
W.pop_front()
12701307

1271-
ai = init_mm(input[i], &nobs, is_max)
1308+
Q.push_back(i)
1309+
W.push_back(i)
12721310

1273-
if is_max:
1274-
while not Q.empty() and ai >= input[Q.back()]:
1275-
Q.pop_back()
1276-
else:
1277-
while not Q.empty() and ai <= input[Q.back()]:
1278-
Q.pop_back()
1311+
output[N-1] = calc_mm(minp, nobs, input[Q.front()])
12791312

1280-
while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]):
1281-
Q.pop_front()
1313+
return output
12821314

1283-
Q.push_back(i)
12841315

1285-
output[N-1] = calc_mm(minp, nobs, input[Q.front()])
1316+
cdef _roll_min_max_fixed(ndarray[numeric] input,
1317+
ndarray[int64_t] starti,
1318+
ndarray[int64_t] endi,
1319+
int64_t N,
1320+
int64_t win,
1321+
int64_t minp,
1322+
bint is_max):
1323+
cdef:
1324+
numeric ai
1325+
bint should_replace
1326+
int64_t i, removed, window_i,
1327+
Py_ssize_t nobs = 0
1328+
int64_t* death
1329+
numeric* ring
1330+
numeric* minvalue
1331+
numeric* end
1332+
numeric* last
1333+
ndarray[double_t, ndim=1] output
12861334

1287-
else:
1288-
# setup the rings of death!
1289-
ring = <numeric *>malloc(win * sizeof(numeric))
1290-
death = <int64_t *>malloc(win * sizeof(int64_t))
1291-
1292-
end = ring + win
1293-
last = ring
1294-
minvalue = ring
1295-
ai = input[0]
1296-
minvalue[0] = init_mm(input[0], &nobs, is_max)
1297-
death[0] = win
1298-
nobs = 0
1335+
output = np.empty(N, dtype=float)
1336+
# setup the rings of death!
1337+
ring = <numeric *>malloc(win * sizeof(numeric))
1338+
death = <int64_t *>malloc(win * sizeof(int64_t))
1339+
1340+
end = ring + win
1341+
last = ring
1342+
minvalue = ring
1343+
ai = input[0]
1344+
minvalue[0] = init_mm(input[0], &nobs, is_max)
1345+
death[0] = win
1346+
nobs = 0
12991347

1300-
with nogil:
1348+
with nogil:
13011349

1302-
for i in range(N):
1303-
ai = init_mm(input[i], &nobs, is_max)
1350+
for i in range(N):
1351+
ai = init_mm(input[i], &nobs, is_max)
13041352

1305-
if i >= win:
1306-
remove_mm(input[i - win], &nobs)
1353+
if i >= win:
1354+
remove_mm(input[i - win], &nobs)
13071355

1308-
if death[minvalue - ring] == i:
1309-
minvalue = minvalue + 1
1310-
if minvalue >= end:
1311-
minvalue = ring
1356+
if death[minvalue - ring] == i:
1357+
minvalue = minvalue + 1
1358+
if minvalue >= end:
1359+
minvalue = ring
13121360

1313-
if is_max:
1314-
should_replace = ai >= minvalue[0]
1315-
else:
1316-
should_replace = ai <= minvalue[0]
1317-
if should_replace:
1361+
if is_max:
1362+
should_replace = ai >= minvalue[0]
1363+
else:
1364+
should_replace = ai <= minvalue[0]
1365+
if should_replace:
13181366

1319-
minvalue[0] = ai
1320-
death[minvalue - ring] = i + win
1321-
last = minvalue
1367+
minvalue[0] = ai
1368+
death[minvalue - ring] = i + win
1369+
last = minvalue
13221370

1323-
else:
1371+
else:
13241372

1373+
if is_max:
1374+
should_replace = last[0] <= ai
1375+
else:
1376+
should_replace = last[0] >= ai
1377+
while should_replace:
1378+
if last == ring:
1379+
last = end
1380+
last -= 1
13251381
if is_max:
13261382
should_replace = last[0] <= ai
13271383
else:
13281384
should_replace = last[0] >= ai
1329-
while should_replace:
1330-
if last == ring:
1331-
last = end
1332-
last -= 1
1333-
if is_max:
1334-
should_replace = last[0] <= ai
1335-
else:
1336-
should_replace = last[0] >= ai
13371385

1338-
last += 1
1339-
if last == end:
1340-
last = ring
1341-
last[0] = ai
1342-
death[last - ring] = i + win
1386+
last += 1
1387+
if last == end:
1388+
last = ring
1389+
last[0] = ai
1390+
death[last - ring] = i + win
13431391

1344-
output[i] = calc_mm(minp, nobs, minvalue[0])
1392+
output[i] = calc_mm(minp, nobs, minvalue[0])
13451393

1346-
for i in range(minp - 1):
1347-
if numeric in cython.floating:
1348-
output[i] = NaN
1349-
else:
1350-
output[i] = 0
1394+
for i in range(minp - 1):
1395+
if numeric in cython.floating:
1396+
output[i] = NaN
1397+
else:
1398+
output[i] = 0
13511399

1352-
free(ring)
1353-
free(death)
1400+
free(ring)
1401+
free(death)
13541402

1355-
# print("output: {0}".format(output))
13561403
return output
13571404

13581405

pandas/tests/test_window.py

+54
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,60 @@ def test_closed(self):
464464
with pytest.raises(ValueError):
465465
df.rolling(window=3, closed='neither')
466466

467+
@pytest.mark.parametrize("input_dtype", ['int', 'float'])
468+
@pytest.mark.parametrize("func,closed,expected", [
469+
('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
470+
('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
471+
('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
472+
('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
473+
('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
474+
('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
475+
('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
476+
('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8])
477+
])
478+
def test_closed_min_max_datetime(self, input_dtype,
479+
func, closed,
480+
expected):
481+
# see gh-21704
482+
ser = pd.Series(data=np.arange(10).astype(input_dtype),
483+
index=pd.date_range('2000', periods=10))
484+
485+
result = getattr(ser.rolling('3D', closed=closed), func)()
486+
expected = pd.Series(expected, index=ser.index)
487+
tm.assert_series_equal(result, expected)
488+
489+
def test_closed_uneven(self):
490+
# see gh-21704
491+
ser = pd.Series(data=np.arange(10),
492+
index=pd.date_range('2000', periods=10))
493+
494+
# uneven
495+
ser = ser.drop(index=ser.index[[1, 5]])
496+
result = ser.rolling('3D', closed='left').min()
497+
expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6],
498+
index=ser.index)
499+
tm.assert_series_equal(result, expected)
500+
501+
@pytest.mark.parametrize("func,closed,expected", [
502+
('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
503+
('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]),
504+
('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
505+
('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]),
506+
('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]),
507+
('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]),
508+
('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]),
509+
('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan])
510+
])
511+
def test_closed_min_max_minp(self, func, closed, expected):
512+
# see gh-21704
513+
ser = pd.Series(data=np.arange(10),
514+
index=pd.date_range('2000', periods=10))
515+
ser[ser.index[-3:]] = np.nan
516+
result = getattr(ser.rolling('3D', min_periods=2, closed=closed),
517+
func)()
518+
expected = pd.Series(expected, index=ser.index)
519+
tm.assert_series_equal(result, expected)
520+
467521
@pytest.mark.parametrize('roller', ['1s', 1])
468522
def tests_empty_df_rolling(self, roller):
469523
# GH 15819 Verifies that datetime and integer rolling windows can be

0 commit comments

Comments
 (0)