diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 034a56b2ac0cb..798e414b3e60c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -398,6 +398,9 @@ Groupby/Resample/Rolling - - +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) + Sparse ^^^^^^ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 9e704a9bd8d3f..bd6cd476595f3 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1218,141 +1218,188 @@ cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs. """ - cdef: - numeric ai - bint is_variable, should_replace - int64_t N, i, removed, window_i - Py_ssize_t nobs = 0 - deque Q[int64_t] ndarray[int64_t] starti, endi - ndarray[numeric, ndim=1] output - cdef: - int64_t* death - numeric* ring - numeric* minvalue - numeric* end - numeric* last - - cdef: - cdef numeric r + int64_t N + bint is_variable starti, endi, N, win, minp, is_variable = get_window_indexer( input, win, minp, index, closed) - output = np.empty(N, dtype=input.dtype) + if is_variable: + return _roll_min_max_variable(input, starti, endi, N, win, minp, + is_max) + else: + return _roll_min_max_fixed(input, starti, endi, N, win, minp, is_max) + +cdef _roll_min_max_variable(ndarray[numeric] input, + ndarray[int64_t] starti, + ndarray[int64_t] endi, + int64_t N, + int64_t win, + int64_t minp, + bint is_max): + cdef: + numeric ai + int64_t i, close_offset, curr_win_size + Py_ssize_t nobs = 0 + deque Q[int64_t] # min/max always the front + deque W[int64_t] # track the whole window for nobs compute + ndarray[double_t, ndim=1] output + + output = np.empty(N, dtype=float) Q = deque[int64_t]() + W = deque[int64_t]() - if is_variable: + with nogil: - with nogil: + # This is using a modified version of the C++ code in this + # SO post: http://bit.ly/2nOoHlY + # The original impl didn't deal with variable window sizes + # So the code was optimized for that - # This is using a modified version of the C++ code in this - # SO post: http://bit.ly/2nOoHlY - # The original impl didn't deal with variable window sizes - # So the code was optimized for that + for i from starti[0] <= i < endi[0]: + ai = init_mm(input[i], &nobs, is_max) - for i from starti[0] <= i < endi[0]: - ai = init_mm(input[i], &nobs, is_max) + # Discard previous entries if we find new min or max + if is_max: + while not Q.empty() and ((ai >= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + else: + while not Q.empty() and ((ai <= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + Q.push_back(i) + W.push_back(i) + + # if right is open then the first window is empty + close_offset = 0 if endi[0] > starti[0] else 1 + + for i in range(endi[0], endi[N-1]): + if not Q.empty(): + output[i-1+close_offset] = calc_mm( + minp, nobs, input[Q.front()]) + else: + output[i-1+close_offset] = NaN - if is_max: - while not Q.empty() and ai >= input[Q.back()]: - Q.pop_back() - else: - while not Q.empty() and ai <= input[Q.back()]: - Q.pop_back() - Q.push_back(i) + ai = init_mm(input[i], &nobs, is_max) + + # Discard previous entries if we find new min or max + if is_max: + while not Q.empty() and ((ai >= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() + else: + while not Q.empty() and ((ai <= input[Q.back()]) or + (input[Q.back()] != input[Q.back()])): + Q.pop_back() - for i from endi[0] <= i < N: - output[i-1] = calc_mm(minp, nobs, input[Q.front()]) + # Maintain window/nobs retention + curr_win_size = endi[i + close_offset] - starti[i + close_offset] + while not Q.empty() and Q.front() <= i - curr_win_size: + Q.pop_front() + while not W.empty() and W.front() <= i - curr_win_size: + remove_mm(input[W.front()], &nobs) + W.pop_front() - ai = init_mm(input[i], &nobs, is_max) + Q.push_back(i) + W.push_back(i) - if is_max: - while not Q.empty() and ai >= input[Q.back()]: - Q.pop_back() - else: - while not Q.empty() and ai <= input[Q.back()]: - Q.pop_back() + output[N-1] = calc_mm(minp, nobs, input[Q.front()]) - while not Q.empty() and Q.front() <= i - (endi[i] - starti[i]): - Q.pop_front() + return output - Q.push_back(i) - output[N-1] = calc_mm(minp, nobs, input[Q.front()]) +cdef _roll_min_max_fixed(ndarray[numeric] input, + ndarray[int64_t] starti, + ndarray[int64_t] endi, + int64_t N, + int64_t win, + int64_t minp, + bint is_max): + cdef: + numeric ai + bint should_replace + int64_t i, removed, window_i, + Py_ssize_t nobs = 0 + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + ndarray[double_t, ndim=1] output - else: - # setup the rings of death! - ring = malloc(win * sizeof(numeric)) - death = malloc(win * sizeof(int64_t)) - - end = ring + win - last = ring - minvalue = ring - ai = input[0] - minvalue[0] = init_mm(input[0], &nobs, is_max) - death[0] = win - nobs = 0 + output = np.empty(N, dtype=float) + # setup the rings of death! + ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring + minvalue = ring + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 - with nogil: + with nogil: - for i in range(N): - ai = init_mm(input[i], &nobs, is_max) + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) - if i >= win: - remove_mm(input[i - win], &nobs) + if i >= win: + remove_mm(input[i - win], &nobs) - if death[minvalue - ring] == i: - minvalue = minvalue + 1 - if minvalue >= end: - minvalue = ring + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring - if is_max: - should_replace = ai >= minvalue[0] - else: - should_replace = ai <= minvalue[0] - if should_replace: + if is_max: + should_replace = ai >= minvalue[0] + else: + should_replace = ai <= minvalue[0] + if should_replace: - minvalue[0] = ai - death[minvalue - ring] = i + win - last = minvalue + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue - else: + else: + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 if is_max: should_replace = last[0] <= ai else: should_replace = last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - if is_max: - should_replace = last[0] <= ai - else: - should_replace = last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i + win + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win - output[i] = calc_mm(minp, nobs, minvalue[0]) + output[i] = calc_mm(minp, nobs, minvalue[0]) - for i in range(minp - 1): - if numeric in cython.floating: - output[i] = NaN - else: - output[i] = 0 + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 - free(ring) - free(death) + free(ring) + free(death) - # print("output: {0}".format(output)) return output diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 78d1fa84cc5db..14966177978f4 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -464,6 +464,60 @@ def test_closed(self): with pytest.raises(ValueError): df.rolling(window=3, closed='neither') + @pytest.mark.parametrize("input_dtype", ['int', 'float']) + @pytest.mark.parametrize("func,closed,expected", [ + ('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]) + ]) + def test_closed_min_max_datetime(self, input_dtype, + func, closed, + expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10).astype(input_dtype), + index=pd.date_range('2000', periods=10)) + + result = getattr(ser.rolling('3D', closed=closed), func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + + def test_closed_uneven(self): + # see gh-21704 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + + # uneven + ser = ser.drop(index=ser.index[[1, 5]]) + result = ser.rolling('3D', closed='left').min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], + index=ser.index) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func,closed,expected", [ + ('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]) + ]) + def test_closed_min_max_minp(self, func, closed, expected): + # see gh-21704 + ser = pd.Series(data=np.arange(10), + index=pd.date_range('2000', periods=10)) + ser[ser.index[-3:]] = np.nan + result = getattr(ser.rolling('3D', min_periods=2, closed=closed), + func)() + expected = pd.Series(expected, index=ser.index) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('roller', ['1s', 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be