From 875b66b226bfeea8a22912f5924e83c93a30dcbf Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 16 Oct 2020 20:57:44 +0200 Subject: [PATCH 1/6] BUG: Bug in quantile() and median() returned wrong result for non monotonic window borders --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 91 ++++++++++++++++-------- pandas/tests/window/test_base_indexer.py | 28 ++++++++ 3 files changed, 90 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9fc094330fb36..0268bd3f4ca3b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -462,6 +462,7 @@ Groupby/resample/rolling - Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) - Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`). - Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) +- Bug in :meth:`Rolling.median()` and :meth:`Rolling.quantile()` returned wrong values for ``CustomIndexer`` with non-monotonic starting or ending points for windows (:issue:`37153`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index fe97c048d4472..2c4e03694149d 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -698,22 +698,38 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, else: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - skiplist_remove(sl, val) - nobs -= 1 - + if end[i - 1] > e: + for j in range(e, end[i - 1]): + val = values[j] + if notnan(val): + skiplist_remove(sl, val) + nobs -= 1 + else: + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + # if start was shifted back, add these again + if start[i -1] > s: + for j in range(s, start[i -1]): + val = values[j] + if notnan(val): + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + else: + # calculate deletes if start is shifted forward + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(sl, val) + nobs -= 1 if nobs >= minp: midpoint = (nobs / 2) if nobs % 2: @@ -955,20 +971,35 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, skiplist_insert(skiplist, val) else: - - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - nobs += 1 - skiplist_insert(skiplist, val) - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - skiplist_remove(skiplist, val) - nobs -= 1 + # Remove values again if end was moved back + if end[i - 1] > e: + for j in range(e, end[i - 1]): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 + else: + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + skiplist_insert(skiplist, val) + + # if start was shifted back, add these again + if start[i -1] > s: + for j in range(s, start[i -1]): + val = values[j] + if notnan(val): + nobs += 1 + skiplist_insert(skiplist, val) + else: + # calculate deletes if start is shifted forward + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 if nobs >= minp: if nobs == 1: diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 7f2d58effe1ae..1723330ec40e1 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -263,3 +263,31 @@ def test_fixed_forward_indexer_count(): result = df.rolling(window=indexer, min_periods=0).count() expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("end_value", "values"), [(1, [0.0, 1, 1, 3, 2]), (-1, [0.0, 1, 0, 3, 1])] +) +@pytest.mark.parametrize(("func", "args"), [("median", []), ("quantile", [0.5])]) +def test_indexer_quantile_sum(end_value, values, func, args): + # GH 37153 + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = max(i + end_value, 1) + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + use_expanding = [True, False, True, False, True] + df = DataFrame({"values": range(5)}) + + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + result = getattr(df.rolling(indexer), func)(*args) + expected = DataFrame({"values": values}) + tm.assert_frame_equal(result, expected) From 9195352a3dff8bfb253f6580c9ceaa6d40a5fc60 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 16 Oct 2020 21:33:24 +0200 Subject: [PATCH 2/6] Adjust whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0268bd3f4ca3b..6d57765f52865 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -462,7 +462,7 @@ Groupby/resample/rolling - Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) - Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`). - Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) -- Bug in :meth:`Rolling.median()` and :meth:`Rolling.quantile()` returned wrong values for ``CustomIndexer`` with non-monotonic starting or ending points for windows (:issue:`37153`) +- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) Reshaping ^^^^^^^^^ From 540df125e006288ba9f05b28c860fc76ee3e3b15 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 16 Oct 2020 23:40:04 +0200 Subject: [PATCH 3/6] Use is monotonic instead --- pandas/_libs/window/aggregations.pyx | 73 +++++++++++++++------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3cebd3c1818f1..e3fc6277d3b5b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -155,6 +155,8 @@ def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, e = end[i] if i == 0 or not is_monotonic_bounds: + with gil: + print(sum_x) # setup @@ -669,6 +671,11 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, int64_t nobs = 0, N = len(values), s, e, win int midpoint ndarray[float64_t] output + bint is_monotonic_increasing_bounds + + is_monotonic_increasing_bounds = is_monotonic_start_end_bounds( + start, end + ) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs @@ -688,7 +695,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0: + if i == 0 or not is_monotonic_increasing_bounds: # setup for j in range(s, e): @@ -701,38 +708,21 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, else: - if end[i - 1] > e: - for j in range(e, end[i - 1]): - val = values[j] - if notnan(val): - skiplist_remove(sl, val) - nobs -= 1 - else: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break - # if start was shifted back, add these again - if start[i -1] > s: - for j in range(s, start[i -1]): - val = values[j] - if notnan(val): - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - else: - # calculate deletes if start is shifted forward - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - skiplist_remove(sl, val) - nobs -= 1 + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(sl, val) + nobs -= 1 if nobs >= minp: midpoint = (nobs / 2) if nobs % 2: @@ -747,6 +737,10 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = res + if not is_monotonic_increasing_bounds: + nobs = 0 + sl = skiplist_init(win) + skiplist_destroy(sl) if err: raise MemoryError("skiplist_insert failed") @@ -934,7 +928,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, cdef: float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), win Py_ssize_t idx ndarray[float64_t] output float64_t vlow, vhigh @@ -949,6 +943,9 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, except KeyError: raise ValueError(f"Interpolation '{interpolation}' is not supported") + is_monotonic_increasing_bounds = is_monotonic_start_end_bounds( + start, end + ) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs output = np.empty(N, dtype=float) @@ -965,8 +962,13 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, for i in range(0, N): s = start[i] e = end[i] + with gil: + print(nobs) - if i == 0: + if i == 0 or not is_monotonic_increasing_bounds: + if not is_monotonic_increasing_bounds: + nobs = 0 + skiplist = skiplist_init(win) # setup for j in range(s, e): @@ -1005,7 +1007,8 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, if notnan(val): skiplist_remove(skiplist, val) nobs -= 1 - + with gil: + print(nobs) if nobs >= minp: if nobs == 1: # Single value in skip list From abaca8248b322b9456a8c17e07d549eb6be1b4c3 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 16 Oct 2020 23:43:27 +0200 Subject: [PATCH 4/6] Remove unncecessary code --- pandas/_libs/window/aggregations.pyx | 46 ++++++++-------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index e3fc6277d3b5b..bef274051cb4a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -155,8 +155,6 @@ def roll_sum(ndarray[float64_t] values, ndarray[int64_t] start, e = end[i] if i == 0 or not is_monotonic_bounds: - with gil: - print(sum_x) # setup @@ -978,37 +976,19 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, skiplist_insert(skiplist, val) else: - # Remove values again if end was moved back - if end[i - 1] > e: - for j in range(e, end[i - 1]): - val = values[j] - if notnan(val): - skiplist_remove(skiplist, val) - nobs -= 1 - else: - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - nobs += 1 - skiplist_insert(skiplist, val) - - # if start was shifted back, add these again - if start[i -1] > s: - for j in range(s, start[i -1]): - val = values[j] - if notnan(val): - nobs += 1 - skiplist_insert(skiplist, val) - else: - # calculate deletes if start is shifted forward - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - skiplist_remove(skiplist, val) - nobs -= 1 - with gil: - print(nobs) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + skiplist_insert(skiplist, val) + + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 if nobs >= minp: if nobs == 1: # Single value in skip list From 1aa8503df53cc15cdddc2444c1b7656eb3ded28d Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 16 Oct 2020 23:44:13 +0200 Subject: [PATCH 5/6] Remove unncecessary code --- pandas/_libs/window/aggregations.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index bef274051cb4a..3b53a73af5327 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -960,8 +960,6 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, for i in range(0, N): s = start[i] e = end[i] - with gil: - print(nobs) if i == 0 or not is_monotonic_increasing_bounds: if not is_monotonic_increasing_bounds: From 51be9e9a4d63c4898692e4dac8a386c5738b5777 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Oct 2020 23:14:29 +0100 Subject: [PATCH 6/6] Rename function after related pr was merged --- pandas/_libs/window/aggregations.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 1e36b8f2f90fc..b2dbf7802e6f0 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -688,7 +688,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[float64_t] output bint is_monotonic_increasing_bounds - is_monotonic_increasing_bounds = is_monotonic_start_end_bounds( + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end ) @@ -958,7 +958,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, except KeyError: raise ValueError(f"Interpolation '{interpolation}' is not supported") - is_monotonic_increasing_bounds = is_monotonic_start_end_bounds( + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end ) # we use the Fixed/Variable Indexer here as the