Skip to content

Commit 5a45c0d

Browse files
authored
BUG: Bug in quantile() and median() returned wrong result for non monotonic window borders (#37166)
* BUG: Bug in quantile() and median() returned wrong result for non monotonic window borders * Adjust whatsnew * Use is monotonic instead * Remove unncecessary code * Remove unncecessary code * Rename function after related pr was merged
1 parent 8324081 commit 5a45c0d

File tree

3 files changed

+47
-6
lines changed

3 files changed

+47
-6
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,7 @@ Groupby/resample/rolling
506506
- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
507507
- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
508508
- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
509+
- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
509510

510511
Reshaping
511512
^^^^^^^^^

pandas/_libs/window/aggregations.pyx

+18-6
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,11 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start,
686686
int64_t nobs = 0, N = len(values), s, e, win
687687
int midpoint
688688
ndarray[float64_t] output
689+
bint is_monotonic_increasing_bounds
690+
691+
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
692+
start, end
693+
)
689694

690695
# we use the Fixed/Variable Indexer here as the
691696
# actual skiplist ops outweigh any window computation costs
@@ -705,7 +710,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start,
705710
s = start[i]
706711
e = end[i]
707712

708-
if i == 0:
713+
if i == 0 or not is_monotonic_increasing_bounds:
709714

710715
# setup
711716
for j in range(s, e):
@@ -733,7 +738,6 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start,
733738
if notnan(val):
734739
skiplist_remove(sl, val)
735740
nobs -= 1
736-
737741
if nobs >= minp:
738742
midpoint = <int>(nobs / 2)
739743
if nobs % 2:
@@ -748,6 +752,10 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start,
748752

749753
output[i] = res
750754

755+
if not is_monotonic_increasing_bounds:
756+
nobs = 0
757+
sl = skiplist_init(<int>win)
758+
751759
skiplist_destroy(sl)
752760
if err:
753761
raise MemoryError("skiplist_insert failed")
@@ -935,7 +943,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start,
935943
cdef:
936944
float64_t val, prev, midpoint, idx_with_fraction
937945
skiplist_t *skiplist
938-
int64_t nobs = 0, i, j, s, e, N = len(values)
946+
int64_t nobs = 0, i, j, s, e, N = len(values), win
939947
Py_ssize_t idx
940948
ndarray[float64_t] output
941949
float64_t vlow, vhigh
@@ -950,6 +958,9 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start,
950958
except KeyError:
951959
raise ValueError(f"Interpolation '{interpolation}' is not supported")
952960

961+
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
962+
start, end
963+
)
953964
# we use the Fixed/Variable Indexer here as the
954965
# actual skiplist ops outweigh any window computation costs
955966
output = np.empty(N, dtype=float)
@@ -967,7 +978,10 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start,
967978
s = start[i]
968979
e = end[i]
969980

970-
if i == 0:
981+
if i == 0 or not is_monotonic_increasing_bounds:
982+
if not is_monotonic_increasing_bounds:
983+
nobs = 0
984+
skiplist = skiplist_init(<int>win)
971985

972986
# setup
973987
for j in range(s, e):
@@ -977,7 +991,6 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start,
977991
skiplist_insert(skiplist, val)
978992

979993
else:
980-
981994
# calculate adds
982995
for j in range(end[i - 1], e):
983996
val = values[j]
@@ -991,7 +1004,6 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start,
9911004
if notnan(val):
9921005
skiplist_remove(skiplist, val)
9931006
nobs -= 1
994-
9951007
if nobs >= minp:
9961008
if nobs == 1:
9971009
# Single value in skip list

pandas/tests/window/test_base_indexer.py

+28
Original file line numberDiff line numberDiff line change
@@ -263,3 +263,31 @@ def test_fixed_forward_indexer_count():
263263
result = df.rolling(window=indexer, min_periods=0).count()
264264
expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})
265265
tm.assert_frame_equal(result, expected)
266+
267+
268+
@pytest.mark.parametrize(
269+
("end_value", "values"), [(1, [0.0, 1, 1, 3, 2]), (-1, [0.0, 1, 0, 3, 1])]
270+
)
271+
@pytest.mark.parametrize(("func", "args"), [("median", []), ("quantile", [0.5])])
272+
def test_indexer_quantile_sum(end_value, values, func, args):
273+
# GH 37153
274+
class CustomIndexer(BaseIndexer):
275+
def get_window_bounds(self, num_values, min_periods, center, closed):
276+
start = np.empty(num_values, dtype=np.int64)
277+
end = np.empty(num_values, dtype=np.int64)
278+
for i in range(num_values):
279+
if self.use_expanding[i]:
280+
start[i] = 0
281+
end[i] = max(i + end_value, 1)
282+
else:
283+
start[i] = i
284+
end[i] = i + self.window_size
285+
return start, end
286+
287+
use_expanding = [True, False, True, False, True]
288+
df = DataFrame({"values": range(5)})
289+
290+
indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
291+
result = getattr(df.rolling(indexer), func)(*args)
292+
expected = DataFrame({"values": values})
293+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)