From 4e1da00a7fde96bc87c0d63112ad5cd1f5153be7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 14 Oct 2020 17:23:32 -0700 Subject: [PATCH 1/9] Move min_periods less than zero check to validate --- pandas/_libs/window/aggregations.pyx | 2 -- pandas/core/window/rolling.py | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 131e0154ce8fc..d0b1845f6db19 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1093,8 +1093,6 @@ cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, f"window (win)") elif minp > in_n: minp = in_n + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') minp = max(minp, 1) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index cc0927437ad1d..29d729874478d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -120,8 +120,6 @@ def calculate_min_periods( raise ValueError(f"min_periods {min_periods} must be <= window {window}") elif min_periods > num_values: min_periods = num_values + 1 - elif min_periods < 0: - raise ValueError("min_periods must be >= 0") return max(min_periods, floor) @@ -201,8 +199,11 @@ def is_freq_type(self) -> bool: def validate(self) -> None: if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if self.min_periods is not None and not is_integer(self.min_periods): - raise ValueError("min_periods must be an integer") + if self.min_periods is not None: + if not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") + elif self.min_periods < 0: + raise ValueError("min_periods must be >= 0") if self.closed is not None and self.closed not in [ "right", "both", From 11ee69e41df6c1a3efe8cf682b5a5c9d514adca0 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 14 Oct 2020 18:23:20 -0700 Subject: [PATCH 2/9] move min period > window check to validate, adjust incorrect tests --- pandas/_libs/window/aggregations.pyx | 3 --- pandas/core/window/rolling.py | 8 +++++--- pandas/tests/window/test_rolling.py | 8 ++++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index d0b1845f6db19..5521ec8aa5b6f 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1088,9 +1088,6 @@ cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, if avg: tot_wgt = np.zeros(in_n, dtype=np.float64) - if minp > win_n: - raise ValueError(f"min_periods (minp) must be <= " - f"window (win)") elif minp > in_n: minp = in_n + 1 diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 29d729874478d..3440394de878b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -116,9 +116,7 @@ def calculate_min_periods( min_periods = window else: min_periods = max(required_min_periods, min_periods) - if min_periods > window: - raise ValueError(f"min_periods {min_periods} must be <= window {window}") - elif min_periods > num_values: + if min_periods > num_values: min_periods = num_values + 1 return max(min_periods, floor) @@ -204,6 +202,10 @@ def validate(self) -> None: raise ValueError("min_periods must be an integer") elif self.min_periods < 0: raise ValueError("min_periods must be >= 0") + elif is_integer(self.window) and self.min_periods > self.window: + raise ValueError( + f"min_periods {self.min_periods} must be <= window {self.window}" + ) if self.closed is not None and self.closed not in [ "right", "both", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 3d59f6cdd4996..f1d54d91c6d22 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -498,7 +498,7 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): ({"A": [2, 3], "B": [5, 6]}, [1, 2]), ], 2, - 3, + 2, ), ( DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), @@ -518,7 +518,7 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): ({"A": [3], "B": [6]}, [2]), ], 1, - 2, + 0, ), (DataFrame({"A": [1], "B": [4]}), [], 2, None), (DataFrame({"A": [1], "B": [4]}), [], 2, 1), @@ -605,9 +605,9 @@ def test_iter_rolling_on_dataframe(expected, window): 1, ), (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1), - (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 3), + (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 2), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0), - (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 2), + (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1), (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0), (Series([], dtype="int64"), [], 2, 1), ], From 3660862929c015c8ebe8add06bb4b52f4b1ddd71 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 14 Oct 2020 18:38:25 -0700 Subject: [PATCH 3/9] Move required min periods to agg functions --- pandas/_libs/window/aggregations.pyx | 3 +++ pandas/core/window/rolling.py | 15 +-------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5521ec8aa5b6f..7fdbaa999a732 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -370,6 +370,7 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[float64_t] output bint is_monotonic_bounds + minp = max(minp, 1) is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) output = np.empty(N, dtype=float) @@ -488,6 +489,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[float64_t] output bint is_monotonic_bounds + minp = max(minp, 3) is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) output = np.empty(N, dtype=float) @@ -612,6 +614,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[float64_t] output bint is_monotonic_bounds + minp = max(minp, 4) is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) output = np.empty(N, dtype=float) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3440394de878b..73d5606617b7d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -94,7 +94,6 @@ def calculate_min_periods( window: int, min_periods: Optional[int], num_values: int, - required_min_periods: int, floor: int, ) -> int: """ @@ -105,7 +104,6 @@ def calculate_min_periods( window : passed window value min_periods : passed min periods value num_values : total number of values - required_min_periods : required min periods per aggregation function floor : required min periods per aggregation function Returns @@ -114,8 +112,6 @@ def calculate_min_periods( """ if min_periods is None: min_periods = window - else: - min_periods = max(required_min_periods, min_periods) if min_periods > num_values: min_periods = num_values + 1 return max(min_periods, floor) @@ -517,7 +513,6 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def _apply( self, func: Callable, - require_min_periods: int = 0, floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, @@ -532,7 +527,6 @@ def _apply( Parameters ---------- func : callable function to apply - require_min_periods : int floor : int is_weighted : bool name : str, @@ -562,14 +556,13 @@ def homogeneous_func(values: np.ndarray): def calc(x): if not isinstance(self.window, BaseIndexer): min_periods = calculate_min_periods( - window, self.min_periods, len(x), require_min_periods, floor + window, self.min_periods, len(x), floor ) else: min_periods = calculate_min_periods( window_indexer.window_size, self.min_periods, len(x), - require_min_periods, floor, ) start, end = window_indexer.get_window_bounds( @@ -894,7 +887,6 @@ def __init__(self, obj, *args, **kwargs): def _apply( self, func: Callable, - require_min_periods: int = 0, floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, @@ -903,7 +895,6 @@ def _apply( ): result = super()._apply( func, - require_min_periods, floor, is_weighted, name, @@ -1601,7 +1592,6 @@ def zsqrt_func(values, begin, end, min_periods): return self._apply( zsqrt_func, - require_min_periods=1, name="std", **kwargs, ) @@ -1611,7 +1601,6 @@ def var(self, ddof=1, *args, **kwargs): window_func = partial(self._get_roll_func("roll_var"), ddof=ddof) return self._apply( window_func, - require_min_periods=1, name="var", **kwargs, ) @@ -1631,7 +1620,6 @@ def skew(self, **kwargs): window_func = self._get_roll_func("roll_skew") return self._apply( window_func, - require_min_periods=3, name="skew", **kwargs, ) @@ -1725,7 +1713,6 @@ def kurt(self, **kwargs): window_func = self._get_roll_func("roll_kurt") return self._apply( window_func, - require_min_periods=4, name="kurt", **kwargs, ) From 35aeaf721151741c0cec7beef71fa9c8699e44a7 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 15 Oct 2020 16:43:05 -0700 Subject: [PATCH 4/9] Remove require_min_periods call --- pandas/core/window/rolling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b767585f9e541..3e51aaf86721c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1146,7 +1146,6 @@ def _get_window_weights( def _apply( self, func: Callable[[np.ndarray, int, int], np.ndarray], - require_min_periods: int = 0, floor: int = 1, name: Optional[str] = None, use_numba_cache: bool = False, From 3470a0cfd1ebcbda486fcc5118beb1608cbb0e09 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 15 Oct 2020 23:25:55 -0700 Subject: [PATCH 5/9] Remove floor --- pandas/_libs/window/aggregations.pyx | 9 +++++++-- pandas/core/window/rolling.py | 17 +++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 41aed4c1d6e5b..b50eaf800533a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -658,7 +658,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win=0): + ndarray[int64_t] end, int64_t minp): # GH 32865. win argument kept for compatibility cdef: float64_t val, res, prev @@ -666,7 +666,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N = len(values), s, e + int64_t nobs = 0, N = len(values), s, e, win int midpoint ndarray[float64_t] output @@ -724,6 +724,8 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, else: res = (skiplist_get(sl, midpoint, &ret) + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + if ret == 0: + res = NaN else: res = NaN @@ -1011,6 +1013,9 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) output[i] = (vlow + vhigh) / 2 + + if ret == 0: + output[i] = NaN else: output[i] = NaN diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3e51aaf86721c..7e11ee10f6fbf 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -78,7 +78,6 @@ def calculate_min_periods( window: int, min_periods: Optional[int], num_values: int, - floor: int, ) -> int: """ Calculate final minimum periods value for rolling aggregations. @@ -88,7 +87,6 @@ def calculate_min_periods( window : passed window value min_periods : passed min periods value num_values : total number of values - floor : required min periods per aggregation function Returns ------- @@ -98,7 +96,7 @@ def calculate_min_periods( min_periods = window if min_periods > num_values: min_periods = num_values + 1 - return max(min_periods, floor) + return min_periods class BaseWindow(ShallowMixin, SelectionMixin): @@ -432,7 +430,6 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def _apply( self, func: Callable[..., Any], - floor: int = 1, name: Optional[str] = None, use_numba_cache: bool = False, **kwargs, @@ -445,7 +442,6 @@ def _apply( Parameters ---------- func : callable function to apply - floor : int name : str, use_numba_cache : bool whether to cache a numba compiled function. Only available for numba @@ -469,14 +465,13 @@ def homogeneous_func(values: np.ndarray): def calc(x): if not isinstance(self.window, BaseIndexer): min_periods = calculate_min_periods( - window, self.min_periods, len(x), floor + window, self.min_periods, len(x) ) else: min_periods = calculate_min_periods( window_indexer.window_size, self.min_periods, len(x), - floor, ) start, end = window_indexer.get_window_bounds( @@ -790,14 +785,12 @@ def __init__(self, obj, *args, **kwargs): def _apply( self, func: Callable[..., Any], - floor: int = 1, name: Optional[str] = None, use_numba_cache: bool = False, **kwargs, ) -> FrameOrSeries: result = super()._apply( func, - floor, name, use_numba_cache, **kwargs, @@ -1146,7 +1139,6 @@ def _get_window_weights( def _apply( self, func: Callable[[np.ndarray, int, int], np.ndarray], - floor: int = 1, name: Optional[str] = None, use_numba_cache: bool = False, **kwargs, @@ -1159,8 +1151,6 @@ def _apply( Parameters ---------- func : callable function to apply - require_min_periods : int - floor : int name : str, use_numba_cache : bool whether to cache a numba compiled function. Only available for numba @@ -1414,7 +1404,6 @@ def apply( return self._apply( apply_func, - floor=0, use_numba_cache=maybe_use_numba(engine), original_func=func, args=args, @@ -1448,7 +1437,7 @@ def apply_func(values, begin, end, min_periods, raw=raw): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_sum") - return self._apply(window_func, floor=0, name="sum", **kwargs) + return self._apply(window_func, name="sum", **kwargs) _shared_docs["max"] = dedent( """ From 56a862f78f9c648c2b8c12930f3949e277308dd0 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 15 Oct 2020 23:45:54 -0700 Subject: [PATCH 6/9] Remove if condition --- pandas/core/window/rolling.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7e11ee10f6fbf..8f71f144350f5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -463,16 +463,11 @@ def homogeneous_func(values: np.ndarray): return values.copy() def calc(x): - if not isinstance(self.window, BaseIndexer): - min_periods = calculate_min_periods( - window, self.min_periods, len(x) - ) - else: - min_periods = calculate_min_periods( - window_indexer.window_size, - self.min_periods, - len(x), - ) + min_periods = calculate_min_periods( + window_indexer.window_size, + self.min_periods, + len(x), + ) start, end = window_indexer.get_window_bounds( num_values=len(x), From d4f6ece0eacdf5bc3e72973a0e97dbbd570be3b6 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 15 Oct 2020 23:54:02 -0700 Subject: [PATCH 7/9] Remove another clause in calculate_min_periods --- pandas/core/window/rolling.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8f71f144350f5..b15ec4202dd70 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -77,7 +77,6 @@ def calculate_min_periods( window: int, min_periods: Optional[int], - num_values: int, ) -> int: """ Calculate final minimum periods value for rolling aggregations. @@ -86,7 +85,6 @@ def calculate_min_periods( ---------- window : passed window value min_periods : passed min periods value - num_values : total number of values Returns ------- @@ -94,8 +92,6 @@ def calculate_min_periods( """ if min_periods is None: min_periods = window - if min_periods > num_values: - min_periods = num_values + 1 return min_periods From 8e8469bb23b940cbb6981e6e912c20b8a7491234 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 16 Oct 2020 00:09:33 -0700 Subject: [PATCH 8/9] Inline min_periods calculation --- pandas/core/window/rolling.py | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b15ec4202dd70..452e1c252183f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -74,27 +74,6 @@ from pandas.core.internals import Block # noqa:F401 -def calculate_min_periods( - window: int, - min_periods: Optional[int], -) -> int: - """ - Calculate final minimum periods value for rolling aggregations. - - Parameters - ---------- - window : passed window value - min_periods : passed min periods value - - Returns - ------- - min_periods : int - """ - if min_periods is None: - min_periods = window - return min_periods - - class BaseWindow(ShallowMixin, SelectionMixin): """Provides utilities for performing windowing operations.""" @@ -451,6 +430,11 @@ def _apply( """ window = self._get_window() window_indexer = self._get_window_indexer(window) + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) def homogeneous_func(values: np.ndarray): # calculation function @@ -459,15 +443,9 @@ def homogeneous_func(values: np.ndarray): return values.copy() def calc(x): - min_periods = calculate_min_periods( - window_indexer.window_size, - self.min_periods, - len(x), - ) - start, end = window_indexer.get_window_bounds( num_values=len(x), - min_periods=self.min_periods, + min_periods=min_periods, center=self.center, closed=self.closed, ) From 6affa3cc7f69822b6813c341284439ed35b9102b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Fri, 16 Oct 2020 09:13:17 -0700 Subject: [PATCH 9/9] Add whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9fc094330fb36..dac75349ccff5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -462,6 +462,7 @@ Groupby/resample/rolling - Bug in :meth:`RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) - Bug in :meth:`DataFrame.groupby.rolling` returning wrong values with partial centered window (:issue:`36040`). - Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) +- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`) Reshaping ^^^^^^^^^