From 1c8c24a83ed10a2a36d4101a01b3f9ea69c55eed Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 24 Nov 2019 22:58:09 -0800 Subject: [PATCH 01/26] Add BaseIndexer class --- pandas/_libs/window/indexers.pyx | 209 ++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index eab9f0f8aab43..5539710de87f9 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,6 +1,6 @@ # cython: boundscheck=False, wraparound=False, cdivision=True -from typing import Tuple +from typing import Optional, Tuple import numpy as np from numpy cimport ndarray, int64_t @@ -10,68 +10,147 @@ from numpy cimport ndarray, int64_t # These define start/end indexers to compute offsets -class FixedWindowIndexer: - """ - create a fixed length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - index: object - index of the values - closed: string - closed behavior - """ - def __init__(self, ndarray values, int64_t win, object closed, object index=None): +class BaseIndexer: + """Base class for window bounds calculations""" + + def __init__( + self, + **kwargs, + ): + """ + Parameters + ---------- + **kwargs : + keyword argument that will be available when get_window_bounds is called + """ + self.__dict__.update(kwargs) + + def get_window_bounds( + self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the bounds of a window. 
+ + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer): + """Creates window boundaries that are of fixed length.""" + + def get_window_bounds(self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the fixed bounds of a window. 
+ + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ cdef: - ndarray[int64_t, ndim=1] start_s, start_e, end_s, end_e - int64_t N = len(values) + ndarray[int64_t, ndim=1] start, start_s, start_e, end, end_s, end_e - start_s = np.zeros(win, dtype='int64') - start_e = np.arange(win, N, dtype='int64') - win + 1 - self.start = np.concatenate([start_s, start_e])[:N] + start_s = np.zeros(window_size, dtype='int64') + start_e = np.arange(window_size, num_values, dtype='int64') - window_size + 1 + start = np.concatenate([start_s, start_e])[:num_values] - end_s = np.arange(win, dtype='int64') + 1 - end_e = start_e + win - self.end = np.concatenate([end_s, end_e])[:N] - - def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: - return self.start, self.end + end_s = np.arange(window_size, dtype='int64') + 1 + end_e = start_e + window_size + end = np.concatenate([end_s, end_e])[:num_values] + return start, end class VariableWindowIndexer: - """ - create a variable length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - index: ndarray - index of the values - closed: string - closed behavior - """ - def __init__(self, ndarray values, int64_t win, object closed, ndarray index): + """Creates window boundaries that are of variable length, namely for time series.""" + 
+ def get_window_bounds(self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the variable bounds of a window. + + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ cdef: bint left_closed = False bint right_closed = False - int64_t N = len(index) + ndarray[int64_t, ndim=1] start, end + int64_t start_bound, end_bound + Py_ssize_t i, j # if windows is variable, default is 'right', otherwise default is 'both' if closed is None: - closed = 'right' if index is not None else 'both' + closed = 'right' if self.index is not None else 'both' if closed in ['right', 'both']: right_closed = True @@ -79,20 +158,9 @@ class VariableWindowIndexer: if closed in ['left', 'both']: left_closed = True - self.start, self.end = self.build(index, win, left_closed, right_closed, N) - - @staticmethod - def build(const int64_t[:] index, int64_t win, bint left_closed, - bint right_closed, int64_t N) -> Tuple[np.ndarray, np.ndarray]: - - cdef: - ndarray[int64_t] start, end - int64_t start_bound, end_bound - Py_ssize_t i, j - - start = np.empty(N, dtype='int64') + start = np.empty(num_values, dtype='int64') start.fill(-1) - end = np.empty(N, dtype='int64') + end = np.empty(num_values, dtype='int64') end.fill(-1) start[0] = 0 @@ -108,9 +176,9 @@ class VariableWindowIndexer: # start is 
start of slice interval (including) # end is end of slice interval (not including) - for i in range(1, N): - end_bound = index[i] - start_bound = index[i] - win + for i in range(1, num_values): + end_bound = self.index[i] + start_bound = self.index[i] - window_size # left endpoint is closed if left_closed: @@ -120,13 +188,13 @@ class VariableWindowIndexer: # within the constraint start[i] = i for j in range(start[i - 1], i): - if index[j] > start_bound: + if self.index[j] > start_bound: start[i] = j break # end bound is previous end # or current index - if index[end[i - 1]] <= end_bound: + if self.index[end[i - 1]] <= end_bound: end[i] = i + 1 else: end[i] = end[i - 1] @@ -135,6 +203,3 @@ class VariableWindowIndexer: if not right_closed: end[i] -= 1 return start, end - - def get_window_bounds(self) -> Tuple[np.ndarray, np.ndarray]: - return self.start, self.end From 6b5e894fbfd5b5ea85c936c7e791a0f9243c86fc Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Nov 2019 22:34:06 -0800 Subject: [PATCH 02/26] Reformat Indexers --- pandas/_libs/window/indexers.pyx | 95 +++++++++++++++++++------------- pandas/core/window/rolling.py | 14 ++--- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 5539710de87f9..1d7acd3b264ce 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -63,6 +63,9 @@ class BaseIndexer: class FixedWindowIndexer(BaseIndexer): """Creates window boundaries that are of fixed length.""" + def __init__(self, **kwargs): + super().__init__(**kwargs) + def get_window_bounds(self, num_values: int = 0, window_size: int = 0, @@ -107,40 +110,22 @@ class FixedWindowIndexer(BaseIndexer): return start, end -class VariableWindowIndexer: +class VariableWindowIndexer(BaseIndexer): """Creates window boundaries that are of variable length, namely for time series.""" - def get_window_bounds(self, - num_values: int = 0, - window_size: int = 0, - 
min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - win_type: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the variable bounds of a window. - - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @staticmethod + def _get_window_bound( + int64_t num_values, + int64_t window_size, + object min_periods, + object center, + object closed, + object win_type, + const int64_t[:] index + ): cdef: bint left_closed = False bint right_closed = False @@ -150,7 +135,7 @@ class VariableWindowIndexer: # if windows is variable, default is 'right', otherwise default is 'both' if closed is None: - closed = 'right' if self.index is not None else 'both' + closed = 'right' if index is not None else 'both' if closed in ['right', 'both']: right_closed = True @@ -177,8 +162,8 @@ class VariableWindowIndexer: # start is start of slice interval (including) # end is end of slice interval (not including) for i in range(1, num_values): - end_bound = self.index[i] - start_bound = self.index[i] - window_size + end_bound = index[i] + start_bound = index[i] - window_size # left endpoint is closed if left_closed: @@ -188,13 +173,13 @@ class VariableWindowIndexer: # within the constraint start[i] = i for j in range(start[i - 1], i): - if self.index[j] > start_bound: + if index[j] > start_bound: start[i] = j break # end bound is 
previous end # or current index - if self.index[end[i - 1]] <= end_bound: + if index[end[i - 1]] <= end_bound: end[i] = i + 1 else: end[i] = end[i - 1] @@ -203,3 +188,39 @@ class VariableWindowIndexer: if not right_closed: end[i] -= 1 return start, end + + def get_window_bounds(self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the variable bounds of a window. + + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ + # We do this since cython doesn't like accessing class attributes in nogil + return self._get_window_bound( + num_values, window_size, min_periods, center, closed, win_type, self.index + ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7f3404100f71c..cd09e59430545 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -400,13 +400,13 @@ def _get_cython_func_type(self, func): self._get_roll_func("{}_fixed".format(func)), win=self._get_window() ) - def _get_window_indexer(self): + def _get_window_indexer(self, index_as_array): """ Return an indexer class that will compute the window start and end bounds """ if self.is_freq_type: - return window_indexers.VariableWindowIndexer - return window_indexers.FixedWindowIndexer + return 
window_indexers.VariableWindowIndexer(index=index_as_array) + return window_indexers.FixedWindowIndexer() def _apply( self, @@ -445,7 +445,7 @@ def _apply( blocks, obj = self._create_blocks() block_list = list(blocks) index_as_array = self._get_index() - window_indexer = self._get_window_indexer() + window_indexer = self._get_window_indexer(index_as_array) results = [] exclude: List[Scalar] = [] @@ -476,9 +476,9 @@ def calc(x): min_periods = calculate_min_periods( window, self.min_periods, len(x), require_min_periods, floor ) - start, end = window_indexer( - x, window, self.closed, index_as_array - ).get_window_bounds() + start, end = window_indexer.get_window_bounds( + num_values=len(x), window_size=window, closed=self.closed + ) return func(x, start, end, min_periods) else: From 3a310f6563b81a631360ee2c539dfdfaff052a5a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Nov 2019 22:39:35 -0800 Subject: [PATCH 03/26] Remove init --- pandas/_libs/window/indexers.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 1d7acd3b264ce..7e82052c4ce89 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -63,9 +63,6 @@ class BaseIndexer: class FixedWindowIndexer(BaseIndexer): """Creates window boundaries that are of fixed length.""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - def get_window_bounds(self, num_values: int = 0, window_size: int = 0, @@ -113,9 +110,6 @@ class FixedWindowIndexer(BaseIndexer): class VariableWindowIndexer(BaseIndexer): """Creates window boundaries that are of variable length, namely for time series.""" - def __init__(self, **kwargs): - super().__init__(**kwargs) - @staticmethod def _get_window_bound( int64_t num_values, From d1d07754668cf7ab367528a97bc335b8a108597f Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Nov 2019 23:05:36 -0800 Subject: [PATCH 04/26] Add BaseIndexer to api and allow rolling 
to accept BaseIndexer subclasses --- pandas/api/__init__.py | 2 +- pandas/api/indexers/__init__.py | 2 ++ pandas/core/window/rolling.py | 20 ++++++++++++++++++-- pandas/tests/api/test_api.py | 2 +- 4 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 pandas/api/indexers/__init__.py diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 58422811990c4..d0a26864a1102 100644 --- a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,2 +1,2 @@ """ public toolkit API """ -from . import extensions, types # noqa +from . import extensions, indexers, types # noqa diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py new file mode 100644 index 0000000000000..ba1b0c287d9f9 --- /dev/null +++ b/pandas/api/indexers/__init__.py @@ -0,0 +1,2 @@ +"""Public API for Rolling Window Indexers""" +from pandas._libs.window.indexers import BaseIndexer # noqa: F401 diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index cd09e59430545..588d0defffe7a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -404,6 +404,8 @@ def _get_window_indexer(self, index_as_array): """ Return an indexer class that will compute the window start and end bounds """ + if isinstance(self.window, window_indexers.BaseIndexer): + return self.window if self.is_freq_type: return window_indexers.VariableWindowIndexer(index=index_as_array) return window_indexers.FixedWindowIndexer() @@ -759,13 +761,18 @@ class Window(_Window): Parameters ---------- - window : int, or offset + window : int, offset, or BaseIndexer subclass Size of the moving window. This is the number of observations used for calculating the statistic. Each window will be a fixed size. If its an offset then this will be the time period of each window. Each window will be a variable sized based on the observations included in the time-period. This is only valid for datetimelike indexes. 
+ + If a BaseIndexer subclass is passed, calculates the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely `min_periods`, `center`, `win_type`, and + `closed` will be passed to `get_window_bounds`. min_periods : int, default None Minimum number of observations in window required to have a value (otherwise result is NA). For a window that is specified by an offset, @@ -906,7 +913,7 @@ def validate(self): super().validate() window = self.window - if isinstance(window, (list, tuple, np.ndarray)): + if isinstance(window, (list, tuple, np.ndarray, window_indexers.BaseIndexer)): pass elif is_integer(window): if window <= 0: @@ -995,6 +1002,13 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) + elif isinstance(window, window_indexers.BaseIndexer): + return window.get_window_bounds( + win_type=self.win_type, + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + ) def _get_weighted_roll_func( self, cfunc: Callable, check_minp: Callable, **kwargs @@ -1762,6 +1776,8 @@ def validate(self): if self.min_periods is None: self.min_periods = 1 + elif isinstance(self.window, window_indexers.BaseIndexer): + pass elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 1282aa6edd538..b9bfd15070719 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -218,7 +218,7 @@ def test_api(self): class TestApi(Base): - allowed = ["types", "extensions"] + allowed = ["types", "extensions", "indexers"] def test_api(self): From c10854d1456ffbf51fb85304ab86623182847162 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 09:55:33 -0800 Subject: [PATCH 05/26] Lint cython files --- pandas/_libs/window/indexers.pyx | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 7e82052c4ce89..2b26188be0d58 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -63,7 +63,8 @@ class BaseIndexer: class FixedWindowIndexer(BaseIndexer): """Creates window boundaries that are of fixed length.""" - def get_window_bounds(self, + def get_window_bounds( + self, num_values: int = 0, window_size: int = 0, min_periods: Optional[int] = None, @@ -183,7 +184,8 @@ class VariableWindowIndexer(BaseIndexer): end[i] -= 1 return start, end - def get_window_bounds(self, + def get_window_bounds( + self, num_values: int = 0, window_size: int = 0, min_periods: Optional[int] = None, From c237090048b6c5f0a3e5bef4226f1eeb95d5b0d2 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 12:05:48 -0800 Subject: [PATCH 06/26] Move indexers to pandas/core/window/indexers --- pandas/_libs/window/indexers.pyx | 290 ++++++++----------------------- pandas/api/indexers/__init__.py | 2 +- pandas/core/window/indexers.py | 141 +++++++++++++++ pandas/core/window/rolling.py | 2 +- 4 files changed, 216 insertions(+), 219 deletions(-) create mode 100644 pandas/core/window/indexers.py diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 2b26188be0d58..f4a84651d2ea7 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,222 +1,78 @@ # cython: boundscheck=False, wraparound=False, cdivision=True -from typing import Optional, Tuple - import numpy as np from numpy cimport ndarray, int64_t -# ---------------------------------------------------------------------- -# The indexer objects for rolling -# These define start/end indexers to compute offsets - - -class BaseIndexer: - """Base class for window bounds calculations""" - - def __init__( - self, - **kwargs, - ): - """ - Parameters - ---------- - **kwargs : - keyword argument that will be available when 
get_window_bounds is called - """ - self.__dict__.update(kwargs) - - def get_window_bounds( - self, - num_values: int = 0, - window_size: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - win_type: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the bounds of a window. - - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ - raise NotImplementedError - - -class FixedWindowIndexer(BaseIndexer): - """Creates window boundaries that are of fixed length.""" - - def get_window_bounds( - self, - num_values: int = 0, - window_size: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - win_type: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the fixed bounds of a window. 
- - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ - cdef: - ndarray[int64_t, ndim=1] start, start_s, start_e, end, end_s, end_e - - start_s = np.zeros(window_size, dtype='int64') - start_e = np.arange(window_size, num_values, dtype='int64') - window_size + 1 - start = np.concatenate([start_s, start_e])[:num_values] - - end_s = np.arange(window_size, dtype='int64') + 1 - end_e = start_e + window_size - end = np.concatenate([end_s, end_e])[:num_values] - return start, end - - -class VariableWindowIndexer(BaseIndexer): - """Creates window boundaries that are of variable length, namely for time series.""" - - @staticmethod - def _get_window_bound( - int64_t num_values, - int64_t window_size, - object min_periods, - object center, - object closed, - object win_type, - const int64_t[:] index - ): - cdef: - bint left_closed = False - bint right_closed = False - ndarray[int64_t, ndim=1] start, end - int64_t start_bound, end_bound - Py_ssize_t i, j - - # if windows is variable, default is 'right', otherwise default is 'both' - if closed is None: - closed = 'right' if index is not None else 'both' - - if closed in ['right', 'both']: - right_closed = True - - if closed in ['left', 'both']: - left_closed = True - - start = np.empty(num_values, dtype='int64') - start.fill(-1) - end = np.empty(num_values, dtype='int64') - end.fill(-1) - - start[0] = 0 - - # right endpoint is closed - if right_closed: - end[0] = 1 - # right endpoint is open - else: - 
end[0] = 0 - - with nogil: - - # start is start of slice interval (including) - # end is end of slice interval (not including) - for i in range(1, num_values): - end_bound = index[i] - start_bound = index[i] - window_size - - # left endpoint is closed - if left_closed: - start_bound -= 1 - - # advance the start bound until we are - # within the constraint - start[i] = i - for j in range(start[i - 1], i): - if index[j] > start_bound: - start[i] = j - break - - # end bound is previous end - # or current index - if index[end[i - 1]] <= end_bound: - end[i] = i + 1 - else: - end[i] = end[i - 1] - - # right endpoint is open - if not right_closed: - end[i] -= 1 - return start, end - - def get_window_bounds( - self, - num_values: int = 0, - window_size: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - win_type: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the variable bounds of a window. - - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ - # We do this since cython doesn't like accessing class attributes in nogil - return self._get_window_bound( - num_values, window_size, min_periods, center, closed, win_type, self.index - ) +# Cython routines for window indexers + +def calculate_variable_window_bounds( + int64_t num_values, + int64_t window_size, + object min_periods, + object center, + object closed, + object win_type, + const int64_t[:] index +): 
+ cdef: + bint left_closed = False + bint right_closed = False + ndarray[int64_t, ndim=1] start, end + int64_t start_bound, end_bound + Py_ssize_t i, j + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = 'right' if index is not None else 'both' + + if closed in ['right', 'both']: + right_closed = True + + if closed in ['left', 'both']: + left_closed = True + + start = np.empty(num_values, dtype='int64') + start.fill(-1) + end = np.empty(num_values, dtype='int64') + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, num_values): + end_bound = index[i] + start_bound = index[i] - window_size + + # left endpoint is closed + if left_closed: + start_bound -= 1 + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not right_closed: + end[i] -= 1 + return start, end diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index ba1b0c287d9f9..a5d6bc07da3eb 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -1,2 +1,2 @@ """Public API for Rolling Window Indexers""" -from pandas._libs.window.indexers import BaseIndexer # noqa: F401 +from pandas.core.window.indexers import BaseIndexer # noqa: F401 diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py new file mode 100644 index 0000000000000..6c40c8fda344a --- /dev/null +++ b/pandas/core/window/indexers.py @@ -0,0 +1,141 @@ +"""Indexer objects for computing start/end window bounds for 
rolling operations""" +from typing import Optional, Tuple + +import numpy as np + +from pandas._libs.window.indexers import calculate_variable_window_bounds + + +class BaseIndexer: + """Base class for window bounds calculations""" + + def __init__( + self, index: Optional[np.ndarray] = None, **kwargs, + ): + """ + Parameters + ---------- + **kwargs : + keyword argument that will be available when get_window_bounds is called + """ + self.index = index + self.__dict__.update(kwargs) + + def get_window_bounds( + self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the bounds of a window. + + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer): + """Creates window boundaries that are of fixed length.""" + + def get_window_bounds( + self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the fixed bounds of a window. 
+ + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ + start_s = np.zeros(window_size, dtype="int64") + start_e = np.arange(window_size, num_values, dtype="int64") - window_size + 1 + start = np.concatenate([start_s, start_e])[:num_values] + + end_s = np.arange(window_size, dtype="int64") + 1 + end_e = start_e + window_size + end = np.concatenate([end_s, end_e])[:num_values] + return start, end + + +class VariableWindowIndexer(BaseIndexer): + """Creates window boundaries that are of variable length, namely for time series.""" + + def get_window_bounds( + self, + num_values: int = 0, + window_size: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Computes the variable bounds of a window. 
+ + Parameters + ---------- + num_values : int, default 0 + number of values that will be aggregated over + window_size : int, default 0 + the number of rows in a window + min_periods : int, default None + min_periods passed from the top level rolling API + center : bool, default None + center passed from the top level rolling API + closed : str, default None + closed passed from the top level rolling API + win_type : str, default None + win_type passed from the top level rolling API + + Returns + ------- + A tuple of ndarray[int64]s, indicating the boundaries of each + window + """ + return calculate_variable_window_bounds( + num_values, window_size, min_periods, center, closed, win_type, self.index + ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 588d0defffe7a..24605231266ba 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -10,7 +10,6 @@ import numpy as np import pandas._libs.window.aggregations as window_aggregations -import pandas._libs.window.indexers as window_indexers from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -49,6 +48,7 @@ _zsqrt, calculate_min_periods, ) +import pandas.core.window.indexers as window_indexers class _Window(PandasObject, ShallowMixin, SelectionMixin): From 1ddc82818e389a73cfc91b7fd8d70ffa5cb041fe Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 12:13:16 -0800 Subject: [PATCH 07/26] Share get_window_bounds docstring --- pandas/core/window/indexers.py | 95 +++++++++++----------------------- 1 file changed, 29 insertions(+), 66 deletions(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 6c40c8fda344a..b2fd7abc029c6 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -4,6 +4,32 @@ import numpy as np from pandas._libs.window.indexers import 
calculate_variable_window_bounds +from pandas.util._decorators import Appender + + +get_window_bounds_doc = """ +Computes the bounds of a window. + +Parameters +---------- +num_values : int, default 0 + number of values that will be aggregated over +window_size : int, default 0 + the number of rows in a window +min_periods : int, default None + min_periods passed from the top level rolling API +center : bool, default None + center passed from the top level rolling API +closed : str, default None + closed passed from the top level rolling API +win_type : str, default None + win_type passed from the top level rolling API + +Returns +------- +A tuple of ndarray[int64]s, indicating the boundaries of each +window +""" class BaseIndexer: @@ -21,6 +47,7 @@ def __init__( self.index = index self.__dict__.update(kwargs) + @Appender(get_window_bounds_doc) def get_window_bounds( self, num_values: int = 0, @@ -30,35 +57,14 @@ def get_window_bounds( closed: Optional[str] = None, win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the bounds of a window. 
- Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ raise NotImplementedError class FixedWindowIndexer(BaseIndexer): """Creates window boundaries that are of fixed length.""" + @Appender(get_window_bounds_doc) def get_window_bounds( self, num_values: int = 0, @@ -68,29 +74,7 @@ def get_window_bounds( closed: Optional[str] = None, win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the fixed bounds of a window. - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ start_s = np.zeros(window_size, dtype="int64") start_e = np.arange(window_size, num_values, dtype="int64") - window_size + 1 start = np.concatenate([start_s, start_e])[:num_values] @@ -104,6 +88,7 @@ def get_window_bounds( class VariableWindowIndexer(BaseIndexer): """Creates window boundaries that are of variable length, namely for time series.""" + @Appender(get_window_bounds_doc) def get_window_bounds( self, num_values: 
int = 0, @@ -113,29 +98,7 @@ def get_window_bounds( closed: Optional[str] = None, win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes the variable bounds of a window. - Parameters - ---------- - num_values : int, default 0 - number of values that will be aggregated over - window_size : int, default 0 - the number of rows in a window - min_periods : int, default None - min_periods passed from the top level rolling API - center : bool, default None - center passed from the top level rolling API - closed : str, default None - closed passed from the top level rolling API - win_type : str, default None - win_type passed from the top level rolling API - - Returns - ------- - A tuple of ndarray[int64]s, indicating the boundaries of each - window - """ return calculate_variable_window_bounds( num_values, window_size, min_periods, center, closed, win_type, self.index ) From a861982dd3f74093064dac07c6fedd696fc3398b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 12:14:53 -0800 Subject: [PATCH 08/26] isort --- pandas/core/window/indexers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index b2fd7abc029c6..5e340c7ba10e9 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -6,7 +6,6 @@ from pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender - get_window_bounds_doc = """ Computes the bounds of a window. 
From 8f482f7909fc513422747818ad22e06c201f928a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 12:43:47 -0800 Subject: [PATCH 09/26] Validate signature of get_window_bounds --- pandas/tests/window/test_base_indexer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pandas/tests/window/test_base_indexer.py diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 38691c732d49f272d16cdd1c1766a5ea6c78fa34 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 27 Nov 2019 12:44:13 -0800 Subject: [PATCH 10/26] Validate signature of get_window_bounds --- pandas/core/window/rolling.py | 21 +++++++++++++++++++++ pandas/tests/window/test_base_indexer.py | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 24605231266ba..ff2339612db69 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -4,6 +4,7 @@ """ from datetime import timedelta from functools import partial +import inspect from textwrap import dedent from typing import Callable, Dict, List, Optional, Set, Tuple, Union @@ -118,6 +119,26 @@ def validate(self): raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): raise TypeError("invalid type: {}".format(type(self))) + if isinstance(self.window, window_indexers.BaseIndexer): + self._validate_get_window_bounds_signature(self.window) + + @staticmethod + def _validate_get_window_bounds_signature(window): + """ + Validate that the passed BaseIndexer subclass has + a get_window_bounds with the correct signature. 
+ """ + get_window_bounds_signature = inspect.signature( + window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + window_indexers.BaseIndexer.get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(window).__name__} does not implement the correct signature for" + f"get_window_bounds" + ) def _create_blocks(self): """ diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index e69de29bb2d1d..64278ce812942 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -0,0 +1,18 @@ +import pytest + +from pandas import Series + +from pandas.api.indexers import BaseIndexer + + +def test_bad_get_window_bounds_signature(): + class BadIndexer(BaseIndexer): + def get_window_bounds(self): + return None + + indexer = BadIndexer() + with pytest.raises( + ValueError, + match="BadIndexer does not implement the correct signature forget_window_bounds.", + ): + Series(range(5)).rolling(indexer) From d18e9549f18fd09bfafbad87eab7ed47b4f2468e Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 28 Nov 2019 12:53:34 -0800 Subject: [PATCH 11/26] Lint --- pandas/_libs/window/indexers.pyx | 1 + pandas/tests/window/test_base_indexer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index f4a84651d2ea7..61cec1d03228c 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -5,6 +5,7 @@ from numpy cimport ndarray, int64_t # Cython routines for window indexers + def calculate_variable_window_bounds( int64_t num_values, int64_t window_size, diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 64278ce812942..a15c3361fc688 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -1,7 +1,6 
@@ import pytest from pandas import Series - from pandas.api.indexers import BaseIndexer @@ -13,6 +12,7 @@ def get_window_bounds(self): indexer = BadIndexer() with pytest.raises( ValueError, - match="BadIndexer does not implement the correct signature forget_window_bounds.", + match="BadIndexer does not implement the correct signature " + "forget_window_bounds.", ): Series(range(5)).rolling(indexer) From f06e8e6e40f29e83f313c973d150274d7239aa88 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 28 Nov 2019 12:56:42 -0800 Subject: [PATCH 12/26] Comment on unused variables in calculate_variable_window_bounds --- pandas/_libs/window/indexers.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 61cec1d03228c..2f0f0be691c5f 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -9,10 +9,10 @@ from numpy cimport ndarray, int64_t def calculate_variable_window_bounds( int64_t num_values, int64_t window_size, - object min_periods, - object center, + object min_periods, # unused but here to match get_window_bounds signature + object center, # unused but here to match get_window_bounds signature object closed, - object win_type, + object win_type, # unused but here to match get_window_bounds signature const int64_t[:] index ): cdef: From 7ccbcd062889718de83c6c085fcf93420fdeec05 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 28 Nov 2019 12:59:16 -0800 Subject: [PATCH 13/26] Type annotate _get_window_indexer & black --- pandas/core/window/rolling.py | 4 +++- pandas/tests/window/test_base_indexer.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ff2339612db69..35b99a4df259e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -421,7 +421,9 @@ def _get_cython_func_type(self, func): 
self._get_roll_func("{}_fixed".format(func)), win=self._get_window() ) - def _get_window_indexer(self, index_as_array): + def _get_window_indexer( + self, index_as_array: Optional[np.ndarray] + ) -> window_indexers.BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index a15c3361fc688..75f2419e1bee1 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -13,6 +13,6 @@ def get_window_bounds(self): with pytest.raises( ValueError, match="BadIndexer does not implement the correct signature " - "forget_window_bounds.", + "forget_window_bounds.", ): Series(range(5)).rolling(indexer) From 4e2fd3052377dbc096828658a56ed2259f57379b Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 28 Nov 2019 13:00:58 -0800 Subject: [PATCH 14/26] self.index -> self.index_array --- pandas/core/window/indexers.py | 6 +++--- pandas/core/window/rolling.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 5e340c7ba10e9..9aa2b9783a90f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -35,7 +35,7 @@ class BaseIndexer: """Base class for window bounds calculations""" def __init__( - self, index: Optional[np.ndarray] = None, **kwargs, + self, index_array: Optional[np.ndarray] = None, **kwargs, ): """ Parameters @@ -43,7 +43,7 @@ def __init__( **kwargs : keyword argument that will be available when get_window_bounds is called """ - self.index = index + self.index_array = index_array self.__dict__.update(kwargs) @Appender(get_window_bounds_doc) @@ -99,5 +99,5 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, window_size, min_periods, center, closed, win_type, self.index + num_values, window_size, min_periods, center, closed, 
win_type, self.index_array ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 35b99a4df259e..b5aa80db3adbf 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -430,7 +430,7 @@ def _get_window_indexer( if isinstance(self.window, window_indexers.BaseIndexer): return self.window if self.is_freq_type: - return window_indexers.VariableWindowIndexer(index=index_as_array) + return window_indexers.VariableWindowIndexer(index_array=index_as_array) return window_indexers.FixedWindowIndexer() def _apply( From c3153d8c7d83007cdc9999f54cf3c03e3fb1a597 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Thu, 28 Nov 2019 14:45:49 -0800 Subject: [PATCH 15/26] Add test for ExpandingIndexer --- pandas/core/window/indexers.py | 42 +++++++++++++++----- pandas/core/window/rolling.py | 50 +++++++++++++++--------- pandas/tests/window/test_base_indexer.py | 16 +++++--- 3 files changed, 76 insertions(+), 32 deletions(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 9aa2b9783a90f..738524d2cc712 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -35,7 +35,7 @@ class BaseIndexer: """Base class for window bounds calculations""" def __init__( - self, index_array: Optional[np.ndarray] = None, **kwargs, + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, ): """ Parameters @@ -44,13 +44,13 @@ def __init__( keyword argument that will be available when get_window_bounds is called """ self.index_array = index_array + self.window_size = window_size self.__dict__.update(kwargs) @Appender(get_window_bounds_doc) def get_window_bounds( self, num_values: int = 0, - window_size: int = 0, min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, @@ -67,19 +67,22 @@ class FixedWindowIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - window_size: int = 0, min_periods: Optional[int] = None, 
center: Optional[bool] = None, closed: Optional[str] = None, win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - start_s = np.zeros(window_size, dtype="int64") - start_e = np.arange(window_size, num_values, dtype="int64") - window_size + 1 + start_s = np.zeros(self.window_size, dtype="int64") + start_e = ( + np.arange(self.window_size, num_values, dtype="int64") + - self.window_size + + 1 + ) start = np.concatenate([start_s, start_e])[:num_values] - end_s = np.arange(window_size, dtype="int64") + 1 - end_e = start_e + window_size + end_s = np.arange(self.window_size, dtype="int64") + 1 + end_e = start_e + self.window_size end = np.concatenate([end_s, end_e])[:num_values] return start, end @@ -91,7 +94,6 @@ class VariableWindowIndexer(BaseIndexer): def get_window_bounds( self, num_values: int = 0, - window_size: int = 0, min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, @@ -99,5 +101,27 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, window_size, min_periods, center, closed, win_type, self.index_array + num_values, + self.window_size, + min_periods, + center, + closed, + win_type, + self.index_array, ) + + +class ExpandingIndexer(BaseIndexer): + """Calculate expanding window bounds, mimicking df.expanding()""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + win_type: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return np.zeros(num_values, dtype=np.int64), np.arange(1, num_values + 1) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b5aa80db3adbf..1b3c91e7fc858 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -132,11 +132,11 @@ def _validate_get_window_bounds_signature(window): window.get_window_bounds 
).parameters.keys() expected_signature = inspect.signature( - window_indexers.BaseIndexer.get_window_bounds + window_indexers.BaseIndexer().get_window_bounds ).parameters.keys() if get_window_bounds_signature != expected_signature: raise ValueError( - f"{type(window).__name__} does not implement the correct signature for" + f"{type(window).__name__} does not implement the correct signature for " f"get_window_bounds" ) @@ -221,6 +221,8 @@ def _get_window(self, other=None, win_type: Optional[str] = None) -> int: ------- window : int """ + if isinstance(self.window, window_indexers.BaseIndexer): + return 0 return self.window @property @@ -415,14 +417,14 @@ def _get_cython_func_type(self, func): Variable algorithms do not use window while fixed do. """ - if self.is_freq_type: + if self.is_freq_type or isinstance(self.window, window_indexers.BaseIndexer): return self._get_roll_func("{}_variable".format(func)) return partial( self._get_roll_func("{}_fixed".format(func)), win=self._get_window() ) def _get_window_indexer( - self, index_as_array: Optional[np.ndarray] + self, index_as_array: Optional[np.ndarray], window: int ) -> window_indexers.BaseIndexer: """ Return an indexer class that will compute the window start and end bounds @@ -430,8 +432,12 @@ def _get_window_indexer( if isinstance(self.window, window_indexers.BaseIndexer): return self.window if self.is_freq_type: - return window_indexers.VariableWindowIndexer(index_array=index_as_array) - return window_indexers.FixedWindowIndexer() + return window_indexers.VariableWindowIndexer( + index_array=index_as_array, window_size=window + ) + return window_indexers.FixedWindowIndexer( + index_array=index_as_array, window_size=window + ) def _apply( self, @@ -470,7 +476,7 @@ def _apply( blocks, obj = self._create_blocks() block_list = list(blocks) index_as_array = self._get_index() - window_indexer = self._get_window_indexer(index_as_array) + window_indexer = self._get_window_indexer(index_as_array, window) results = [] 
exclude: List[Scalar] = [] @@ -498,11 +504,24 @@ def _apply( def calc(x): x = np.concatenate((x, additional_nans)) - min_periods = calculate_min_periods( - window, self.min_periods, len(x), require_min_periods, floor - ) + if not isinstance(window, window_indexers.BaseIndexer): + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor + ) + else: + min_periods = calculate_min_periods( + self.min_periods or 1, + self.min_periods, + len(x), + require_min_periods, + floor, + ) start, end = window_indexer.get_window_bounds( - num_values=len(x), window_size=window, closed=self.closed + num_values=len(x), + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + win_type=self.win_type, ) return func(x, start, end, min_periods) @@ -1000,7 +1019,7 @@ def _pop_args(win_type, arg_names, kwargs): def _get_window( self, other=None, win_type: Optional[Union[str, Tuple]] = None - ) -> np.ndarray: + ) -> Optional[np.ndarray]: """ Get the window, weights. @@ -1026,12 +1045,7 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. 
return sig.get_window(win_type, window, False).astype(float) elif isinstance(window, window_indexers.BaseIndexer): - return window.get_window_bounds( - win_type=self.win_type, - min_periods=self.min_periods, - center=self.center, - closed=self.closed, - ) + return None def _get_weighted_roll_func( self, cfunc: Callable, check_minp: Callable, **kwargs diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 75f2419e1bee1..577d85b7062a2 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -2,6 +2,8 @@ from pandas import Series from pandas.api.indexers import BaseIndexer +from pandas.core.window.indexers import ExpandingIndexer +import pandas.util.testing as tm def test_bad_get_window_bounds_signature(): @@ -10,9 +12,13 @@ def get_window_bounds(self): return None indexer = BadIndexer() - with pytest.raises( - ValueError, - match="BadIndexer does not implement the correct signature " - "forget_window_bounds.", - ): + with pytest.raises(ValueError, match="BadIndexer does not implement"): Series(range(5)).rolling(indexer) + + +def test_expanding_indexer(): + s = Series(range(10)) + indexer = ExpandingIndexer() + result = s.rolling(indexer).mean() + expected = s.expanding().mean() + tm.assert_series_equal(result, expected) From 2704c59d0033b45adc0bf674d62672aa5ca8dffe Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Dec 2019 00:36:06 -0800 Subject: [PATCH 16/26] Add doc example in computation.rst with test + handle start, end bounds that are not monitonic --- doc/source/user_guide/computation.rst | 49 ++++++++++++++++++++ pandas/_libs/window/aggregations.pyx | 59 +++++++++++++++++------- pandas/core/window/rolling.py | 6 +++ pandas/tests/window/test_base_indexer.py | 27 ++++++++++- 4 files changed, 124 insertions(+), 17 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index bc00cd7f13e13..49b6b28d75609 100644 --- 
a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -466,6 +466,55 @@ default of the index) in a DataFrame. dft dft.rolling('2s', on='foo').sum() +.. _stats.custom_rolling_window: + +Custom window rolling +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.0 + +In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts +a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. +The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns +a tuple of two arrays, the first being the starting indices of the windows and the second being the +ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed`` +and ``win_type`` will automatically be passed to ``get_window_bounds`` and the defined method must +always accept these arguments. + +For example, if we have the following ``DataFrame``: + +.. ipython:: python + + use_expanding = [True, False, True, False, True] + use_expanding + df = pd.DataFrame({'values': range(5)}) + df + +and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size +1, we can create the following ``BaseIndexer``: + +.. ipython:: python + + from pandas.api.indexers import BaseIndexer + + class CustomIndexer(BaseIndexer): + + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = i + 1 + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + df.rolling(indexer).sum() + + ..
_stats.rolling_window.endpoints: Rolling window endpoints diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 303b4f6f24eac..1fdecbca32102 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -183,7 +183,8 @@ cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogi def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + ndarray[int64_t] end, int64_t minp, + bint is_monotonic_bounds=True): cdef: float64_t sum_x = 0 int64_t s, e @@ -198,11 +199,10 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0: + if i == 0 or not is_monotonic_bounds: # setup - sum_x = 0.0 - nobs = 0 + for j in range(s, e): add_sum(values[j], &nobs, &sum_x) @@ -218,6 +218,10 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_sum(minp, nobs, sum_x) + if not is_monotonic_bounds: + for j in range(s, e): + remove_sum(values[j], &nobs, &sum_x) + return output @@ -327,7 +331,8 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + ndarray[int64_t] end, int64_t minp, + bint is_monotonic_bounds=True): cdef: float64_t val, sum_x = 0 int64_t s, e @@ -342,11 +347,9 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0: + if i == 0 or not is_monotonic_bounds: # setup - sum_x = 0.0 - nobs = 0 for j in range(s, e): val = values[j] add_mean(val, &nobs, &sum_x, &neg_ct) @@ -365,6 +368,10 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + if not is_monotonic_bounds: + for j in range(s, e): + val = values[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) return output # 
---------------------------------------------------------------------- @@ -486,7 +493,8 @@ def roll_var_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int ddof=1): + ndarray[int64_t] end, int64_t minp, int ddof=1, + bint is_monotonic_bounds=True): """ Numerically stable implementation using Welford's method. """ @@ -508,7 +516,7 @@ def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0: + if i == 0 or not is_monotonic_bounds: for j in range(s, e): add_var(values[j], &nobs, &mean_x, &ssqdm_x) @@ -528,6 +536,10 @@ def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + if not is_monotonic_bounds: + for j in range(s, e): + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + return output # ---------------------------------------------------------------------- @@ -629,7 +641,8 @@ def roll_skew_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + ndarray[int64_t] end, int64_t minp, + bint is_monotonic_bounds=True): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0 @@ -648,7 +661,7 @@ def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0: + if i == 0 or not is_monotonic_bounds: for j in range(s, e): val = values[j] @@ -671,6 +684,11 @@ def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_skew(minp, nobs, x, xx, xxx) + if not is_monotonic_bounds: + for j in range(s, e): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + return output # ---------------------------------------------------------------------- @@ -776,7 
+794,8 @@ def roll_kurt_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + ndarray[int64_t] end, int64_t minp, + bint is_monotonic_bounds=True): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 @@ -794,7 +813,7 @@ def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0: + if i == 0 or not is_monotonic_bounds: for j in range(s, e): add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) @@ -814,6 +833,10 @@ def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + if not is_monotonic_bounds: + for j in range(s, e): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + return output @@ -1007,7 +1030,8 @@ def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + ndarray[int64_t] end, int64_t minp, + bint is_monotonic_bounds=True): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
@@ -1400,7 +1424,10 @@ def roll_generic_variable(object obj, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int offset, object func, bint raw, - object args, object kwargs): + object args, object kwargs, + bint is_monotonic_bounds=True): + # is_monotonic_bounds unused since variable algorithm doesn't calculate + # adds/subtracts across windows, but matches other *_variable functions cdef: ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 84a8bc54629ee..216b92e7d4337 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -523,6 +523,12 @@ def calc(x): closed=self.closed, win_type=self.win_type, ) + if np.any(np.diff(start) < 0) or np.any(np.diff(end) < 0): + # Our "variable" algorithms assume monotonically increasing bounds + # and a custom window indexer can produce any start, end sequence + return func( + x, start, end, min_periods, is_monotonic_bounds=False + ) return func(x, start, end, min_periods) else: diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 577d85b7062a2..05058e42d7fd0 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -1,6 +1,7 @@ +import numpy as np import pytest -from pandas import Series +from pandas import DataFrame, Series from pandas.api.indexers import BaseIndexer from pandas.core.window.indexers import ExpandingIndexer import pandas.util.testing as tm @@ -22,3 +23,27 @@ def test_expanding_indexer(): result = s.rolling(indexer).mean() expected = s.expanding().mean() tm.assert_series_equal(result, expected) + + +def test_indexer_constructor_arg(): + # Example found in computation.rst + use_expanding = [True, False, True, False, True] + df = DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed, win_type): + 
start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = i + 1 + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + result = df.rolling(indexer).sum() + expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]}) + tm.assert_frame_equal(result, expected) From 87768ea4c8ee4b05f37e6dac48fb5ca9d21ca455 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Dec 2019 00:39:57 -0800 Subject: [PATCH 17/26] Add back win_type (for now) --- doc/source/user_guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 49b6b28d75609..d52b90d99ccae 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -499,7 +499,7 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def get_window_bounds(self, num_values, min_periods, center, closed, win_type): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): From 6a6d896e915e51315143b357711b71c623514aa0 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Dec 2019 11:02:10 -0800 Subject: [PATCH 18/26] Add 1.0.0 whatsnew note --- doc/source/user_guide/computation.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index d52b90d99ccae..b3b3f29bcee63 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -469,7 +469,7 @@ default of the index) in a DataFrame. .. 
_stats.custom_rolling_window: Custom window rolling -~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 1.0 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 14f36a808c468..c5b8eadc2d60a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -125,6 +125,16 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s +.. _whatsnew_1000.custom_window: + +Defining custom windows for rolling operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added a :func:`pandas.api.indexers.BaseIndexer` class that allows users to define how +window bounds are created during ``rolling`` operations. Users can define their own ``get_window_bounds`` +method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate the start and end +indices used for each window during the rolling aggregation. For more details and example usage, see +the :ref:`custom window rolling documentation <stats.custom_rolling_window>` ..
_whatsnew_1000.enhancements.other: From 2864e95b98bd331368e70786aff1fb8c4be42447 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Dec 2019 12:06:49 -0800 Subject: [PATCH 19/26] Remove BaseIndexers accepting win_type (weighted rolling) --- doc/source/user_guide/computation.rst | 2 +- pandas/_libs/window/indexers.pyx | 1 - pandas/core/window/indexers.py | 12 +------- pandas/core/window/rolling.py | 16 ++++++----- pandas/tests/window/test_base_indexer.py | 35 +++++++++++++++++++++++- 5 files changed, 45 insertions(+), 21 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index b3b3f29bcee63..c8ebe745e1982 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -499,7 +499,7 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed, win_type): + def get_window_bounds(self, num_values, min_periods, center, closed): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 2f0f0be691c5f..197c33b2f733b 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -12,7 +12,6 @@ def calculate_variable_window_bounds( object min_periods, # unused but here to match get_window_bounds signature object center, # unused but here to match get_window_bounds signature object closed, - object win_type, # unused but here to match get_window_bounds signature const int64_t[:] index ): cdef: diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 738524d2cc712..c800aacd87680 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -54,7 +54,6 @@ def get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, 
closed: Optional[str] = None, - win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: raise NotImplementedError @@ -70,7 +69,6 @@ def get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, - win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: start_s = np.zeros(self.window_size, dtype="int64") @@ -97,17 +95,10 @@ def get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, - win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, - self.window_size, - min_periods, - center, - closed, - win_type, - self.index_array, + num_values, self.window_size, min_periods, center, closed, self.index_array, ) @@ -121,7 +112,6 @@ def get_window_bounds( min_periods: Optional[int] = None, center: Optional[bool] = None, closed: Optional[str] = None, - win_type: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: return np.zeros(num_values, dtype=np.int64), np.arange(1, num_values + 1) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 216b92e7d4337..aea2d779688b3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -222,7 +222,7 @@ def _get_window(self, other=None, win_type: Optional[str] = None) -> int: window : int """ if isinstance(self.window, window_indexers.BaseIndexer): - return 0 + return self.min_periods or 0 return self.window @property @@ -521,7 +521,6 @@ def calc(x): min_periods=self.min_periods, center=self.center, closed=self.closed, - win_type=self.win_type, ) if np.any(np.diff(start) < 0) or np.any(np.diff(end) < 0): # Our "variable" algorithms assume monotonically increasing bounds @@ -961,7 +960,11 @@ def validate(self): super().validate() window = self.window - if isinstance(window, (list, tuple, np.ndarray, window_indexers.BaseIndexer)): + if isinstance(window, 
window_indexers.BaseIndexer): + raise NotImplementedError( + "BaseIndexer subclasses not implemented with win_types." + ) + elif isinstance(window, (list, tuple, np.ndarray)): pass elif is_integer(window): if window <= 0: @@ -1025,7 +1028,7 @@ def _pop_args(win_type, arg_names, kwargs): def _get_window( self, other=None, win_type: Optional[Union[str, Tuple]] = None - ) -> Optional[np.ndarray]: + ) -> np.ndarray: """ Get the window, weights. @@ -1050,8 +1053,6 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) - elif isinstance(window, window_indexers.BaseIndexer): - return None def _get_weighted_roll_func( self, cfunc: Callable, check_minp: Callable, **kwargs @@ -1820,7 +1821,8 @@ def validate(self): self.min_periods = 1 elif isinstance(self.window, window_indexers.BaseIndexer): - pass + # Passed BaseIndexer subclass should handle all other rolling kwargs + return elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 05058e42d7fd0..6a3f2c19babdc 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -31,7 +31,7 @@ def test_indexer_constructor_arg(): df = DataFrame({"values": range(5)}) class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed, win_type): + def get_window_bounds(self, num_values, min_periods, center, closed): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) for i in range(num_values): @@ -47,3 +47,36 @@ def get_window_bounds(self, num_values, min_periods, center, closed, win_type): result = df.rolling(indexer).sum() expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]}) tm.assert_frame_equal(result, expected) + + +def test_indexer_accepts_rolling_args(): + df = 
DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if center and min_periods == 1 and closed == "both" and i == 2: + start[i] = 0 + end[i] = num_values + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1) + result = df.rolling(indexer, center=True, min_periods=1, closed="both").sum() + expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + +def test_win_type_not_implemented(): + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([0, 1]), np.array([1, 2]) + + df = DataFrame({"values": range(2)}) + indexer = CustomIndexer() + with pytest.raises(NotImplementedError, match="BaseIndexer subclasses not"): + df.rolling(indexer, win_type="boxcar") From b16e711ae055ad0b40348a24748107b7619efcd5 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 1 Dec 2019 12:11:24 -0800 Subject: [PATCH 20/26] Lint --- pandas/core/window/rolling.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index aea2d779688b3..932d23fa04c4e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -523,8 +523,9 @@ def calc(x): closed=self.closed, ) if np.any(np.diff(start) < 0) or np.any(np.diff(end) < 0): - # Our "variable" algorithms assume monotonically increasing bounds - # and a custom window indexer can produce any start, end sequence + # Our "variable" algorithms assume start/end are + # monotonically increasing. A custom window indexer + # can produce a non monotonic start/end. 
return func( x, start, end, min_periods, is_monotonic_bounds=False ) From f358466916ca0f12c8eaa41cd65aa3230d8a82d8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 2 Dec 2019 00:02:56 -0800 Subject: [PATCH 21/26] Try changing import --- pandas/core/window/rolling.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 932d23fa04c4e..cd231f35db396 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -49,7 +49,11 @@ _zsqrt, calculate_min_periods, ) -import pandas.core.window.indexers as window_indexers +from pandas.core.window.indexers import ( + BaseIndexer, + FixedWindowIndexer, + VariableWindowIndexer, +) class _Window(PandasObject, ShallowMixin, SelectionMixin): @@ -119,7 +123,7 @@ def validate(self): raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): raise TypeError("invalid type: {}".format(type(self))) - if isinstance(self.window, window_indexers.BaseIndexer): + if isinstance(self.window, BaseIndexer): self._validate_get_window_bounds_signature(self.window) @staticmethod @@ -132,7 +136,7 @@ def _validate_get_window_bounds_signature(window): window.get_window_bounds ).parameters.keys() expected_signature = inspect.signature( - window_indexers.BaseIndexer().get_window_bounds + BaseIndexer().get_window_bounds ).parameters.keys() if get_window_bounds_signature != expected_signature: raise ValueError( @@ -221,7 +225,7 @@ def _get_window(self, other=None, win_type: Optional[str] = None) -> int: ------- window : int """ - if isinstance(self.window, window_indexers.BaseIndexer): + if isinstance(self.window, BaseIndexer): return self.min_periods or 0 return self.window @@ -417,7 +421,7 @@ def _get_cython_func_type(self, func): Variable algorithms do not use window while fixed do. 
""" - if self.is_freq_type or isinstance(self.window, window_indexers.BaseIndexer): + if self.is_freq_type or isinstance(self.window, BaseIndexer): return self._get_roll_func("{}_variable".format(func)) return partial( self._get_roll_func("{}_fixed".format(func)), win=self._get_window() @@ -425,19 +429,15 @@ def _get_cython_func_type(self, func): def _get_window_indexer( self, index_as_array: Optional[np.ndarray], window: int - ) -> window_indexers.BaseIndexer: + ) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ - if isinstance(self.window, window_indexers.BaseIndexer): + if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return window_indexers.VariableWindowIndexer( - index_array=index_as_array, window_size=window - ) - return window_indexers.FixedWindowIndexer( - index_array=index_as_array, window_size=window - ) + return VariableWindowIndexer(index_array=index_as_array, window_size=window) + return FixedWindowIndexer(index_array=index_as_array, window_size=window) def _apply( self, @@ -504,7 +504,7 @@ def _apply( def calc(x): x = np.concatenate((x, additional_nans)) - if not isinstance(window, window_indexers.BaseIndexer): + if not isinstance(window, BaseIndexer): min_periods = calculate_min_periods( window, self.min_periods, len(x), require_min_periods, floor ) @@ -961,7 +961,7 @@ def validate(self): super().validate() window = self.window - if isinstance(window, window_indexers.BaseIndexer): + if isinstance(window, BaseIndexer): raise NotImplementedError( "BaseIndexer subclasses not implemented with win_types." 
) @@ -1821,7 +1821,7 @@ def validate(self): if self.min_periods is None: self.min_periods = 1 - elif isinstance(self.window, window_indexers.BaseIndexer): + elif isinstance(self.window, BaseIndexer): # Passed BaseIndexer subclass should handle all other rolling kwargs return elif not is_integer(self.window): From 25a05fef2c763ac0fdf5d6456cc929b1ba9a4944 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 2 Dec 2019 23:03:28 -0800 Subject: [PATCH 22/26] Make doc example a code block, add docstring --- doc/source/user_guide/computation.rst | 47 ++++++++++++++++----------- pandas/_libs/window/indexers.pyx | 27 +++++++++++++++ pandas/core/window/indexers.py | 5 ++- pandas/core/window/rolling.py | 4 +-- 4 files changed, 60 insertions(+), 23 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index c8ebe745e1982..627a83b7359bb 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -493,26 +493,35 @@ For example, if we have the following ``DataFrame``: and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size 1, we can create the following ``BaseIndexer``: -.. ipython:: python - - from pandas.api.indexers import BaseIndexer - - class CustomIndexer(BaseIndexer): - - def get_window_bounds(self, num_values, min_periods, center, closed): - start = np.empty(num_values, dtype=np.int64) - end = np.empty(num_values, dtype=np.int64) - for i in range(num_values): - if self.use_expanding[i]: - start[i] = 0 - end[i] = i + 1 - else: - start[i] = i - end[i] = i + self.window_size - return start, end +.. 
code-block:: ipython - indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - df.rolling(indexer).sum() + In [2]: from pandas.api.indexers import BaseIndexer + ...: + ...: class CustomIndexer(BaseIndexer): + ...: + ...: def get_window_bounds(self, num_values, min_periods, center, closed): + ...: start = np.empty(num_values, dtype=np.int64) + ...: end = np.empty(num_values, dtype=np.int64) + ...: for i in range(num_values): + ...: if self.use_expanding[i]: + ...: start[i] = 0 + ...: end[i] = i + 1 + ...: else: + ...: start[i] = i + ...: end[i] = i + self.window_size + ...: return start, end + ...: + + In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + In [4]: df.rolling(indexer).sum() + Out[4]: + values + 0 0.0 + 1 1.0 + 2 3.0 + 3 3.0 + 4 10.0 .. _stats.rolling_window.endpoints: diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 197c33b2f733b..2d01d1964c043 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -14,6 +14,33 @@ def calculate_variable_window_bounds( object closed, const int64_t[:] index ): + """ + Calculate window boundaries for rolling windows from a time offset. 
+ + Parameters + ---------- + num_values : int64 + total number of values + + window_size : int64 + window size calculated from the offset + + min_periods : object + ignored, exists for compatibility + + center : object + ignored, exists for compatibility + + closed : str + string of side of the window that should be closed + + index : ndarray[int64] + time series index to roll over + + Returns + ------- + (ndarray[int64], ndarray[int64]) + """ cdef: bint left_closed = False bint right_closed = False diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index c800aacd87680..95f3b4d398b45 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -114,4 +114,7 @@ def get_window_bounds( closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - return np.zeros(num_values, dtype=np.int64), np.arange(1, num_values + 1) + return ( + np.zeros(num_values, dtype=np.int64), + np.arange(1, num_values + 1, dtype=np.int64), + ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index c3cf1e823f210..7e2210530052d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -420,9 +420,7 @@ def _get_cython_func_type(self, func): """ if self.is_freq_type or isinstance(self.window, BaseIndexer): return self._get_roll_func(f"{func}_variable") - return partial( - self._get_roll_func(f"{func}_fixed"), win=self._get_window() - ) + return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) def _get_window_indexer( self, index_as_array: Optional[np.ndarray], window: int From 5d8819f00d6cd85a6aae42299ad5722c31fd6489 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 2 Dec 2019 23:09:24 -0800 Subject: [PATCH 23/26] Change self.__dict__(kwargs) to more explicit setattr --- pandas/core/window/indexers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 
95f3b4d398b45..0fa24a0ba1b5a 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -41,11 +41,13 @@ def __init__( Parameters ---------- **kwargs : - keyword argument that will be available when get_window_bounds is called + keyword arguments that will be available when get_window_bounds is called """ self.index_array = index_array self.window_size = window_size - self.__dict__.update(kwargs) + # Set user defined kwargs as attributes that can be used in get_window_bounds + for key, value in kwargs.items(): + setattr(self, key, value) @Appender(get_window_bounds_doc) def get_window_bounds( From e7e106120d79b74443ea8e8f4051c793de292340 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 3 Dec 2019 10:22:18 -0800 Subject: [PATCH 24/26] Fix docstring --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7e2210530052d..c10b7554a9ee7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -814,7 +814,7 @@ class Window(_Window): If a BaseIndexer subclass is passed, calculates the window boundaries based on the defined ``get_window_bounds`` method. Additional rolling - keyword arguments, namely `min_periods`, `center`, `win_type`, and + keyword arguments, namely `min_periods`, `center`, and `closed` will be passed to `get_window_bounds`. 
min_periods : int, default None Minimum number of observations in window required to have a value From 9089f7b407c55301e8fed917b87e51c3a69ddfb0 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 4 Dec 2019 10:16:51 -0800 Subject: [PATCH 25/26] Add BaseIndexer in doc/source/reference/window/rst --- doc/source/reference/window.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index d09ac0d1fa7f7..3db1aa12a4275 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -74,3 +74,14 @@ Exponentially-weighted moving window functions EWM.var EWM.corr EWM.cov + +Window Indexer +-------------- +.. currentmodule:: pandas + +Base class for defining custom window boundaries. + +.. autosummary:: + :toctree: api/ + + api.indexers.BaseIndexer From 87e391ff3d314e3174bdbc17da71cde40125cd40 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 4 Dec 2019 12:15:31 -0800 Subject: [PATCH 26/26] Add typing for _validate_get_window_bounds_signature --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index c10b7554a9ee7..9f804584f532a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -127,7 +127,7 @@ def validate(self): self._validate_get_window_bounds_signature(self.window) @staticmethod - def _validate_get_window_bounds_signature(window): + def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: """ Validate that the passed BaseIndexer subclass has a get_window_bounds with the correct signature.