diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 2f6addf607877..d09ac0d1fa7f7 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -34,6 +34,8 @@ Standard moving window functions Rolling.quantile Window.mean Window.sum + Window.var + Window.std .. _api.functions_expanding: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb1d80a34514c..36c1889e7df06 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -110,6 +110,7 @@ Other enhancements (depending on the presence of missing data) or object dtype column. (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) +- Implemented :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` functions (:issue:`26597`) - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index b51d61d05ce98..62066c5f66ea3 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1752,6 +1752,226 @@ cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, return np.asarray(output) +# ---------------------------------------------------------------------- +# Rolling var for weighted window + + +cdef inline float64_t calc_weighted_var(float64_t t, + float64_t sum_w, + Py_ssize_t win_n, + unsigned int ddof, + float64_t nobs, + int64_t minp) nogil: + """ + Calculate weighted variance for a window using West's method. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + win_n: Py_ssize_t + window size + ddof: unsigned int + delta degrees of freedom + nobs: float64_t + number of observations + minp: int64_t + minimum number of observations + + Returns + ------- + result : float64_t + weighted variance of the window + """ + + cdef: + float64_t result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = t * win_n / ((win_n - ddof) * sum_w) + if result < 0: + result = 0 + else: + result = NaN + + return result + + +cdef inline void add_weighted_var(float64_t val, + float64_t w, + float64_t *t, + float64_t *sum_w, + float64_t *mean, + float64_t *nobs) nogil: + """ + Update weighted mean, sum of weights and sum of weighted squared + differences to include value and weight pair in weighted variance + calculation using West's method. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + val: float64_t + window values + w: float64_t + window weights + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + mean: float64_t + weighted mean + nobs: float64_t + number of observations + """ + + cdef: + float64_t temp, q, r + + if isnan(val): + return + + nobs[0] = nobs[0] + 1 + + q = val - mean[0] + temp = sum_w[0] + w + r = q * w / temp + + mean[0] = mean[0] + r + t[0] = t[0] + r * sum_w[0] * q + sum_w[0] = temp + + +cdef inline void remove_weighted_var(float64_t val, + float64_t w, + float64_t *t, + float64_t *sum_w, + float64_t *mean, + float64_t *nobs) nogil: + """ + Update weighted mean, sum of weights and sum of weighted squared + differences to remove value and weight pair from weighted variance + calculation using West's method. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + val: float64_t + window values + w: float64_t + window weights + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + mean: float64_t + weighted mean + nobs: float64_t + number of observations + """ + + cdef: + float64_t temp, q, r + + if notnan(val): + nobs[0] = nobs[0] - 1 + + if nobs[0]: + q = val - mean[0] + temp = sum_w[0] - w + r = q * w / temp + + mean[0] = mean[0] - r + t[0] = t[0] - r * sum_w[0] * q + sum_w[0] = temp + + else: + t[0] = 0 + sum_w[0] = 0 + mean[0] = 0 + + +def roll_weighted_var(float64_t[:] values, float64_t[:] weights, + int64_t minp, unsigned int ddof): + """ + Calculates weighted rolling variance using West's online algorithm. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + values: float64_t[:] + values to roll window over + weights: float64_t[:] + array of weights whose lenght is window size + minp: int64_t + minimum number of observations to calculate + variance of a window + ddof: unsigned int + the divisor used in variance calculations + is the window size - ddof + + Returns + ------- + output: float64_t[:] + weighted variances of windows + """ + + cdef: + float64_t t = 0, sum_w = 0, mean = 0, nobs = 0 + float64_t val, pre_val, w, pre_w + Py_ssize_t i, n, win_n + float64_t[:] output + + n = len(values) + win_n = len(weights) + output = np.empty(n, dtype=float) + + with nogil: + + for i in range(win_n): + add_weighted_var(values[i], weights[i], &t, + &sum_w, &mean, &nobs) + + output[i] = calc_weighted_var(t, sum_w, win_n, + ddof, nobs, minp) + + for i in range(win_n, n): + val = values[i] + pre_val = values[i - win_n] + + w = weights[i % win_n] + pre_w = weights[(i - win_n) % win_n] + + if notnan(val): + if pre_val == pre_val: + remove_weighted_var(pre_val, pre_w, &t, + &sum_w, &mean, &nobs) + + add_weighted_var(val, w, &t, &sum_w, &mean, &nobs) + + elif pre_val == pre_val: + remove_weighted_var(pre_val, pre_w, &t, + &sum_w, &mean, &nobs) + + output[i] = calc_weighted_var(t, sum_w, win_n, + ddof, nobs, minp) + + return output + + # ---------------------------------------------------------------------- # Exponentially weighted moving average diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 47bd8f2ec593b..c3b7531ce5904 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -181,13 +181,13 @@ def mean(self, *args, **kwargs): def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name="expanding") + @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="expanding") + @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index bf5ea9c457e8a..ab59cb1170b75 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -4,7 +4,7 @@ """ from datetime import timedelta from textwrap import dedent -from typing import Callable, List, Optional, Set, Union +from typing import Callable, Dict, List, Optional, Set, Tuple, Union import warnings import numpy as np @@ -169,13 +169,30 @@ def __getattr__(self, attr): def _dir_additions(self): return self.obj._dir_additions() - def _get_window(self, other=None, **kwargs) -> int: + def _get_win_type(self, kwargs: Dict): """ - Returns window length + Exists for compatibility, overriden by subclass Window. Parameters ---------- - other: + kwargs : dict + ignored, exists for compatibility + + Returns + ------- + None + """ + return None + + def _get_window(self, other=None, win_type: Optional[str] = None) -> int: + """ + Return window length. + + Parameters + ---------- + other : + ignored, exists for compatibility + win_type : ignored, exists for compatibility Returns @@ -405,6 +422,7 @@ def _apply( ------- y : type of input """ + if center is None: center = self.center @@ -412,7 +430,8 @@ def _apply( check_minp = _use_window if window is None: - window = self._get_window(**kwargs) + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) blocks, obj = self._create_blocks() block_list = list(blocks) @@ -612,6 +631,126 @@ def aggregate(self, func, *args, **kwargs): """ ) + _shared_docs["var"] = dedent( + """ + Calculate unbiased %(name)s variance. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.var : Equivalent method for Series. + DataFrame.var : Equivalent method for DataFrame. + numpy.var : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in :meth:`Series.var` is different than the + default `ddof` of 0 in :func:`numpy.var`. + + A minimum of 1 period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 0.916667 + 4 0.800000 + 5 0.700000 + 6 0.619048 + dtype: float64 + """ + ) + + _shared_docs["std"] = dedent( + """ + Calculate %(name)s standard deviation. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.std : Equivalent method for Series. + DataFrame.std : Equivalent method for DataFrame. + numpy.std : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in Series.std is different than the default + `ddof` of 0 in numpy.std. + + A minimum of one period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 + """ + ) + class Window(_Window): """ @@ -783,15 +922,63 @@ def validate(self): else: raise ValueError("Invalid window {0}".format(window)) - def _get_window(self, other=None, **kwargs) -> np.ndarray: + def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]: """ - Provide validation for the window type, return the window - which has already been validated. + Extract arguments for the window type, provide validation for it + and return the validated window type. Parameters ---------- - other: + kwargs : dict + + Returns + ------- + win_type : str, or tuple + """ + # the below may pop from kwargs + def _validate_win_type(win_type, kwargs): + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } + + if win_type in arg_map: + win_args = _pop_args(win_type, arg_map[win_type], kwargs) + if win_type == "exponential": + # exponential window requires the first arg (center) + # to be set to None (necessary for symmetric window) + win_args.insert(0, None) + + return tuple([win_type] + win_args) + + return win_type + + def _pop_args(win_type, arg_names, kwargs): + msg = "%s window requires %%s" % win_type + all_args = [] + for n in arg_names: + if n not in kwargs: + raise ValueError(msg % n) + all_args.append(kwargs.pop(n)) + return all_args + + return _validate_win_type(self.win_type, kwargs) + + def _get_window( + self, other=None, win_type: Optional[Union[str, Tuple]] = None + ) -> np.ndarray: + """ + Get the window, weights. + + Parameters + ---------- + other : ignored, exists for compatibility + win_type : str, or tuple + type of window to create Returns ------- @@ -805,37 +992,6 @@ def _get_window(self, other=None, **kwargs) -> np.ndarray: elif is_integer(window): import scipy.signal as sig - # the below may pop from kwargs - def _validate_win_type(win_type, kwargs): - arg_map = { - "kaiser": ["beta"], - "gaussian": ["std"], - "general_gaussian": ["power", "width"], - "slepian": ["width"], - "exponential": ["tau"], - } - - if win_type in arg_map: - win_args = _pop_args(win_type, arg_map[win_type], kwargs) - if win_type == "exponential": - # exponential window requires the first arg (center) - # to be set to None (necessary for symmetric window) - win_args.insert(0, None) - - return tuple([win_type] + win_args) - - return win_type - - def _pop_args(win_type, arg_names, kwargs): - msg = "%s window requires %%s" % win_type - all_args = [] - for n in arg_names: - if n not in kwargs: - raise ValueError(msg % n) - all_args.append(kwargs.pop(n)) - return all_args - - win_type = _validate_win_type(self.win_type, kwargs) # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) @@ -844,7 +1000,7 @@ def _get_roll_func( ) -> Callable: def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, len(window)) - return cfunc(arg, window, minp) + return cfunc(arg, window, minp, **kwargs) return func @@ -922,6 +1078,18 @@ def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) return self._apply("roll_weighted_mean", **kwargs) + @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func("var", args, kwargs) + return self._apply("roll_weighted_var", ddof=ddof, **kwargs) + + @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func("std", args, kwargs) + return _zsqrt(self.var(ddof=ddof, **kwargs)) + class _Rolling(_Window): @property @@ -1176,66 +1344,6 @@ def mean(self, *args, **kwargs): def median(self, **kwargs): return self._apply("roll_median_c", "median", **kwargs) - _shared_docs["std"] = dedent( - """ - Calculate %(name)s standard deviation. - - Normalized by N-1 by default. This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - Series.%(name)s : Calling object with Series data. - DataFrame.%(name)s : Calling object with DataFrames. - Series.std : Equivalent method for Series. - DataFrame.std : Equivalent method for DataFrame. - numpy.std : Equivalent method for Numpy array. - - Notes - ----- - The default `ddof` of 1 used in Series.std is different than the default - `ddof` of 0 in numpy.std. - - A minimum of one period is required for the rolling calculation. - - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 1.000000 - 4 1.000000 - 5 1.154701 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 0.957427 - 4 0.894427 - 5 0.836660 - 6 0.786796 - dtype: float64 - """ - ) - def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) window = self._get_window() @@ -1251,66 +1359,6 @@ def f(arg, *args, **kwargs): f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs ) - _shared_docs["var"] = dedent( - """ - Calculate unbiased %(name)s variance. - - Normalized by N-1 by default. This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - Series.%(name)s : Calling object with Series data. - DataFrame.%(name)s : Calling object with DataFrames. - Series.var : Equivalent method for Series. - DataFrame.var : Equivalent method for DataFrame. - numpy.var : Equivalent method for Numpy array. - - Notes - ----- - The default `ddof` of 1 used in :meth:`Series.var` is different than the - default `ddof` of 0 in :func:`numpy.var`. - - A minimum of 1 period is required for the rolling calculation. - - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 1.000000 - 4 1.000000 - 5 1.333333 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 0.916667 - 4 0.800000 - 5 0.700000 - 6 0.619048 - dtype: float64 - """ - ) - def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) return self._apply( @@ -1845,13 +1893,13 @@ def mean(self, *args, **kwargs): def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name="rolling") + @Substitution(name="rolling", versionadded="") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="rolling") + @Substitution(name="rolling", versionadded="") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("var", args, kwargs) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index 3d6cd7d10bd10..36a0ddb3e02d7 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -119,64 +119,95 @@ def test_cmov_window_corner(self): assert len(result) == 5 @td.skip_if_no_scipy - def test_cmov_window_frame(self): + @pytest.mark.parametrize( + "f,xp", + [ + ( + "mean", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [9.252, 9.392], + [8.644, 9.906], + [8.87, 10.208], + [6.81, 8.588], + [7.792, 8.644], + [9.05, 7.824], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "std", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [3.789706, 4.068313], + [3.429232, 3.237411], + [3.589269, 3.220810], + [3.405195, 2.380655], + [3.281839, 2.369869], + [3.676846, 1.801799], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "var", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [14.36187, 16.55117], + [11.75963, 10.48083], + [12.88285, 10.37362], + [11.59535, 5.66752], + [10.77047, 5.61628], + [13.51920, 3.24648], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "sum", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [46.26, 46.96], + [43.22, 49.53], + [44.35, 51.04], + [34.05, 42.94], + [38.96, 43.22], + [45.25, 39.12], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ], + ) + def test_cmov_window_frame(self, f, xp): # Gh 8238 - vals = np.array( - [ - [12.18, 3.64], - [10.18, 9.16], - [13.24, 14.61], - [4.51, 8.11], - [6.15, 11.44], - [9.14, 6.21], - [11.31, 10.67], - [2.94, 6.51], - [9.42, 8.39], - [12.44, 7.34], - ] - ) - - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [9.252, 9.392], - [8.644, 9.906], - [8.87, 10.208], - [6.81, 8.588], - [7.792, 8.644], - [9.05, 7.824], - [np.nan, np.nan], - [np.nan, np.nan], - ] + df = DataFrame( + np.array( + [ + [12.18, 3.64], + [10.18, 9.16], + [13.24, 14.61], + [4.51, 8.11], + [6.15, 11.44], + [9.14, 6.21], + [11.31, 10.67], + [2.94, 6.51], + [9.42, 8.39], + [12.44, 7.34], + ] + ) ) + xp = DataFrame(np.array(xp)) - # DataFrame - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() - tm.assert_frame_equal(DataFrame(xp), rs) - - # invalid method - with pytest.raises(AttributeError): - (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) - - # sum - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [46.26, 46.96], - [43.22, 49.53], - [44.35, 51.04], - [34.05, 42.94], - [38.96, 43.22], - [45.25, 39.12], - [np.nan, np.nan], - [np.nan, np.nan], - ] - ) + roll = df.rolling(5, win_type="boxcar", center=True) + rs = getattr(roll, f)() - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() - tm.assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_na_min_periods(self): diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index f42c507e51511..39ab3ffd9319e 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -60,7 +60,7 @@ def test_numpy_compat(self, method): getattr(w, method)(dtype=np.float64) @td.skip_if_no_scipy - @pytest.mark.parametrize("arg", ["median", "var", "std", "kurt", "skew"]) + @pytest.mark.parametrize("arg", ["median", "kurt", "skew"]) def test_agg_function_support(self, arg): df = pd.DataFrame({"A": np.arange(5)}) roll = df.rolling(2, win_type="triang")