diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index af2f02a09428b..d7d025981f2f4 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -312,8 +312,8 @@ We provide a number of common statistical functions: :meth:`~Rolling.median`, Arithmetic median of values :meth:`~Rolling.min`, Minimum :meth:`~Rolling.max`, Maximum - :meth:`~Rolling.std`, Bessel-corrected sample standard deviation - :meth:`~Rolling.var`, Unbiased variance + :meth:`~Rolling.std`, Sample standard deviation + :meth:`~Rolling.var`, Sample variance :meth:`~Rolling.skew`, Sample skewness (3rd moment) :meth:`~Rolling.kurt`, Sample kurtosis (4th moment) :meth:`~Rolling.quantile`, Sample quantile (value at %) @@ -321,6 +321,26 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. _computation.window_variance.caveats: + +.. note:: + + Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample + variance formula by default, i.e. the sum of squared differences is divided by + ``window_size - 1`` and not by ``window_size`` during averaging. In statistics, + we use the sample variance when the dataset is drawn from a larger population that we + don't have access to. Using it implies that the data in our window is a + random sample from the population, and we are interested not in the variance + inside the specific window but in the variance of some general window that + our windows represent. In this situation, using the sample variance formula + results in an unbiased estimator and so is preferred. + + Usually, we are instead interested in the variance of each window as we slide + it over the data, and in this case we should specify ``ddof=0`` when calling + these methods to use population variance instead of sample variance. 
Using + sample variance under these circumstances would result in a biased estimator + of the variable we are trying to determine. + .. _stats.rolling_apply: Rolling apply @@ -848,8 +868,8 @@ Method summary :meth:`~Expanding.median`, Arithmetic median of values :meth:`~Expanding.min`, Minimum :meth:`~Expanding.max`, Maximum - :meth:`~Expanding.std`, Unbiased standard deviation - :meth:`~Expanding.var`, Unbiased variance + :meth:`~Expanding.std`, Sample standard deviation + :meth:`~Expanding.var`, Sample variance :meth:`~Expanding.skew`, Unbiased skewness (3rd moment) :meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment) :meth:`~Expanding.quantile`, Sample quantile (value at %) @@ -857,6 +877,13 @@ Method summary :meth:`~Expanding.cov`, Unbiased covariance (binary) :meth:`~Expanding.corr`, Correlation (binary) +.. note:: + + Using sample variance formulas for :meth:`~Expanding.std` and + :meth:`~Expanding.var` comes with the same caveats as using them with rolling + windows. See :ref:`this section <computation.window_variance.caveats>` for more + information. + .. 
currentmodule:: pandas Aside from not having a ``window`` parameter, these functions have the same diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2f4e961ff433f..00b52c0650920 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -129,7 +129,7 @@ Other API changes - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`) +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`) - Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. 
- diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 05f19de19f9f7..40f17126fa163 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -327,7 +327,17 @@ def func(arg, window, min_periods=None): def validate_baseindexer_support(func_name: Optional[str]) -> None: # GH 32865: These functions work correctly with a BaseIndexer subclass - BASEINDEXER_WHITELIST = {"min", "max", "mean", "sum", "median", "kurt", "quantile"} + BASEINDEXER_WHITELIST = { + "min", + "max", + "mean", + "sum", + "median", + "std", + "var", + "kurt", + "quantile", + } if isinstance(func_name, str) and func_name not in BASEINDEXER_WHITELIST: raise NotImplementedError( f"{func_name} is not supported with using a BaseIndexer " diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index bb93c70b8a597..43489e310bb93 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -82,7 +82,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): df.rolling(indexer, win_type="boxcar") -@pytest.mark.parametrize("func", ["std", "var", "count", "skew", "cov", "corr"]) +@pytest.mark.parametrize("func", ["count", "skew", "cov", "corr"]) def test_notimplemented_functions(func): # GH 32865 class CustomIndexer(BaseIndexer): @@ -97,13 +97,52 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize("constructor", [Series, DataFrame]) @pytest.mark.parametrize( - "func,alt_func,expected", + "func,np_func,expected,np_kwargs", [ - ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan]), - ("max", np.max, [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan]), + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {},), + ( + "max", + np.max, + [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan], + {}, + ), + ( + "std", + np.std, + [ + 1.0, + 1.0, + 1.0, + 55.71654452, + 
54.85739087, + 53.9845657, + 1.0, + 1.0, + 0.70710678, + np.nan, + ], + {"ddof": 1}, + ), + ( + "var", + np.var, + [ + 1.0, + 1.0, + 1.0, + 3104.333333, + 3009.333333, + 2914.333333, + 1.0, + 1.0, + 0.500000, + np.nan, + ], + {"ddof": 1}, + ), ], ) -def test_rolling_forward_window(constructor, func, alt_func, expected): +def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 values = np.arange(10) values[5] = 100.0 @@ -124,5 +163,5 @@ def test_rolling_forward_window(constructor, func, alt_func, expected): result = getattr(rolling, func)() expected = constructor(expected) tm.assert_equal(result, expected) - expected2 = constructor(rolling.apply(lambda x: alt_func(x))) + expected2 = constructor(rolling.apply(lambda x: np_func(x, **np_kwargs))) tm.assert_equal(result, expected2)