Whitelist std and var for use with custom rolling windows (#33448)

AlexKirko · web-flow · commit 0e37717c7149 · 2020-04-16T22:54:50.000-04:00
* stop throwing NotImplemented on std and var

* DOC: edit whatsnew

* restart checks

* restart checks

* TST: add kwargs to tests

* TST: add tests for std and var

* DOC: expand documentation on sample variance

* CLN: remove trailing whitespace

* CLN: remove double space

* CLN: remove pd_kwargs from the test
diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst
@@ -312,15 +312,35 @@ We provide a number of common statistical functions:
     :meth:`~Rolling.median`, Arithmetic median of values
     :meth:`~Rolling.min`, Minimum
     :meth:`~Rolling.max`, Maximum
-    :meth:`~Rolling.std`, Bessel-corrected sample standard deviation
-    :meth:`~Rolling.var`, Unbiased variance
+    :meth:`~Rolling.std`, Sample standard deviation
+    :meth:`~Rolling.var`, Sample variance
     :meth:`~Rolling.skew`, Sample skewness (3rd moment)
     :meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
     :meth:`~Rolling.quantile`, Sample quantile (value at %)
     :meth:`~Rolling.apply`, Generic apply
     :meth:`~Rolling.cov`, Unbiased covariance (binary)
     :meth:`~Rolling.corr`, Correlation (binary)
 
+.. _computation.window_variance.caveats:
+
+.. note::
+
+   Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample
+   variance formula by default, i.e. the sum of squared differences is divided by
+   ``window_size - 1`` and not by ``window_size`` during averaging. In statistics,
+   we use the sample variance when the dataset is drawn from a larger population
+   that we don't have access to. Using it implies that the data in our window is a
+   random sample from the population, and we are interested not in the variance
+   inside the specific window but in the variance of some general window that
+   our windows represent. In this situation, using the sample variance formula
+   results in an unbiased estimator and so is preferred.
+
+   Usually, we are instead interested in the variance of each window as we slide
+   it over the data, and in this case we should specify ``ddof=0`` when calling
+   these methods to use population variance instead of sample variance. Using
+   sample variance under these circumstances would result in a biased estimator
+   of the quantity we are trying to determine.
+
 .. _stats.rolling_apply:
 
 Rolling apply
@@ -848,15 +868,22 @@ Method summary
     :meth:`~Expanding.median`, Arithmetic median of values
     :meth:`~Expanding.min`, Minimum
     :meth:`~Expanding.max`, Maximum
-    :meth:`~Expanding.std`, Unbiased standard deviation
-    :meth:`~Expanding.var`, Unbiased variance
+    :meth:`~Expanding.std`, Sample standard deviation
+    :meth:`~Expanding.var`, Sample variance
     :meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
     :meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
     :meth:`~Expanding.quantile`, Sample quantile (value at %)
     :meth:`~Expanding.apply`, Generic apply
     :meth:`~Expanding.cov`, Unbiased covariance (binary)
     :meth:`~Expanding.corr`, Correlation (binary)
 
+.. note::
+
+   Using sample variance formulas for :meth:`~Expanding.std` and
+   :meth:`~Expanding.var` comes with the same caveats as using them with rolling
+   windows. See :ref:`this section <computation.window_variance.caveats>` for more
+   information.
+
 .. currentmodule:: pandas
 
 Aside from not having a ``window`` parameter, these functions have the same
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -174,7 +174,7 @@ Other API changes
 - Added :meth:`DataFrame.value_counts` (:issue:`5377`)
 - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
 - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
-- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
+- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
 - Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
 - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
 -
diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py
@@ -327,7 +327,17 @@ def func(arg, window, min_periods=None):
 
 def validate_baseindexer_support(func_name: Optional[str]) -> None:
     # GH 32865: These functions work correctly with a BaseIndexer subclass
-    BASEINDEXER_WHITELIST = {"min", "max", "mean", "sum", "median", "kurt", "quantile"}
+    BASEINDEXER_WHITELIST = {
+        "min",
+        "max",
+        "mean",
+        "sum",
+        "median",
+        "std",
+        "var",
+        "kurt",
+        "quantile",
+    }
     if isinstance(func_name, str) and func_name not in BASEINDEXER_WHITELIST:
         raise NotImplementedError(
             f"{func_name} is not supported with using a BaseIndexer "
diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py
@@ -82,7 +82,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
         df.rolling(indexer, win_type="boxcar")
 
 
-@pytest.mark.parametrize("func", ["std", "var", "count", "skew", "cov", "corr"])
+@pytest.mark.parametrize("func", ["count", "skew", "cov", "corr"])
 def test_notimplemented_functions(func):
     # GH 32865
     class CustomIndexer(BaseIndexer):
@@ -97,13 +97,52 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
 
 @pytest.mark.parametrize("constructor", [Series, DataFrame])
 @pytest.mark.parametrize(
-    "func,alt_func,expected",
+    "func,np_func,expected,np_kwargs",
     [
-        ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan]),
-        ("max", np.max, [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan]),
+        ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {},),
+        (
+            "max",
+            np.max,
+            [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan],
+            {},
+        ),
+        (
+            "std",
+            np.std,
+            [
+                1.0,
+                1.0,
+                1.0,
+                55.71654452,
+                54.85739087,
+                53.9845657,
+                1.0,
+                1.0,
+                0.70710678,
+                np.nan,
+            ],
+            {"ddof": 1},
+        ),
+        (
+            "var",
+            np.var,
+            [
+                1.0,
+                1.0,
+                1.0,
+                3104.333333,
+                3009.333333,
+                2914.333333,
+                1.0,
+                1.0,
+                0.500000,
+                np.nan,
+            ],
+            {"ddof": 1},
+        ),
     ],
 )
-def test_rolling_forward_window(constructor, func, alt_func, expected):
+def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs):
     # GH 32865
     values = np.arange(10)
     values[5] = 100.0
@@ -124,5 +163,5 @@ def test_rolling_forward_window(constructor, func, alt_func, expected):
     result = getattr(rolling, func)()
     expected = constructor(expected)
     tm.assert_equal(result, expected)
-    expected2 = constructor(rolling.apply(lambda x: alt_func(x)))
+    expected2 = constructor(rolling.apply(lambda x: np_func(x, **np_kwargs)))
     tm.assert_equal(result, expected2)

Original file line number	Diff line number	Diff line change
`@@ -174,7 +174,7 @@ Other API changes`
`174`	`174`	- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
`175`	`175`	- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
`176`	`176`	- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
`177`		-- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
	`177`	+- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
`178`	`178`	- Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
`179`	`179`	- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
`180`	`180`	`-`