From cb765063c45cc1a9690be876ca32c52deffd10b0 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Thu, 9 Apr 2020 11:50:11 +0300 Subject: [PATCH 01/10] stop throwing NotImplemented on std and var --- pandas/core/window/common.py | 12 +++++++++++- pandas/tests/window/test_base_indexer.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 05f19de19f9f7..40f17126fa163 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -327,7 +327,17 @@ def func(arg, window, min_periods=None): def validate_baseindexer_support(func_name: Optional[str]) -> None: # GH 32865: These functions work correctly with a BaseIndexer subclass - BASEINDEXER_WHITELIST = {"min", "max", "mean", "sum", "median", "kurt", "quantile"} + BASEINDEXER_WHITELIST = { + "min", + "max", + "mean", + "sum", + "median", + "std", + "var", + "kurt", + "quantile", + } if isinstance(func_name, str) and func_name not in BASEINDEXER_WHITELIST: raise NotImplementedError( f"{func_name} is not supported with using a BaseIndexer " diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index bb93c70b8a597..f48c507f7d239 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -82,7 +82,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): df.rolling(indexer, win_type="boxcar") -@pytest.mark.parametrize("func", ["std", "var", "count", "skew", "cov", "corr"]) +@pytest.mark.parametrize("func", ["count", "skew", "cov", "corr"]) def test_notimplemented_functions(func): # GH 32865 class CustomIndexer(BaseIndexer): From 5062fb5ea17c8307ee5fee520c891d123f1c57da Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 10 Apr 2020 10:21:10 +0300 Subject: [PATCH 02/10] DOC: edit whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c39377899a20..7c4a4d03eda81 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -107,7 +107,7 @@ Other API changes - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``std``, ``var``, ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`) +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`) - Using a :func:`pandas.api.indexers.BaseIndexer` with ``min``, ``max`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. - From ada2a7dc685c08dfaeda734eb53c33cb674f5d75 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 10 Apr 2020 11:50:52 +0300 Subject: [PATCH 03/10] restart checks From 3c712c903707d3f825ffea56c597e70ecbf488a2 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Fri, 10 Apr 2020 16:20:19 +0300 Subject: [PATCH 04/10] restart checks From eece9e775fe398a26e17c9a44669b235650c6e6c Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 11 Apr 2020 10:40:53 +0300 Subject: [PATCH 05/10] TST: add kwargs to tests --- pandas/tests/window/test_base_indexer.py | 30 +++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index f48c507f7d239..2cd91f830dafa 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -97,13 +97,27 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize("constructor", [Series, DataFrame]) @pytest.mark.parametrize( - "func,alt_func,expected", + "func,np_func,expected,pd_kwargs,np_kwargs", [ - ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan]), - ("max", np.max, [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan]), + ( + "min", + np.min, + [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], + {}, + {}, + ), + ( + "max", + np.max, + [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan], + {}, + {}, + ), ], ) -def test_rolling_forward_window(constructor, func, alt_func, expected): +def test_rolling_forward_window( + constructor, func, np_func, expected, pd_kwargs, np_kwargs +): # GH 32865 values = np.arange(10) values[5] = 100.0 @@ -113,16 +127,16 @@ def test_rolling_forward_window(constructor, func, alt_func, expected): match = "Forward-looking windows can't have center=True" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, center=True) - result = getattr(rolling, func)() + result = getattr(rolling, func)(**pd_kwargs) match = "Forward-looking windows don't support setting the closed argument" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, closed="right") - result = getattr(rolling, func)() + result = getattr(rolling, func)(**pd_kwargs) rolling = constructor(values).rolling(window=indexer, min_periods=2) - result = getattr(rolling, func)() + result = getattr(rolling, func)(**pd_kwargs) expected = constructor(expected) tm.assert_equal(result, expected) - expected2 = constructor(rolling.apply(lambda x: alt_func(x))) + expected2 = constructor(rolling.apply(lambda x: np_func(x, **np_kwargs))) tm.assert_equal(result, expected2) From 2cf13c729396a75815a63f014f2b038d99fb1e5d Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 11 Apr 2020 10:45:43 +0300 Subject: [PATCH 06/10] TST: add tests for std and var --- pandas/tests/window/test_base_indexer.py | 42 ++++++++++++++++++++---- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 2cd91f830dafa..358147754c610 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -99,19 +99,49 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( "func,np_func,expected,pd_kwargs,np_kwargs", [ + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}, {},), ( - "min", - np.min, - [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], + "max", + np.max, + [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan], {}, {}, ), ( - "max", - np.max, - [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan], + "std", + np.std, + [ + 1.0, + 1.0, + 1.0, + 55.71654452, + 54.85739087, + 53.9845657, + 1.0, + 1.0, + 0.70710678, + np.nan, + ], {}, + {"ddof": 1}, + ), + ( + "var", + np.var, + [ + 1.0, + 1.0, + 1.0, + 3104.333333, + 3009.333333, + 2914.333333, + 1.0, + 1.0, + 0.500000, + np.nan, + ], {}, + {"ddof": 1}, ), ], ) From 9f3540e2243ffdd86f6e2d870a433f3cdfba5a14 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 11 Apr 2020 11:39:01 +0300 Subject: [PATCH 07/10] DOC: expand documentation on sample variance --- doc/source/user_guide/computation.rst | 35 ++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index af2f02a09428b..1bd98270698ef 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -312,8 +312,8 @@ We provide a number of common statistical functions: :meth:`~Rolling.median`, Arithmetic median of values :meth:`~Rolling.min`, Minimum :meth:`~Rolling.max`, Maximum - :meth:`~Rolling.std`, Bessel-corrected sample standard deviation - :meth:`~Rolling.var`, Unbiased variance + :meth:`~Rolling.std`, Sample standard deviation + :meth:`~Rolling.var`, Sample variance :meth:`~Rolling.skew`, Sample skewness (3rd moment) :meth:`~Rolling.kurt`, Sample kurtosis (4th moment) :meth:`~Rolling.quantile`, Sample quantile (value at %) @@ -321,6 +321,26 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. _computation.window_variance.caveats: + +.. note:: + + Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample + variance formula by default, i.e. the sum of squared differences is divided by + ``window_size - 1`` and not by ``window_size`` during averaging. In statistics, + we use sample when the dataset is drawn from a larger population that we + don't have access to. Using it implies that the data in our window is a + random sample from the population, and we are interested not in the variance + inside the specific window but in the variance of some general window that + our windows represent. In this situation, using the sample variance formula + results in an unbiased estimator and so is preferred. + + Usually, we are instead interested in the variance of each window as we slide + it over the data, and in this case we should specify ``ddof=0`` when calling + these methods to use population variance instead of sample variance. Using + sample variance under the circumstances would result in a biased estimator + of the variable we are trying to determine. + .. _stats.rolling_apply: Rolling apply @@ -848,8 +868,8 @@ Method summary :meth:`~Expanding.median`, Arithmetic median of values :meth:`~Expanding.min`, Minimum :meth:`~Expanding.max`, Maximum - :meth:`~Expanding.std`, Unbiased standard deviation - :meth:`~Expanding.var`, Unbiased variance + :meth:`~Expanding.std`, Sample standard deviation + :meth:`~Expanding.var`, Sample variance :meth:`~Expanding.skew`, Unbiased skewness (3rd moment) :meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment) :meth:`~Expanding.quantile`, Sample quantile (value at %) @@ -857,6 +877,13 @@ Method summary :meth:`~Expanding.cov`, Unbiased covariance (binary) :meth:`~Expanding.corr`, Correlation (binary) +.. note:: + + Using sample variance formulas for :meth:`~Expanding.std` and + :meth:`~Expanding.var` comes with the same caveats as using them with rolling + windows. See :ref:`this section ` for more + information. + .. currentmodule:: pandas Aside from not having a ``window`` parameter, these functions have the same From 2351c4f07fc9b8fb25cc73b76131eb013a2f83d7 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Sat, 11 Apr 2020 12:06:22 +0300 Subject: [PATCH 08/10] CLN: remove trailing whitespace --- doc/source/user_guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 1bd98270698ef..a8a705f1e71af 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -339,7 +339,7 @@ We provide a number of common statistical functions: it over the data, and in this case we should specify ``ddof=0`` when calling these methods to use population variance instead of sample variance. Using sample variance under the circumstances would result in a biased estimator - of the variable we are trying to determine. + of the variable we are trying to determine. .. _stats.rolling_apply: From 9f286aad4e07333dfda38a496580bf3236feaa49 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 13 Apr 2020 08:56:06 +0300 Subject: [PATCH 09/10] CLN: remove double space --- doc/source/user_guide/computation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index a8a705f1e71af..d7d025981f2f4 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -328,7 +328,7 @@ We provide a number of common statistical functions: Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample variance formula by default, i.e. the sum of squared differences is divided by ``window_size - 1`` and not by ``window_size`` during averaging. In statistics, - we use sample when the dataset is drawn from a larger population that we + we use sample when the dataset is drawn from a larger population that we don't have access to. Using it implies that the data in our window is a random sample from the population, and we are interested not in the variance inside the specific window but in the variance of some general window that From 0318f8832311504a13bb3eb4d3d468e7354c3828 Mon Sep 17 00:00:00 2001 From: Alexander Kirko Date: Mon, 13 Apr 2020 08:59:11 +0300 Subject: [PATCH 10/10] CLN: remove pd_kwargs from the test --- pandas/tests/window/test_base_indexer.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 358147754c610..43489e310bb93 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -97,15 +97,14 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize("constructor", [Series, DataFrame]) @pytest.mark.parametrize( - "func,np_func,expected,pd_kwargs,np_kwargs", + "func,np_func,expected,np_kwargs", [ - ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}, {},), + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {},), ( "max", np.max, [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan], {}, - {}, ), ( "std", @@ -122,7 +121,6 @@ def get_window_bounds(self, num_values, min_periods, center, closed): 0.70710678, np.nan, ], - {}, {"ddof": 1}, ), ( @@ -140,14 +138,11 @@ def get_window_bounds(self, num_values, min_periods, center, closed): 0.500000, np.nan, ], - {}, {"ddof": 1}, ), ], ) -def test_rolling_forward_window( - constructor, func, np_func, expected, pd_kwargs, np_kwargs -): +def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 values = np.arange(10) values[5] = 100.0 @@ -157,15 +152,15 @@ def test_rolling_forward_window( match = "Forward-looking windows can't have center=True" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, center=True) - result = getattr(rolling, func)(**pd_kwargs) + result = getattr(rolling, func)() match = "Forward-looking windows don't support setting the closed argument" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, closed="right") - result = getattr(rolling, func)(**pd_kwargs) + result = getattr(rolling, func)() rolling = constructor(values).rolling(window=indexer, min_periods=2) - result = getattr(rolling, func)(**pd_kwargs) + result = getattr(rolling, func)() expected = constructor(expected) tm.assert_equal(result, expected) expected2 = constructor(rolling.apply(lambda x: np_func(x, **np_kwargs)))