BUG: Remove artificial precision limit in rolling var & std (pandas-dev#40505)

mroeschke · JulianWgs · commit 20601a996d02 · 2021-07-03T13:07:55.000+02:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -299,6 +299,24 @@ cast to ``dtype=object`` (:issue:`38709`)
    ser
    ser2
 
+
+.. _whatsnew_130.notable_bug_fixes.rolling_var_precision:
+
+Removed artificial truncation in rolling variance and standard deviation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`core.window.Rolling.std` and :meth:`core.window.Rolling.var` will no longer
+artificially truncate to zero results that are smaller than ``~1e-8`` and ``~1e-15``,
+respectively (:issue:`37051`, :issue:`40448`, :issue:`39872`).
+
+However, results expected to be exactly zero may now contain small floating point artifacts when rolling over larger values.
+
+.. ipython:: python
+
+   s = pd.Series([7, 5, 5, 5])
+   s.rolling(3).var()
+
+
 .. _whatsnew_130.api_breaking.deps:
 
 Increased minimum versions for dependencies
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -283,10 +283,6 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
             result = 0
         else:
             result = ssqdm_x / (nobs - <float64_t>ddof)
-            # Fix for numerical imprecision.
-            # Can be result < 0 once Kahan Summation is implemented
-            if result < 1e-14:
-                result = 0
     else:
         result = NaN
 
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -1882,21 +1882,24 @@ def median(
         The default ``ddof`` of 1 used in :meth:`Series.std` is different
         than the default ``ddof`` of 0 in :func:`numpy.std`.
 
-        A minimum of one period is required for the rolling calculation.\n
+        A minimum of one period is required for the rolling calculation.
+
+        The implementation is susceptible to floating point imprecision as
+        shown in the example below.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).std()
-        0         NaN
-        1         NaN
-        2    0.577350
-        3    1.000000
-        4    1.000000
-        5    1.154701
-        6    0.000000
+        0             NaN
+        1             NaN
+        2    5.773503e-01
+        3    1.000000e+00
+        4    1.000000e+00
+        5    1.154701e+00
+        6    2.580957e-08
         dtype: float64
         """
         ).replace("\n", "", 1),
@@ -1931,21 +1934,24 @@ def std(self, ddof: int = 1, *args, **kwargs):
         The default ``ddof`` of 1 used in :meth:`Series.var` is different
         than the default ``ddof`` of 0 in :func:`numpy.var`.
 
-        A minimum of one period is required for the rolling calculation.\n
+        A minimum of one period is required for the rolling calculation.
+
+        The implementation is susceptible to floating point imprecision as
+        shown in the example below.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).var()
-        0         NaN
-        1         NaN
-        2    0.333333
-        3    1.000000
-        4    1.000000
-        5    1.333333
-        6    0.000000
+        0             NaN
+        1             NaN
+        2    3.333333e-01
+        3    1.000000e+00
+        4    1.000000e+00
+        5    1.333333e+00
+        6    6.661338e-16
         dtype: float64
         """
         ).replace("\n", "", 1),
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1150,3 +1150,25 @@ def test_rolling_descending_date_order_with_offset(window, frame_or_series):
     idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1d")
     expected = frame_or_series([np.nan, 3, 2], index=idx)
     tm.assert_equal(result, expected)
+
+
+def test_rolling_var_floating_artifact_precision():
+    # GH 37051: the all-equal last window must give a variance within 1e-15 of zero
+    s = Series([7, 5, 5, 5])
+    result = s.rolling(3).var()
+    expected = Series([np.nan, np.nan, 4 / 3, 0])
+    tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
+
+
+def test_rolling_std_small_values():
+    # GH 37051: a std on the order of 1e-9 must be returned, not truncated to zero
+    s = Series(
+        [
+            0.00000054,
+            0.00000053,
+            0.00000054,
+        ]
+    )
+    result = s.rolling(2).std()
+    expected = Series([np.nan, 7.071068e-9, 7.071068e-9])
+    tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)