ENH: use same value counts for roll_var to remove floating point artifacts (#46431)

auderson · auderson · jreback · web-flow · commit a8968bfa696d · 2022-04-19T14:25:13.000-07:00
* fix merge

* bug fix: nobs == 0 == min_periods

* use Py_ssize_t in place of int64_t

* use tm.assert_series_equal

* add more checks in roll numba methods

* add more checks in roll numba methods

* cancel check exact

* test 32bit: undo Py_ssize_t

* add explanation for test_rolling_var_same_value_count_logic

* comments updated

* add equal-zero test for test_rolling_var_numerical_issues &amp; test_rolling_var_floating_artifact_precision

* update docs

* add more tests

* update whats new

* update whats new

* update doc

Co-authored-by: auderson &lt;liao.renjie@techfin.ai&gt;
Co-authored-by: Jeff Reback &lt;jeff@reback.net&gt;
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -603,6 +603,7 @@ Groupby/resample/rolling
 - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`)
 - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`)
 - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
+- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
 - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
 - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
 
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
-                               float64_t ssqdm_x) nogil:
+                               float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
     cdef:
         float64_t result
 
     # Variance is unchanged if no observation is added or removed
     if (nobs >= minp) and (nobs > ddof):
 
-        # pathological case
-        if nobs == 1:
+        # pathological case & repeatedly same values case
+        if nobs == 1 or num_consecutive_same_value >= nobs:
             result = 0
         else:
             result = ssqdm_x / (nobs - <float64_t>ddof)
@@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
 
 
 cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
-                         float64_t *ssqdm_x, float64_t *compensation) nogil:
+                         float64_t *ssqdm_x, float64_t *compensation,
+                         int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
     """ add a value from the var calc """
     cdef:
         float64_t delta, prev_mean, y, t
@@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
         return
 
     nobs[0] = nobs[0] + 1
+
+    # GH#42064, record num of same values to remove floating point artifacts
+    if val == prev_value[0]:
+        num_consecutive_same_value[0] += 1
+    else:
+        # reset to 1 (include current value itself)
+        num_consecutive_same_value[0] = 1
+    prev_value[0] = val
+
     # Welford's method for the online variance-calculation
     # using Kahan summation
     # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
     """
     cdef:
         float64_t mean_x, ssqdm_x, nobs, compensation_add,
-        float64_t compensation_remove,
-        float64_t val, prev, delta, mean_x_old
-        int64_t s, e
+        float64_t compensation_remove, prev_value
+        int64_t s, e, num_consecutive_same_value
         Py_ssize_t i, j, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
                 for j in range(s, e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+            output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0.0
diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py
@@ -81,7 +81,7 @@ def sliding_sum(
                     val, nobs, sum_x, compensation_add
                 )
 
-        if nobs == 0 == nobs:
+        if nobs == 0 == min_periods:
             result = 0.0
         elif nobs >= min_periods:
             result = sum_x
diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py
@@ -16,9 +16,22 @@
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
 def add_var(
-    val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
-) -> tuple[int, float, float, float]:
+    val: float,
+    nobs: int,
+    mean_x: float,
+    ssqdm_x: float,
+    compensation: float,
+    num_consecutive_same_value: int,
+    prev_value: float,
+) -> tuple[int, float, float, float, int, float]:
     if not np.isnan(val):
+
+        if val == prev_value:
+            num_consecutive_same_value += 1
+        else:
+            num_consecutive_same_value = 1
+        prev_value = val
+
         nobs += 1
         prev_mean = mean_x - compensation
         y = val - compensation
@@ -30,7 +43,7 @@ def add_var(
         else:
             mean_x = 0
         ssqdm_x += (val - prev_mean) * (val - mean_x)
-    return nobs, mean_x, ssqdm_x, compensation
+    return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
 
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
@@ -79,10 +92,27 @@ def sliding_var(
         s = start[i]
         e = end[i]
         if i == 0 or not is_monotonic_increasing_bounds:
+
+            prev_value = values[s]
+            num_consecutive_same_value = 0
+
             for j in range(s, e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
         else:
             for j in range(start[i - 1], s):
@@ -93,12 +123,25 @@ def sliding_var(
 
             for j in range(end[i - 1], e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
 
         if nobs >= min_periods and nobs > ddof:
-            if nobs == 1:
+            if nobs == 1 or num_consecutive_same_value >= nobs:
                 result = 0.0
             else:
                 result = ssqdm_x / (nobs - ddof)
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -2150,24 +2150,21 @@ def median(
         The default ``ddof`` of 1 used in :meth:`Series.std` is different
         than the default ``ddof`` of 0 in :func:`numpy.std`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).std()
-        0             NaN
-        1             NaN
-        2    5.773503e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.154701e+00
-        6    2.580957e-08
+        0         NaN
+        1         NaN
+        2    0.577350
+        3    1.000000
+        4    1.000000
+        5    1.154701
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),
@@ -2212,24 +2209,21 @@ def std(
         The default ``ddof`` of 1 used in :meth:`Series.var` is different
         than the default ``ddof`` of 0 in :func:`numpy.var`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).var()
-        0             NaN
-        1             NaN
-        2    3.333333e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.333333e+00
-        6    6.661338e-16
+        0         NaN
+        1         NaN
+        2    0.333333
+        3    1.000000
+        4    1.000000
+        5    1.333333
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -79,7 +79,20 @@ def f(x, *args):
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
+        "data",
+        [
+            DataFrame(np.eye(5)),
+            DataFrame(
+                [
+                    [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
+                    [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
+                    [np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5],
+                ]
+            ).T,
+            Series(range(5), name="foo"),
+            Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
+            Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
+        ],
     )
     def test_numba_vs_cython_rolling_methods(
         self,
@@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods(
 
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
 
-        roll = data.rolling(2, step=step)
+        roll = data.rolling(3, step=step)
         result = getattr(roll, method)(
             engine="numba", engine_kwargs=engine_kwargs, **kwargs
         )
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1203,6 +1203,9 @@ def test_rolling_var_numerical_issues(func, third_value, values):
     result = getattr(ds.rolling(2), func)()
     expected = Series([np.nan] + values)
     tm.assert_series_equal(result, expected)
+    # GH 42064
+    # new `roll_var` will output 0.0 correctly
+    tm.assert_series_equal(result == 0, expected == 0)
 
 
 def test_timeoffset_as_window_parameter_for_corr():
@@ -1509,6 +1512,9 @@ def test_rolling_var_floating_artifact_precision():
     result = s.rolling(3).var()
     expected = Series([np.nan, np.nan, 4 / 3, 0])
     tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
+    # GH 42064
+    # new `roll_var` will output 0.0 correctly
+    tm.assert_series_equal(result == 0, expected == 0)
 
 
 def test_rolling_std_small_values():
@@ -1769,3 +1775,77 @@ def test_step_not_integer_raises():
 def test_step_not_positive_raises():
     with pytest.raises(ValueError, match="step must be >= 0"):
         DataFrame(range(2)).rolling(1, step=-1)
+
+
+@pytest.mark.parametrize(
+    ["values", "window", "min_periods", "expected"],
+    [
+        [
+            [20, 10, 10, np.inf, 1, 1, 2, 3],
+            3,
+            1,
+            [np.nan, 50, 100 / 3, 0, 40.5, 0, 1 / 3, 1],
+        ],
+        [
+            [20, 10, 10, np.nan, 10, 1, 2, 3],
+            3,
+            1,
+            [np.nan, 50, 100 / 3, 0, 0, 40.5, 73 / 3, 1],
+        ],
+        [
+            [np.nan, 5, 6, 7, 5, 5, 5],
+            3,
+            3,
+            [np.nan] * 3 + [1, 1, 4 / 3, 0],
+        ],
+        [
+            [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [1 / 3, 0],
+        ],
+        [
+            [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [16 / 3, 0],
+        ],
+        [
+            [5, 7] * 4,
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3] * 6,
+        ],
+        [
+            [5, 7, 5, np.nan, 7, 5, 7],
+            3,
+            2,
+            [np.nan, 2, 4 / 3] + [2] * 3 + [4 / 3],
+        ],
+    ],
+)
+def test_rolling_var_same_value_count_logic(values, window, min_periods, expected):
+    # GH 42064.
+
+    expected = Series(expected)
+    sr = Series(values)
+
+    # With new algo implemented, result will be set to .0 in rolling var
+    # if sufficient amount of consecutively same values are found.
+    result_var = sr.rolling(window, min_periods=min_periods).var()
+
+    # use `assert_series_equal` twice to check for equality,
+    # because `check_exact=True` will fail in 32-bit tests due to
+    # precision loss.
+
+    # 1. result should be close to correct value
+    # non-zero values can still differ slightly from "truth"
+    # as the result of online algorithm
+    tm.assert_series_equal(result_var, expected)
+    # 2. zeros should be exactly the same since the new algo takes effect here
+    tm.assert_series_equal(expected == 0, result_var == 0)
+
+    # std should also pass as it's just a sqrt of var
+    result_std = sr.rolling(window, min_periods=min_periods).std()
+    tm.assert_series_equal(result_std, np.sqrt(expected))
+    tm.assert_series_equal(expected == 0, result_std == 0)

Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,7 @@ def sliding_sum(`
`81`	`81`	`val, nobs, sum_x, compensation_add`
`82`	`82`	`)`
`83`	`83`
`84`		`- if nobs == 0 == nobs:`
	`84`	`+ if nobs == 0 == min_periods:`
`85`	`85`	`result = 0.0`
`86`	`86`	`elif nobs >= min_periods:`
`87`	`87`	`result = sum_x`