fix merge

auderson · auderson · commit ac1cec468983 · 2022-03-25T14:00:15.000+08:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -46,6 +46,7 @@ Other enhancements
 - Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
 - :class:`Series` and :class:`DataFrame` with ``IntegerDtype`` now supports bitwise operations (:issue:`34463`)
 - Add ``milliseconds`` field support for :class:`~pandas.DateOffset` (:issue:`43371`)
+- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now correctly return ``0`` for windows in which all values are the same (:issue:`42064`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
-                               float64_t ssqdm_x) nogil:
+                               float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
     cdef:
         float64_t result
 
     # Variance is unchanged if no observation is added or removed
     if (nobs >= minp) and (nobs > ddof):
 
-        # pathological case
-        if nobs == 1:
+        # pathological case (one observation), or all observations are the same value
+        if nobs == 1 or num_consecutive_same_value >= nobs:
             result = 0
         else:
             result = ssqdm_x / (nobs - <float64_t>ddof)
@@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
 
 
 cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
-                         float64_t *ssqdm_x, float64_t *compensation) nogil:
+                         float64_t *ssqdm_x, float64_t *compensation,
+                         int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
     """ add a value from the var calc """
     cdef:
         float64_t delta, prev_mean, y, t
@@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
         return
 
     nobs[0] = nobs[0] + 1
+
+    # GH#42064, record num of same values to remove floating point artifacts
+    if val == prev_value[0]:
+        num_consecutive_same_value[0] += 1
+    else:
+        # reset to 1 (include current value itself)
+        num_consecutive_same_value[0] = 1
+    prev_value[0] = val
+
     # Welford's method for the online variance-calculation
     # using Kahan summation
     # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
     """
     cdef:
         float64_t mean_x, ssqdm_x, nobs, compensation_add,
-        float64_t compensation_remove,
-        float64_t val, prev, delta, mean_x_old
-        int64_t s, e
+        float64_t compensation_remove, prev_value
+        int64_t s, e, num_consecutive_same_value
         Py_ssize_t i, j, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
@@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
             # never removed
             if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
 
+                prev_value = values[s]
+                num_consecutive_same_value = 0
+
                 mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
                 for j in range(s, e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
             else:
 
@@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
+                            &num_consecutive_same_value, &prev_value)
 
-            output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
+            output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0.0
diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py
@@ -16,9 +16,22 @@
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
 def add_var(
-    val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
-) -> tuple[int, float, float, float]:
+    val: float,
+    nobs: int,
+    mean_x: float,
+    ssqdm_x: float,
+    compensation: float,
+    num_consecutive_same_value: int,
+    prev_value: float,
+) -> tuple[int, float, float, float, int, float]:
     if not np.isnan(val):
+
+        if val == prev_value:
+            num_consecutive_same_value += 1
+        else:
+            num_consecutive_same_value = 1
+        prev_value = val
+
         nobs += 1
         prev_mean = mean_x - compensation
         y = val - compensation
@@ -30,7 +43,7 @@ def add_var(
         else:
             mean_x = 0
         ssqdm_x += (val - prev_mean) * (val - mean_x)
-    return nobs, mean_x, ssqdm_x, compensation
+    return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
 
 
 @numba.jit(nopython=True, nogil=True, parallel=False)
@@ -79,10 +92,27 @@ def sliding_var(
         s = start[i]
         e = end[i]
         if i == 0 or not is_monotonic_increasing_bounds:
+
+            prev_value = values[s]
+            num_consecutive_same_value = 0
+
             for j in range(s, e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
         else:
             for j in range(start[i - 1], s):
@@ -93,12 +123,25 @@ def sliding_var(
 
             for j in range(end[i - 1], e):
                 val = values[j]
-                nobs, mean_x, ssqdm_x, compensation_add = add_var(
-                    val, nobs, mean_x, ssqdm_x, compensation_add
+                (
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
+                ) = add_var(
+                    val,
+                    nobs,
+                    mean_x,
+                    ssqdm_x,
+                    compensation_add,
+                    num_consecutive_same_value,
+                    prev_value,
                 )
 
         if nobs >= min_periods and nobs > ddof:
-            if nobs == 1:
+            if nobs == 1 or num_consecutive_same_value >= nobs:
                 result = 0.0
             else:
                 result = ssqdm_x / (nobs - ddof)
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -2150,24 +2150,21 @@ def median(
         The default ``ddof`` of 1 used in :meth:`Series.std` is different
         than the default ``ddof`` of 0 in :func:`numpy.std`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).std()
-        0             NaN
-        1             NaN
-        2    5.773503e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.154701e+00
-        6    2.580957e-08
+        0         NaN
+        1         NaN
+        2    0.577350
+        3    1.000000
+        4    1.000000
+        5    1.154701
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),
@@ -2212,24 +2209,21 @@ def std(
         The default ``ddof`` of 1 used in :meth:`Series.var` is different
         than the default ``ddof`` of 0 in :func:`numpy.var`.
 
-        A minimum of one period is required for the rolling calculation.
-
-        The implementation is susceptible to floating point imprecision as
-        shown in the example below.\n
+        A minimum of one period is required for the rolling calculation.\n
         """
         ).replace("\n", "", 1),
         create_section_header("Examples"),
         dedent(
             """
         >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
         >>> s.rolling(3).var()
-        0             NaN
-        1             NaN
-        2    3.333333e-01
-        3    1.000000e+00
-        4    1.000000e+00
-        5    1.333333e+00
-        6    6.661338e-16
+        0         NaN
+        1         NaN
+        2    0.333333
+        3    1.000000
+        4    1.000000
+        5    1.333333
+        6    0.000000
         dtype: float64
         """
         ).replace("\n", "", 1),
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1781,3 +1781,103 @@ def test_step_not_integer_raises():
 def test_step_not_positive_raises():
     with pytest.raises(ValueError, match="step must be >= 0"):
         DataFrame(range(2)).rolling(1, step=-1)
+
+
+@pytest.mark.parametrize(
+    ["values", "window", "min_periods", "expected"],
+    [
+        [
+            np.array([20, 10, 10, np.inf, 1, 1, 2, 3]),
+            3,
+            1,
+            np.array(
+                [
+                    np.nan,
+                    50.0,
+                    33.33333333333333,
+                    0.0,
+                    40.5,
+                    0.0,
+                    0.3333333333333333,
+                    1.0,
+                ]
+            ),
+        ],
+        [
+            np.array([20, 10, 10, np.nan, 10, 1, 2, 3]),
+            3,
+            1,
+            np.array(
+                [
+                    np.nan,
+                    50.0,
+                    33.33333333333333,
+                    0.0,
+                    0.0,
+                    40.5,
+                    24.333333333333332,
+                    1.0,
+                ]
+            ),
+        ],
+        [
+            np.array([np.nan, 5, 6, 7, 5, 5, 5]),
+            3,
+            3,
+            np.array([np.nan, np.nan, np.nan, 1.0, 1.0, 1.3333333333333335, 0.0]),
+        ],
+        [
+            np.array([5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3]),
+            3,
+            3,
+            np.array(
+                [
+                    np.nan,
+                    np.nan,
+                    1.3333333333333335,
+                    0.0,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    0.33333333333333337,
+                    0.0,
+                ]
+            ),
+        ],
+        [
+            np.array([5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3]),
+            3,
+            3,
+            np.array(
+                [
+                    np.nan,
+                    np.nan,
+                    1.3333333333333335,
+                    0.0,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    5.333333333333333,
+                    0.0,
+                ]
+            ),
+        ],
+    ],
+)
+def test_rolling_var_same_value_count_logic(values, window, min_periods, expected):
+    # GH 42064
+
+    sr = Series(values)
+    result_var = sr.rolling(window, min_periods=min_periods).var()
+    # 1. result should be close to correct value
+    # non-zero values can still differ slightly as a result of the online algorithm
+    assert np.isclose(result_var, expected, equal_nan=True).all()
+    # 2. zeros should be exactly the same since the new algo takes effect here
+    assert (result_var[expected == 0] == 0).all()
+
+    # std should also pass as it's just a sqrt of var
+    result_std = sr.rolling(window, min_periods=min_periods).std()
+    assert np.isclose(result_std, np.sqrt(expected), equal_nan=True).all()
+    assert (result_std[expected == 0] == 0).all()

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ Other enhancements`
`46`	`46`	- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
`47`	`47`	- :class:`Series` and :class:`DataFrame` with ``IntegerDtype`` now supports bitwise operations (:issue:`34463`)
`48`	`48`	- Add ``milliseconds`` field support for :class:`~pandas.DateOffset` (:issue:`43371`)
	`49`	+- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now correctly return ``0`` for windows in which all values are the same (:issue:`42064`)
`49`	`50`	`-`
`50`	`51`
`51`	`52`	`.. ---------------------------------------------------------------------------`