diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 08500019143ed..9f2bbaa8d5120 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -603,6 +603,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) +- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index e58f4c2aa8e42..a1120cd559d57 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, - float64_t ssqdm_x) nogil: + float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil: cdef: float64_t result # Variance is unchanged if no observation is added or removed if (nobs >= minp) and (nobs > ddof): - # pathological case - if nobs == 1: + # pathological case & repeatedly same values case + if nobs == 1 or num_consecutive_same_value >= nobs: result = 0 else: result = ssqdm_x / (nobs - ddof) @@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x, float64_t *compensation) nogil: + float64_t *ssqdm_x, float64_t *compensation, + int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the var calc """ cdef: float64_t delta, prev_mean, y, t @@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, return nobs[0] = nobs[0] + 1 + + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + # Welford's method for the online variance-calculation # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance @@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, """ cdef: float64_t mean_x, ssqdm_x, nobs, compensation_add, - float64_t compensation_remove, - float64_t val, prev, delta, mean_x_old - int64_t s, e + float64_t compensation_remove, prev_value + int64_t s, e, num_consecutive_same_value Py_ssize_t i, j, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # never removed if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + prev_value = values[s] + num_consecutive_same_value = 0 + mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value) if not is_monotonic_increasing_bounds: nobs = 0.0 diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index c2e81b4990ba9..b67452e4071d9 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -81,7 +81,7 @@ def sliding_sum( val, nobs, sum_x, compensation_add ) - if nobs == 0 == nobs: + if nobs == 0 == min_periods: result = 0.0 elif nobs >= min_periods: result = sum_x diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 2e5660673701b..b1c72832d249b 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -16,9 +16,22 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_var( - val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float -) -> tuple[int, float, float, float]: + val: float, + nobs: int, + mean_x: float, + ssqdm_x: float, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, float, float, int, float]: if not np.isnan(val): + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + nobs += 1 prev_mean = mean_x - compensation y = val - compensation @@ -30,7 +43,7 @@ def add_var( else: mean_x = 0 ssqdm_x += (val - prev_mean) * (val - mean_x) - return nobs, mean_x, ssqdm_x, compensation + return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -79,10 +92,27 @@ def sliding_var( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, mean_x, ssqdm_x, compensation_add = add_var( - val, nobs, mean_x, ssqdm_x, compensation_add + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -93,12 +123,25 @@ def sliding_var( for j in range(end[i - 1], e): val = values[j] - nobs, mean_x, ssqdm_x, compensation_add = add_var( - val, nobs, mean_x, ssqdm_x, compensation_add + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs >= min_periods and nobs > ddof: - if nobs == 1: + if nobs == 1 or num_consecutive_same_value >= nobs: result = 0.0 else: result = ssqdm_x / (nobs - ddof) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ac3d8b3dabb2b..5006db44849e0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2150,10 +2150,7 @@ def median( The default ``ddof`` of 1 used in :meth:`Series.std` is different than the default ``ddof`` of 0 in :func:`numpy.std`. - A minimum of one period is required for the rolling calculation. - - The implementation is susceptible to floating point imprecision as - shown in the example below.\n + A minimum of one period is required for the rolling calculation.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -2161,13 +2158,13 @@ def median( """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 5.773503e-01 - 3 1.000000e+00 - 4 1.000000e+00 - 5 1.154701e+00 - 6 2.580957e-08 + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 dtype: float64 """ ).replace("\n", "", 1), @@ -2212,10 +2209,7 @@ def std( The default ``ddof`` of 1 used in :meth:`Series.var` is different than the default ``ddof`` of 0 in :func:`numpy.var`. - A minimum of one period is required for the rolling calculation. - - The implementation is susceptible to floating point imprecision as - shown in the example below.\n + A minimum of one period is required for the rolling calculation.\n """ ).replace("\n", "", 1), create_section_header("Examples"), @@ -2223,13 +2217,13 @@ def std( """ >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 3.333333e-01 - 3 1.000000e+00 - 4 1.000000e+00 - 5 1.333333e+00 - 6 6.661338e-16 + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 dtype: float64 """ ).replace("\n", "", 1), diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 0e7fe24420171..4cf6306dc39e5 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -79,7 +79,20 @@ def f(x, *args): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")] + "data", + [ + DataFrame(np.eye(5)), + DataFrame( + [ + [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3], + [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3], + [np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5], + ] + ).T, + Series(range(5), name="foo"), + Series([20, 10, 10, np.inf, 1, 1, 2, 3]), + Series([20, 10, 10, np.nan, 10, 1, 2, 3]), + ], ) def test_numba_vs_cython_rolling_methods( self, @@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods( engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - roll = data.rolling(2, step=step) + roll = data.rolling(3, step=step) result = getattr(roll, method)( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 89c90836ae957..5f7709c62a1e2 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1203,6 +1203,9 @@ def test_rolling_var_numerical_issues(func, third_value, values): result = getattr(ds.rolling(2), func)() expected = Series([np.nan] + values) tm.assert_series_equal(result, expected) + # GH 42064 + # new `roll_var` will output 0.0 correctly + tm.assert_series_equal(result == 0, expected == 0) def test_timeoffset_as_window_parameter_for_corr(): @@ -1509,6 +1512,9 @@ def test_rolling_var_floating_artifact_precision(): result = s.rolling(3).var() expected = Series([np.nan, np.nan, 4 / 3, 0]) tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15) + # GH 42064 + # new `roll_var` will output 0.0 correctly + tm.assert_series_equal(result == 0, expected == 0) def test_rolling_std_small_values(): @@ -1769,3 +1775,77 @@ def test_step_not_integer_raises(): def test_step_not_positive_raises(): with pytest.raises(ValueError, match="step must be >= 0"): DataFrame(range(2)).rolling(1, step=-1) + + +@pytest.mark.parametrize( + ["values", "window", "min_periods", "expected"], + [ + [ + [20, 10, 10, np.inf, 1, 1, 2, 3], + 3, + 1, + [np.nan, 50, 100 / 3, 0, 40.5, 0, 1 / 3, 1], + ], + [ + [20, 10, 10, np.nan, 10, 1, 2, 3], + 3, + 1, + [np.nan, 50, 100 / 3, 0, 0, 40.5, 73 / 3, 1], + ], + [ + [np.nan, 5, 6, 7, 5, 5, 5], + 3, + 3, + [np.nan] * 3 + [1, 1, 4 / 3, 0], + ], + [ + [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3], + 3, + 3, + [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [1 / 3, 0], + ], + [ + [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3], + 3, + 3, + [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [16 / 3, 0], + ], + [ + [5, 7] * 4, + 3, + 3, + [np.nan] * 2 + [4 / 3] * 6, + ], + [ + [5, 7, 5, np.nan, 7, 5, 7], + 3, + 2, + [np.nan, 2, 4 / 3] + [2] * 3 + [4 / 3], + ], + ], +) +def test_rolling_var_same_value_count_logic(values, window, min_periods, expected): + # GH 42064. + + expected = Series(expected) + sr = Series(values) + + # With new algo implemented, result will be set to .0 in rolling var + # if sufficient amount of consecutively same values are found. + result_var = sr.rolling(window, min_periods=min_periods).var() + + # use `assert_series_equal` twice to check for equality, + # because `check_exact=True` will fail in 32-bit tests due to + # precision loss. + + # 1. result should be close to correct value + # non-zero values can still differ slightly from "truth" + # as the result of online algorithm + tm.assert_series_equal(result_var, expected) + # 2. zeros should be exactly the same since the new algo takes effect here + tm.assert_series_equal(expected == 0, result_var == 0) + + # std should also pass as it's just a sqrt of var + result_std = sr.rolling(window, min_periods=min_periods).std() + tm.assert_series_equal(result_std, np.sqrt(expected)) + tm.assert_series_equal(expected == 0, result_std == 0)