ENH: Improve numerical stability for window functions skew and kurt #37557

Merged · 20 commits · Nov 9, 2020
Changes from 17 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -232,6 +232,7 @@ Other enhancements
- :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)

.. _whatsnew_120.api_breaking.python:
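For context on the technique named in the new whatsnew entry above: Kahan (compensated) summation keeps a second variable that carries the rounding error of each addition and feeds it back into the next one. A minimal Python sketch of the idea, for illustration only (not the pandas implementation):

def kahan_sum(values):
    # A second accumulator carries the low-order bits that the plain
    # running sum would silently drop at each step.
    total = 0.0
    compensation = 0.0
    for val in values:
        y = val - compensation          # re-inject the previously lost bits
        t = total + y                   # low-order bits of y may be rounded away here
        compensation = (t - total) - y  # what was actually lost in this step
        total = t
    return total

# A large offset plus many small increments; the true sum is 1e15 + 100.
values = [1e15] + [0.1] * 1000
print(sum(values) - 1e15)        # plain accumulation drifts well away from 100
print(kahan_sum(values) - 1e15)  # stays very close to 100

The changes below apply this compensated update to each of the running power sums (x, xx, xxx, and xxxx for kurtosis).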

178 changes: 138 additions & 40 deletions pandas/_libs/window/aggregations.pyx
@@ -458,51 +458,92 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,

cdef inline void add_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" add a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t


cdef inline void remove_skew(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx) nogil:
float64_t *xxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx) nogil:
""" remove a value from the skew calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
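add_skew and remove_skew keep the running power sums up to date incrementally as the window slides: removing an observation is just a compensated addition of the negated term, with separate compensation terms for the add and remove paths. A rough Python sketch of that delta-update pattern for a single running sum (the helper name and window handling are illustrative, not the pandas API):

def update(total, compensation, val):
    # One compensated (Kahan) addition; pass -val (or -val**2, -val**3)
    # to remove a term from the running sum.
    y = val - compensation
    t = total + y
    compensation = (t - total) - y
    return t, compensation

# Slide a width-3 window over the data, touching only the values that
# leave and enter the window instead of re-summing it from scratch.
data = [5000.1, 5000.2, 5000.3, 5000.4, 5000.5]
total, comp_add, comp_remove = 0.0, 0.0, 0.0
for j in range(3):                                      # first window: add everything
    total, comp_add = update(total, comp_add, data[j])
print(total / 3)
for i in range(3, len(data)):
    total, comp_remove = update(total, comp_remove, -data[i - 3])  # value leaving
    total, comp_add = update(total, comp_add, data[i])             # value entering
    print(total / 3)                                    # running window mean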


def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, min_val, mean_val, sum_val = 0
float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
float64_t compensation_xx_add = 0, compensation_xx_remove = 0
float64_t compensation_x_add = 0, compensation_x_remove = 0
float64_t x = 0, xx = 0, xxx = 0
int64_t nobs = 0, i, j, N = len(values)
int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0, mean_int = 0
int64_t s, e
ndarray[float64_t] output
ndarray[float64_t] output, mean_array
bint is_monotonic_increasing_bounds

minp = max(minp, 3)
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e5:
mean_int = <int64_t>mean_val
for i in range(0, N):
values[i] = values[i] - mean_int

for i in range(0, N):
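The pre-pass above shifts all values by the integer part of their mean before any power sums are accumulated (skipped, per the guard, when the smallest values sit far below the mean, since the shift itself would then cost precision for those values). The reason is that central moments computed from raw power sums cancel catastrophically when the data sit far from zero. A small stand-alone Python illustration of that cancellation, using variance rather than skewness for brevity:

import numpy as np

rng = np.random.default_rng(0)
x = rng.random(1000)                      # values in [0, 1), variance ~ 1/12

def var_from_power_sums(v):
    # Variance via sum(v**2)/n - mean**2: fine for data near zero,
    # disastrous when |mean| dwarfs the spread of the values.
    n = len(v)
    return np.sum(v * v) / n - (np.sum(v) / n) ** 2

shifted = x + 1e8
print(var_from_power_sums(x))                                # ~0.083, accurate
print(var_from_power_sums(shifted))                          # cancellation destroys the result
print(var_from_power_sums(shifted - int(np.mean(shifted))))  # centering restores accuracy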

@@ -515,22 +556,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

for j in range(s, e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
&compensation_xx_remove, &compensation_xxx_remove)

# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_skew(val, &nobs, &x, &xx, &xxx)

# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_skew(val, &nobs, &x, &xx, &xxx)
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
&compensation_xx_add, &compensation_xxx_add)

output[i] = calc_skew(minp, nobs, x, xx, xxx)
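calc_skew itself is not modified by this PR and its body is not shown above. For reference, a sketch of how the skewness can be recovered from nobs and the three running sums, assuming the usual bias-adjusted Fisher-Pearson estimator that pandas reports (the guard conditions here are simplified):

import numpy as np

def skew_from_sums(nobs, x, xx, xxx, minp=3):
    # x, xx, xxx are the (compensated) sums of v, v**2, v**3 over the window.
    if nobs < max(minp, 3):
        return np.nan
    a = x / nobs                            # mean
    b = xx / nobs - a * a                   # second central moment
    c = xxx / nobs - a ** 3 - 3 * a * b     # third central moment
    if b <= 0:                              # (near-)constant window: skew undefined
        return np.nan
    return np.sqrt(nobs * (nobs - 1.0)) * c / ((nobs - 2.0) * b ** 1.5)

With the data pre-shifted toward zero and the sums kept accurate by the compensation terms, the subtractions forming b and c lose far fewer significant digits.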

@@ -585,42 +628,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,

cdef inline void add_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" add a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1

# seriously don't ask me why this is faster
x[0] = x[0] + val
xx[0] = xx[0] + val * val
xxx[0] = xxx[0] + val * val * val
xxxx[0] = xxxx[0] + val * val * val * val
y = val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


cdef inline void remove_kurt(float64_t val, int64_t *nobs,
float64_t *x, float64_t *xx,
float64_t *xxx, float64_t *xxxx) nogil:
float64_t *xxx, float64_t *xxxx,
float64_t *compensation_x,
float64_t *compensation_xx,
float64_t *compensation_xxx,
float64_t *compensation_xxxx) nogil:
""" remove a value from the kurotic calc """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1

# seriously don't ask me why this is faster
x[0] = x[0] - val
xx[0] = xx[0] - val * val
xxx[0] = xxx[0] - val * val * val
xxxx[0] = xxxx[0] - val * val * val * val
y = - val - compensation_x[0]
t = x[0] + y
compensation_x[0] = t - x[0] - y
x[0] = t
y = - val * val - compensation_xx[0]
t = xx[0] + y
compensation_xx[0] = t - xx[0] - y
xx[0] = t
y = - val * val * val - compensation_xxx[0]
t = xxx[0] + y
compensation_xxx[0] = t - xxx[0] - y
xxx[0] = t
y = - val * val * val * val - compensation_xxxx[0]
t = xxxx[0] + y
compensation_xxxx[0] = t - xxxx[0] - y
xxxx[0] = t


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, prev
float64_t val, prev, mean_val, min_val, sum_val = 0
float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
float64_t compensation_xx_remove = 0, compensation_xx_add = 0
float64_t compensation_x_remove = 0, compensation_x_add = 0
float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
int64_t nobs = 0, i, j, s, e, N = len(values)
int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0, mean_int = 0
ndarray[float64_t] output
bint is_monotonic_increasing_bounds

@@ -629,8 +710,20 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
start, end
)
output = np.empty(N, dtype=float)
min_val = np.nanmin(values)

with nogil:
for i in range(0, N):
val = values[i]
if notnan(val):
nobs_mean += 1
sum_val += val
mean_val = sum_val / nobs_mean
# Other cases would lead to imprecision for smallest values
if min_val - mean_val > -1e4:
mean_int = <int64_t>mean_val
for i in range(0, N):
values[i] = values[i] - mean_int

for i in range(0, N):

@@ -642,20 +735,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
if i == 0 or not is_monotonic_increasing_bounds:

for j in range(s, e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

else:

# After the first window, observations can both be added
# and removed
# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_remove, &compensation_xx_remove,
&compensation_xxx_remove, &compensation_xxxx_remove)

# calculate adds
for j in range(end[i - 1], e):
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)

# calculate deletes
for j in range(start[i - 1], s):
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
&compensation_x_add, &compensation_xx_add,
&compensation_xxx_add, &compensation_xxxx_add)

output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
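Analogously, calc_kurt (also unchanged by this PR) derives the bias-adjusted excess kurtosis from nobs and the four running sums; a simplified sketch under the same assumptions as the skew version above:

import numpy as np

def kurt_from_sums(nobs, x, xx, xxx, xxxx, minp=4):
    # x, xx, xxx, xxxx are the (compensated) sums of v, v**2, v**3, v**4.
    if nobs < max(minp, 4):
        return np.nan
    a = x / nobs                                           # mean
    b = xx / nobs - a * a                                  # second central moment
    c = xxx / nobs - a ** 3 - 3 * a * b                    # third central moment
    d = xxxx / nobs - a ** 4 - 6 * b * a * a - 4 * c * a   # fourth central moment
    if b <= 0:
        return np.nan
    k = (nobs * nobs - 1.0) * d / (b * b) - 3.0 * (nobs - 1.0) ** 2
    return k / ((nobs - 2.0) * (nobs - 3.0))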

10 changes: 10 additions & 0 deletions pandas/tests/window/test_expanding.py
@@ -255,3 +255,13 @@ def test_expanding_sem(constructor):
result = pd.Series(result[0].values)
expected = pd.Series([np.nan] + [0.707107] * 2)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_expanding_skew_kurt_numerical_stability(method):
# GH: 6929
s = pd.Series(np.random.rand(10))
expected = getattr(s.expanding(3), method)()
s = s + 5000
result = getattr(s.expanding(3), method)()
tm.assert_series_equal(result, expected)
25 changes: 25 additions & 0 deletions pandas/tests/window/test_rolling.py
@@ -1102,3 +1102,28 @@ def test_groupby_rolling_nan_included():
),
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
# GH: 6929
s = Series(np.random.rand(10))
expected = getattr(s.rolling(3), method)()
s = s + 50000
result = getattr(s.rolling(3), method)()
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
("method", "values"),
[
("skew", [2.0, 0.854563, 0.0, 1.999984]),
("kurt", [4.0, -1.289256, -1.2, 3.999946]),
],
)
def test_rolling_skew_kurt_large_value_range(method, values):
# GH: 37557
s = Series([3000000, 1, 1, 2, 3, 4, 999])
result = getattr(s.rolling(4), method)()
expected = Series([np.nan] * 3 + values)
tm.assert_series_equal(result, expected)
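As an end-to-end illustration of the property these tests pin down: shifting a series by a large constant should leave its rolling skewness (and kurtosis) essentially unchanged once the compensated sums are in place, whereas before this change the shifted result could drift visibly (see GH 6929). A short usage sketch:

import numpy as np
import pandas as pd

s = pd.Series(np.random.rand(10))
baseline = s.rolling(3).skew()
shifted = (s + 50000).rolling(3).skew()
# With Kahan-compensated power sums the two results agree to testing tolerance.
pd.testing.assert_series_equal(baseline, shifted)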