Skip to content

Commit 8dd2d95

Browse files
authored
ENH: Improve numerical stability for window functions skew and kurt (#37557)
1 parent 3760f47 commit 8dd2d95

File tree

4 files changed

+175
-40
lines changed

4 files changed

+175
-40
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ Other enhancements
232232
- :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
233233
- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
234234
- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
235+
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
235236

236237
.. _whatsnew_120.api_breaking.python:
237238

pandas/_libs/window/aggregations.pyx

+139-40
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import cython
44

5+
from libc.math cimport round
56
from libcpp.deque cimport deque
67

78
import numpy as np
@@ -458,51 +459,92 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
458459

459460
cdef inline void add_skew(float64_t val, int64_t *nobs,
460461
float64_t *x, float64_t *xx,
461-
float64_t *xxx) nogil:
462+
float64_t *xxx,
463+
float64_t *compensation_x,
464+
float64_t *compensation_xx,
465+
float64_t *compensation_xxx) nogil:
462466
""" add a value from the skew calc """
467+
cdef:
468+
float64_t y, t
463469

464470
# Not NaN
465471
if notnan(val):
466472
nobs[0] = nobs[0] + 1
467473

468-
# seriously don't ask me why this is faster
469-
x[0] = x[0] + val
470-
xx[0] = xx[0] + val * val
471-
xxx[0] = xxx[0] + val * val * val
474+
y = val - compensation_x[0]
475+
t = x[0] + y
476+
compensation_x[0] = t - x[0] - y
477+
x[0] = t
478+
y = val * val - compensation_xx[0]
479+
t = xx[0] + y
480+
compensation_xx[0] = t - xx[0] - y
481+
xx[0] = t
482+
y = val * val * val - compensation_xxx[0]
483+
t = xxx[0] + y
484+
compensation_xxx[0] = t - xxx[0] - y
485+
xxx[0] = t
472486

473487

474488
cdef inline void remove_skew(float64_t val, int64_t *nobs,
475489
float64_t *x, float64_t *xx,
476-
float64_t *xxx) nogil:
490+
float64_t *xxx,
491+
float64_t *compensation_x,
492+
float64_t *compensation_xx,
493+
float64_t *compensation_xxx) nogil:
477494
""" remove a value from the skew calc """
495+
cdef:
496+
float64_t y, t
478497

479498
# Not NaN
480499
if notnan(val):
481500
nobs[0] = nobs[0] - 1
482501

483-
# seriously don't ask me why this is faster
484-
x[0] = x[0] - val
485-
xx[0] = xx[0] - val * val
486-
xxx[0] = xxx[0] - val * val * val
502+
y = - val - compensation_x[0]
503+
t = x[0] + y
504+
compensation_x[0] = t - x[0] - y
505+
x[0] = t
506+
y = - val * val - compensation_xx[0]
507+
t = xx[0] + y
508+
compensation_xx[0] = t - xx[0] - y
509+
xx[0] = t
510+
y = - val * val * val - compensation_xxx[0]
511+
t = xxx[0] + y
512+
compensation_xxx[0] = t - xxx[0] - y
513+
xxx[0] = t
487514

488515

489516
def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
490517
ndarray[int64_t] end, int64_t minp):
491518
cdef:
492-
float64_t val, prev
519+
float64_t val, prev, min_val, mean_val, sum_val = 0
520+
float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
521+
float64_t compensation_xx_add = 0, compensation_xx_remove = 0
522+
float64_t compensation_x_add = 0, compensation_x_remove = 0
493523
float64_t x = 0, xx = 0, xxx = 0
494-
int64_t nobs = 0, i, j, N = len(values)
524+
int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0
495525
int64_t s, e
496-
ndarray[float64_t] output
526+
ndarray[float64_t] output, mean_array
497527
bint is_monotonic_increasing_bounds
498528

499529
minp = max(minp, 3)
500530
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
501531
start, end
502532
)
503533
output = np.empty(N, dtype=float)
534+
min_val = np.nanmin(values)
504535

505536
with nogil:
537+
for i in range(0, N):
538+
val = values[i]
539+
if notnan(val):
540+
nobs_mean += 1
541+
sum_val += val
542+
mean_val = sum_val / nobs_mean
543+
# Other cases would lead to imprecision for smallest values
544+
if min_val - mean_val > -1e5:
545+
mean_val = round(mean_val)
546+
for i in range(0, N):
547+
values[i] = values[i] - mean_val
506548

507549
for i in range(0, N):
508550

@@ -515,22 +557,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
515557

516558
for j in range(s, e):
517559
val = values[j]
518-
add_skew(val, &nobs, &x, &xx, &xxx)
560+
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
561+
&compensation_xx_add, &compensation_xxx_add)
519562

520563
else:
521564

522565
# After the first window, observations can both be added
523566
# and removed
567+
# calculate deletes
568+
for j in range(start[i - 1], s):
569+
val = values[j]
570+
remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
571+
&compensation_xx_remove, &compensation_xxx_remove)
524572

525573
# calculate adds
526574
for j in range(end[i - 1], e):
527575
val = values[j]
528-
add_skew(val, &nobs, &x, &xx, &xxx)
529-
530-
# calculate deletes
531-
for j in range(start[i - 1], s):
532-
val = values[j]
533-
remove_skew(val, &nobs, &x, &xx, &xxx)
576+
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
577+
&compensation_xx_add, &compensation_xxx_add)
534578

535579
output[i] = calc_skew(minp, nobs, x, xx, xxx)
536580

@@ -585,42 +629,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
585629

586630
cdef inline void add_kurt(float64_t val, int64_t *nobs,
587631
float64_t *x, float64_t *xx,
588-
float64_t *xxx, float64_t *xxxx) nogil:
632+
float64_t *xxx, float64_t *xxxx,
633+
float64_t *compensation_x,
634+
float64_t *compensation_xx,
635+
float64_t *compensation_xxx,
636+
float64_t *compensation_xxxx) nogil:
589637
""" add a value from the kurotic calc """
638+
cdef:
639+
float64_t y, t
590640

591641
# Not NaN
592642
if notnan(val):
593643
nobs[0] = nobs[0] + 1
594644

595-
# seriously don't ask me why this is faster
596-
x[0] = x[0] + val
597-
xx[0] = xx[0] + val * val
598-
xxx[0] = xxx[0] + val * val * val
599-
xxxx[0] = xxxx[0] + val * val * val * val
645+
y = val - compensation_x[0]
646+
t = x[0] + y
647+
compensation_x[0] = t - x[0] - y
648+
x[0] = t
649+
y = val * val - compensation_xx[0]
650+
t = xx[0] + y
651+
compensation_xx[0] = t - xx[0] - y
652+
xx[0] = t
653+
y = val * val * val - compensation_xxx[0]
654+
t = xxx[0] + y
655+
compensation_xxx[0] = t - xxx[0] - y
656+
xxx[0] = t
657+
y = val * val * val * val - compensation_xxxx[0]
658+
t = xxxx[0] + y
659+
compensation_xxxx[0] = t - xxxx[0] - y
660+
xxxx[0] = t
600661

601662

602663
cdef inline void remove_kurt(float64_t val, int64_t *nobs,
603664
float64_t *x, float64_t *xx,
604-
float64_t *xxx, float64_t *xxxx) nogil:
665+
float64_t *xxx, float64_t *xxxx,
666+
float64_t *compensation_x,
667+
float64_t *compensation_xx,
668+
float64_t *compensation_xxx,
669+
float64_t *compensation_xxxx) nogil:
605670
""" remove a value from the kurotic calc """
671+
cdef:
672+
float64_t y, t
606673

607674
# Not NaN
608675
if notnan(val):
609676
nobs[0] = nobs[0] - 1
610677

611-
# seriously don't ask me why this is faster
612-
x[0] = x[0] - val
613-
xx[0] = xx[0] - val * val
614-
xxx[0] = xxx[0] - val * val * val
615-
xxxx[0] = xxxx[0] - val * val * val * val
678+
y = - val - compensation_x[0]
679+
t = x[0] + y
680+
compensation_x[0] = t - x[0] - y
681+
x[0] = t
682+
y = - val * val - compensation_xx[0]
683+
t = xx[0] + y
684+
compensation_xx[0] = t - xx[0] - y
685+
xx[0] = t
686+
y = - val * val * val - compensation_xxx[0]
687+
t = xxx[0] + y
688+
compensation_xxx[0] = t - xxx[0] - y
689+
xxx[0] = t
690+
y = - val * val * val * val - compensation_xxxx[0]
691+
t = xxxx[0] + y
692+
compensation_xxxx[0] = t - xxxx[0] - y
693+
xxxx[0] = t
616694

617695

618696
def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
619697
ndarray[int64_t] end, int64_t minp):
620698
cdef:
621-
float64_t val, prev
699+
float64_t val, prev, mean_val, min_val, sum_val = 0
700+
float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
701+
float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
702+
float64_t compensation_xx_remove = 0, compensation_xx_add = 0
703+
float64_t compensation_x_remove = 0, compensation_x_add = 0
622704
float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
623-
int64_t nobs = 0, i, j, s, e, N = len(values)
705+
int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0
624706
ndarray[float64_t] output
625707
bint is_monotonic_increasing_bounds
626708

@@ -629,8 +711,20 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
629711
start, end
630712
)
631713
output = np.empty(N, dtype=float)
714+
min_val = np.nanmin(values)
632715

633716
with nogil:
717+
for i in range(0, N):
718+
val = values[i]
719+
if notnan(val):
720+
nobs_mean += 1
721+
sum_val += val
722+
mean_val = sum_val / nobs_mean
723+
# Other cases would lead to imprecision for smallest values
724+
if min_val - mean_val > -1e4:
725+
mean_val = round(mean_val)
726+
for i in range(0, N):
727+
values[i] = values[i] - mean_val
634728

635729
for i in range(0, N):
636730

@@ -642,20 +736,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
642736
if i == 0 or not is_monotonic_increasing_bounds:
643737

644738
for j in range(s, e):
645-
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
739+
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
740+
&compensation_x_add, &compensation_xx_add,
741+
&compensation_xxx_add, &compensation_xxxx_add)
646742

647743
else:
648744

649745
# After the first window, observations can both be added
650746
# and removed
747+
# calculate deletes
748+
for j in range(start[i - 1], s):
749+
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
750+
&compensation_x_remove, &compensation_xx_remove,
751+
&compensation_xxx_remove, &compensation_xxxx_remove)
651752

652753
# calculate adds
653754
for j in range(end[i - 1], e):
654-
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
655-
656-
# calculate deletes
657-
for j in range(start[i - 1], s):
658-
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
755+
add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
756+
&compensation_x_add, &compensation_xx_add,
757+
&compensation_xxx_add, &compensation_xxxx_add)
659758

660759
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
661760

pandas/tests/window/test_expanding.py

+10
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,13 @@ def test_expanding_sem(constructor):
255255
result = Series(result[0].values)
256256
expected = Series([np.nan] + [0.707107] * 2)
257257
tm.assert_series_equal(result, expected)
258+
259+
260+
@pytest.mark.parametrize("method", ["skew", "kurt"])
261+
def test_expanding_skew_kurt_numerical_stability(method):
262+
# GH: 6929
263+
s = Series(np.random.rand(10))
264+
expected = getattr(s.expanding(3), method)()
265+
s = s + 5000
266+
result = getattr(s.expanding(3), method)()
267+
tm.assert_series_equal(result, expected)

pandas/tests/window/test_rolling.py

+25
Original file line numberDiff line numberDiff line change
@@ -1102,3 +1102,28 @@ def test_groupby_rolling_nan_included():
11021102
),
11031103
)
11041104
tm.assert_frame_equal(result, expected)
1105+
1106+
1107+
@pytest.mark.parametrize("method", ["skew", "kurt"])
1108+
def test_rolling_skew_kurt_numerical_stability(method):
1109+
# GH: 6929
1110+
s = Series(np.random.rand(10))
1111+
expected = getattr(s.rolling(3), method)()
1112+
s = s + 50000
1113+
result = getattr(s.rolling(3), method)()
1114+
tm.assert_series_equal(result, expected)
1115+
1116+
1117+
@pytest.mark.parametrize(
1118+
("method", "values"),
1119+
[
1120+
("skew", [2.0, 0.854563, 0.0, 1.999984]),
1121+
("kurt", [4.0, -1.289256, -1.2, 3.999946]),
1122+
],
1123+
)
1124+
def test_rolling_skew_kurt_large_value_range(method, values):
1125+
# GH: 37557
1126+
s = Series([3000000, 1, 1, 2, 3, 4, 999])
1127+
result = getattr(s.rolling(4), method)()
1128+
expected = Series([np.nan] * 3 + values)
1129+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)