Skip to content

Commit ae6fd52

Browse files
committed
WIP: Stable rolling skewness and kutosis algorithms
1 parent 989a51b commit ae6fd52

File tree

2 files changed

+153
-55
lines changed

2 files changed

+153
-55
lines changed

pandas/algos.pyx

Lines changed: 149 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,31 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
11531153

11541154
return result
11551155

1156+
#----------------------------------------------------------------------
1157+
# Auxiliary function for roll_var, roll_skew and roll_kurt
1158+
1159+
cdef inline void count_repeats(double val, double prev, double *rep,
1160+
Py_ssize_t *nrep, Py_ssize_t *nobs) nogil:
1161+
if prev == prev:
1162+
# prev is not NaN, removing an observation...
1163+
if nobs[0] == nrep[0]:
1164+
# ...and removing a repeat
1165+
nrep[0] -= 1
1166+
if nrep[0] == 0:
1167+
rep[0] = NaN
1168+
nobs[0] -= 1
1169+
1170+
if val == val:
1171+
# next is not NaN, adding an observation...
1172+
if val == prev:
1173+
# ...and adding a repeat
1174+
nrep[0] += 1
1175+
else:
1176+
# ...and resetting repeats
1177+
nrep[0] = 1
1178+
rep[0] = val
1179+
nobs[0] += 1
1180+
11561181
#----------------------------------------------------------------------
11571182
# Rolling variance
11581183

@@ -1241,88 +1266,160 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
12411266

12421267
return output
12431268

1244-
12451269
#-------------------------------------------------------------------------------
12461270
# Rolling skewness
12471271

12481272
def roll_skew(ndarray[double_t] input, int win, int minp):
1273+
"""
1274+
Numerically stable implementation using a Welford-like method.
1275+
"""
12491276
cdef double val, prev
1250-
cdef double x = 0, xx = 0, xxx = 0
1251-
cdef Py_ssize_t nobs = 0, i
1277+
cdef double mean_x = 0, sum2dm_x = 0, sum3dm_x = 0
1278+
cdef double delta, rep = NaN, val_prev, delta_nobs, sum_sum2dm_x
1279+
cdef Py_ssize_t nobs = 0, nrep = 0, i
12521280
cdef Py_ssize_t N = len(input)
12531281

12541282
cdef ndarray[double_t] output = np.empty(N, dtype=float)
12551283

1256-
# 3 components of the skewness equation
1257-
cdef double A, B, C, R
1258-
12591284
minp = _check_minp(win, minp, N)
12601285

1261-
for i from 0 <= i < minp - 1:
1262-
val = input[i]
1263-
1264-
# Not NaN
1265-
if val == val:
1266-
nobs += 1
1267-
x += val
1268-
xx += val * val
1269-
xxx += val * val * val
1270-
1271-
output[i] = NaN
1272-
1273-
for i from minp - 1 <= i < N:
1286+
for i from 0 <= i < N:
12741287
val = input[i]
1275-
1276-
if val == val:
1277-
nobs += 1
1278-
x += val
1279-
xx += val * val
1280-
xxx += val * val * val
1281-
1282-
if i > win - 1:
1283-
prev = input[i - win]
1288+
prev = NaN if i < win else input[i - win]
1289+
1290+
# First, count the number of onservation and consecutive repeats
1291+
count_repeats(val, prev, &rep, &nrep, &nobs)
1292+
1293+
# Then, compute the new mean and sums of squared and cubed differences
1294+
if nobs == nrep:
1295+
# All non-NaN values in window are identical...
1296+
sum2dm_x = sum3dm_x = 0
1297+
mean_x = rep if nobs > 0 else 0
1298+
elif val == val:
1299+
# Adding one observation...
12841300
if prev == prev:
1285-
x -= prev
1286-
xx -= prev * prev
1287-
xxx -= prev * prev * prev
1288-
1289-
nobs -= 1
1290-
1301+
# ...and removing another
1302+
delta_nobs = val - prev
1303+
delta = delta_nobs / nobs
1304+
prev -= mean_x
1305+
mean_x += delta
1306+
val -= mean_x
1307+
val_prev = val + prev
1308+
sum_sum2dm_x = sum2dm_x
1309+
sum2dm_x += delta_nobs * val_prev
1310+
sum_sum2dm_x += sum2dm_x
1311+
sum3dm_x += (0.25 * delta_nobs * delta * (nobs*nobs - 1) +
1312+
0.75 * nobs * val_prev * val_prev -
1313+
1.5 * sum_sum2dm_x) * delta
1314+
else:
1315+
# ...and not removing any
1316+
delta_nobs = val - mean_x
1317+
delta = delta_nobs / nobs
1318+
mean_x += delta
1319+
sum3dm_x += delta * (delta_nobs*delta*(nobs-1)*(nobs-2) -
1320+
3*sum2dm_x)
1321+
sum2dm_x += delta_nobs * (val - mean_x)
1322+
elif prev == prev:
1323+
# Adding no new observation, but removing one
1324+
delta_nobs = prev - mean_x
1325+
delta = delta_nobs / nobs
1326+
sum3dm_x -= delta * (delta * delta_nobs * (nobs+1) * (nobs+2) -
1327+
3 * sum2dm_x)
1328+
sum2dm_x -= delta_nobs * delta * (nobs + 1)
1329+
mean_x -= delta
1330+
1331+
# Finally, compute and write the rolling skewness to the output array
12911332
if nobs >= minp:
1292-
A = x / nobs
1293-
B = xx / nobs - A * A
1294-
C = xxx / nobs - A * A * A - 3 * A * B
1295-
1296-
R = sqrt(B)
1297-
1298-
if B == 0 or nobs < 3:
1299-
output[i] = NaN
1333+
if nobs < 3 or sum2dm_x == 0:
1334+
val = NaN
13001335
else:
1301-
output[i] = ((sqrt(nobs * (nobs - 1.)) * C) /
1302-
((nobs-2) * R * R * R))
1336+
val = sum3dm_x / sum2dm_x / sqrt(sum2dm_x)
1337+
val *= (nobs) * sqrt(nobs - 1) / (nobs - 2)
13031338
else:
1304-
output[i] = NaN
1339+
val = NaN
1340+
1341+
output[i] = val
13051342

13061343
return output
13071344

13081345
#-------------------------------------------------------------------------------
13091346
# Rolling kurtosis
13101347

13111348

1312-
def roll_kurt(ndarray[double_t] input,
1313-
int win, int minp):
1349+
def roll_kurt(ndarray[double_t] input, int win, int minp):
1350+
"""
1351+
Numerically stable implementation using a Welford-like method.
1352+
"""
13141353
cdef double val, prev
1315-
cdef double x = 0, xx = 0, xxx = 0, xxxx = 0
1316-
cdef Py_ssize_t nobs = 0, i
1354+
cdef double mean_x = 0, sum2dm_x = 0, sum3dm_x = 0, sum4dm_x = 0
1355+
cdef double rep = NaN, delta, val_prev, delta_nobs, sum_sum2dm_x
1356+
cdef Py_ssize_t nobs = 0, nrep = 0, i
13171357
cdef Py_ssize_t N = len(input)
13181358

13191359
cdef ndarray[double_t] output = np.empty(N, dtype=float)
13201360

1321-
# 5 components of the kurtosis equation
1322-
cdef double A, B, C, D, R, K
1323-
13241361
minp = _check_minp(win, minp, N)
13251362

1363+
for i from 0 <= i < N:
1364+
val = input[i]
1365+
prev = NaN if i < win else input[i - win]
1366+
1367+
# First, count the number of onservation and consecutive repeats
1368+
count_repeats(val, prev, &rep, &nrep, &nobs)
1369+
1370+
# Then, compute the new mean and sums of squared, cubed and
1371+
# tesseracted differences
1372+
if nobs == nrep:
1373+
# All non-NaN values in window are identical...
1374+
sum2dm_x = sum3dm_x = sum4dm_x = 0
1375+
mean_x = rep if nobs > 0 else 0
1376+
elif val == val:
1377+
# Adding one observation...
1378+
if prev == prev:
1379+
# ...and removing another
1380+
delta_nobs = val - prev
1381+
delta = delta_nobs / nobs
1382+
prev -= mean_x
1383+
mean_x += delta
1384+
val -= mean_x
1385+
val_prev = val + prev
1386+
sum_sum2dm_x = sum2dm_x
1387+
dif_sum2dm_x = -sum2dm_x
1388+
sum2dm_x += delta_nobs * val_prev
1389+
sum_sum2dm_x += sum2dm_x
1390+
dif_sum2dm_x += sum2dm_x
1391+
sum_sum3dm_x = sum3dm_x
1392+
sum3dm_x += (0.25 * delta_nobs * delta * (nobs*nobs - 1) +
1393+
0.75 * nobs * val_prev * val_prev -
1394+
1.5 * sum_sum2dm_x) * delta
1395+
sum_sum3dm_x += sum3dm_x
1396+
sum4dm_x = (2 * m * (val*val*val + prev*prev*prev) -
1397+
delta * delta_nobs * (m-2) * (m-1) * val_prev -
1398+
2 * sum_sum3dm_x - delta * dif_sum2dm_x) * delta
1399+
else:
1400+
# ...and not removing any
1401+
delta_nobs = val - mean_x
1402+
delta = delta_nobs / nobs
1403+
delta2 = delta * delta
1404+
1405+
sum4dm_x += (delta * delta * delta_nobs * (nobs - 1) * (
1406+
1407+
mean_x += delta
1408+
sum3dm_x += delta * (delta_nobs*delta*(nobs-1)*(nobs-2) -
1409+
3*sum2dm_x)
1410+
sum2dm_x += delta_nobs * (val - mean_x)
1411+
elif prev == prev:
1412+
# Adding no new observation, but removing one
1413+
delta_nobs = prev - mean_x
1414+
delta = delta_nobs / nobs
1415+
sum3dm_x -= delta * (delta * delta_nobs * (nobs+1) * (nobs+2) -
1416+
3 * sum2dm_x)
1417+
sum2dm_x -= delta_nobs * delta * (nobs + 1)
1418+
mean_x -= delta
1419+
1420+
1421+
1422+
13261423
for i from 0 <= i < minp - 1:
13271424
val = input[i]
13281425

pandas/stats/tests/test_moments.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,8 @@ def test_rolling_skew(self):
301301
except ImportError:
302302
raise nose.SkipTest('no scipy')
303303
self._check_moment_func(mom.rolling_skew,
304-
lambda x: skew(x, bias=False))
304+
lambda x: skew(x, bias=False),
305+
test_stable=True)
305306

306307
def test_rolling_kurt(self):
307308
try:
@@ -521,7 +522,7 @@ def test_ewma(self):
521522
self.assertTrue(np.abs(result - 1) < 1e-2)
522523

523524
s = Series([1.0, 2.0, 4.0, 8.0])
524-
525+
525526
expected = Series([1.0, 1.6, 2.736842, 4.923077])
526527
for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
527528
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
@@ -1027,7 +1028,7 @@ def test_expanding_corr_pairwise_diff_length(self):
10271028
assert_frame_equal(result2, expected)
10281029
assert_frame_equal(result3, expected)
10291030
assert_frame_equal(result4, expected)
1030-
1031+
10311032
def test_pairwise_stats_column_names_order(self):
10321033
# GH 7738
10331034
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),

0 commit comments

Comments
 (0)