Skip to content

Commit cbbb771

Browse files
committed
add same value count for skew & kurt
1 parent b3b5e2a commit cbbb771

File tree

3 files changed

+110
-44
lines changed

3 files changed

+110
-44
lines changed

pandas/_libs/window/aggregations.pyx

+86-35
Original file line numberDiff line numberDiff line change
@@ -409,8 +409,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
409409

410410

411411
cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
412-
float64_t x, float64_t xx,
413-
float64_t xxx) nogil:
412+
float64_t x, float64_t xx, float64_t xxx,
413+
int64_t num_consecutive_same_value
414+
) nogil:
414415
cdef:
415416
float64_t result, dnobs
416417
float64_t A, B, C, R
@@ -421,6 +422,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
421422
B = xx / dnobs - A * A
422423
C = xxx / dnobs - A * A * A - 3 * A * B
423424

425+
if nobs < 3:
426+
result = NaN
427+
# GH 42064 46431
428+
# uniform case, force result to be 0
429+
elif num_consecutive_same_value >= nobs:
430+
result = 0.0
424431
# #18044: with uniform distribution, floating issue will
425432
# cause B != 0. and cause the result is a very
426433
# large number.
@@ -430,7 +437,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
430437
# if the variance is less than 1e-14, it could be
431438
# treat as zero, here we follow the original
432439
# skew/kurt behaviour to check B <= 1e-14
433-
if B <= 1e-14 or nobs < 3:
440+
elif B <= 1e-14:
434441
result = NaN
435442
else:
436443
R = sqrt(B)
@@ -447,7 +454,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
447454
float64_t *xxx,
448455
float64_t *compensation_x,
449456
float64_t *compensation_xx,
450-
float64_t *compensation_xxx) nogil:
457+
float64_t *compensation_xxx,
458+
int64_t *num_consecutive_same_value,
459+
float64_t *prev_value,
460+
) nogil:
451461
""" add a value from the skew calc """
452462
cdef:
453463
float64_t y, t
@@ -469,6 +479,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
469479
compensation_xxx[0] = t - xxx[0] - y
470480
xxx[0] = t
471481

482+
# GH#42064, record num of same values to remove floating point artifacts
483+
if val == prev_value[0]:
484+
num_consecutive_same_value[0] += 1
485+
else:
486+
# reset to 1 (include current value itself)
487+
num_consecutive_same_value[0] = 1
488+
prev_value[0] = val
489+
472490

473491
cdef inline void remove_skew(float64_t val, int64_t *nobs,
474492
float64_t *x, float64_t *xx,
@@ -507,8 +525,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
507525
float64_t compensation_xx_add, compensation_xx_remove
508526
float64_t compensation_x_add, compensation_x_remove
509527
float64_t x, xx, xxx
528+
float64_t prev_value
510529
int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
511-
int64_t s, e
530+
int64_t s, e, num_consecutive_same_value
512531
ndarray[float64_t] output, mean_array, values_copy
513532
bint is_monotonic_increasing_bounds
514533

@@ -542,6 +561,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
542561
# never removed
543562
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
544563

564+
prev_value = values[s]
565+
num_consecutive_same_value = 0
566+
545567
compensation_xxx_add = compensation_xxx_remove = 0
546568
compensation_xx_add = compensation_xx_remove = 0
547569
compensation_x_add = compensation_x_remove = 0
@@ -550,7 +572,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
550572
for j in range(s, e):
551573
val = values_copy[j]
552574
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
553-
&compensation_xx_add, &compensation_xxx_add)
575+
&compensation_xx_add, &compensation_xxx_add,
576+
&num_consecutive_same_value, &prev_value)
554577

555578
else:
556579

@@ -566,9 +589,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
566589
for j in range(end[i - 1], e):
567590
val = values_copy[j]
568591
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
569-
&compensation_xx_add, &compensation_xxx_add)
592+
&compensation_xx_add, &compensation_xxx_add,
593+
&num_consecutive_same_value, &prev_value)
570594

571-
output[i] = calc_skew(minp, nobs, x, xx, xxx)
595+
output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
572596

573597
if not is_monotonic_increasing_bounds:
574598
nobs = 0
@@ -584,35 +608,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
584608

585609
cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
586610
float64_t x, float64_t xx,
587-
float64_t xxx, float64_t xxxx) nogil:
611+
float64_t xxx, float64_t xxxx,
612+
int64_t num_consecutive_same_value,
613+
) nogil:
588614
cdef:
589615
float64_t result, dnobs
590616
float64_t A, B, C, D, R, K
591617

592618
if nobs >= minp:
593-
dnobs = <float64_t>nobs
594-
A = x / dnobs
595-
R = A * A
596-
B = xx / dnobs - R
597-
R = R * A
598-
C = xxx / dnobs - R - 3 * A * B
599-
R = R * A
600-
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
601-
602-
# #18044: with uniform distribution, floating issue will
603-
# cause B != 0. and cause the result is a very
604-
# large number.
605-
#
606-
# in core/nanops.py nanskew/nankurt call the function
607-
# _zero_out_fperr(m2) to fix floating error.
608-
# if the variance is less than 1e-14, it could be
609-
# treat as zero, here we follow the original
610-
# skew/kurt behaviour to check B <= 1e-14
611-
if B <= 1e-14 or nobs < 4:
619+
if nobs < 4:
612620
result = NaN
621+
# GH 42064 46431
622+
# uniform case, force result to be -3.
623+
elif num_consecutive_same_value >= nobs:
624+
result = -3.
613625
else:
614-
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
615-
result = K / ((dnobs - 2.) * (dnobs - 3.))
626+
dnobs = <float64_t>nobs
627+
A = x / dnobs
628+
R = A * A
629+
B = xx / dnobs - R
630+
R = R * A
631+
C = xxx / dnobs - R - 3 * A * B
632+
R = R * A
633+
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
634+
635+
# #18044: with uniform distribution, floating issue will
636+
# cause B != 0. and cause the result is a very
637+
# large number.
638+
#
639+
# in core/nanops.py nanskew/nankurt call the function
640+
# _zero_out_fperr(m2) to fix floating error.
641+
# if the variance is less than 1e-14, it could be
642+
# treat as zero, here we follow the original
643+
# skew/kurt behaviour to check B <= 1e-14
644+
if B <= 1e-14:
645+
result = NaN
646+
else:
647+
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
648+
result = K / ((dnobs - 2.) * (dnobs - 3.))
616649
else:
617650
result = NaN
618651

@@ -625,7 +658,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
625658
float64_t *compensation_x,
626659
float64_t *compensation_xx,
627660
float64_t *compensation_xxx,
628-
float64_t *compensation_xxxx) nogil:
661+
float64_t *compensation_xxxx,
662+
int64_t *num_consecutive_same_value,
663+
float64_t *prev_value
664+
) nogil:
629665
""" add a value from the kurotic calc """
630666
cdef:
631667
float64_t y, t
@@ -651,6 +687,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
651687
compensation_xxxx[0] = t - xxxx[0] - y
652688
xxxx[0] = t
653689

690+
# GH#42064, record num of same values to remove floating point artifacts
691+
if val == prev_value[0]:
692+
num_consecutive_same_value[0] += 1
693+
else:
694+
# reset to 1 (include current value itself)
695+
num_consecutive_same_value[0] = 1
696+
prev_value[0] = val
697+
654698

655699
cdef inline void remove_kurt(float64_t val, int64_t *nobs,
656700
float64_t *x, float64_t *xx,
@@ -695,7 +739,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
695739
float64_t compensation_xx_remove, compensation_xx_add
696740
float64_t compensation_x_remove, compensation_x_add
697741
float64_t x, xx, xxx, xxxx
698-
int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
742+
float64_t prev_value
743+
int64_t nobs, s, e, num_consecutive_same_value
744+
int64_t N = len(start), V = len(values), nobs_mean = 0
699745
ndarray[float64_t] output, values_copy
700746
bint is_monotonic_increasing_bounds
701747

@@ -729,6 +775,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
729775
# never removed
730776
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
731777

778+
prev_value = values[s]
779+
num_consecutive_same_value = 0
780+
732781
compensation_xxxx_add = compensation_xxxx_remove = 0
733782
compensation_xxx_remove = compensation_xxx_add = 0
734783
compensation_xx_remove = compensation_xx_add = 0
@@ -738,7 +787,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
738787
for j in range(s, e):
739788
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
740789
&compensation_x_add, &compensation_xx_add,
741-
&compensation_xxx_add, &compensation_xxxx_add)
790+
&compensation_xxx_add, &compensation_xxxx_add,
791+
&num_consecutive_same_value, &prev_value)
742792

743793
else:
744794

@@ -754,9 +804,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
754804
for j in range(end[i - 1], e):
755805
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
756806
&compensation_x_add, &compensation_xx_add,
757-
&compensation_xxx_add, &compensation_xxxx_add)
807+
&compensation_xxx_add, &compensation_xxxx_add,
808+
&num_consecutive_same_value, &prev_value)
758809

759-
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
810+
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)
760811

761812
if not is_monotonic_increasing_bounds:
762813
nobs = 0

pandas/tests/window/test_rolling.py

+11
Original file line numberDiff line numberDiff line change
@@ -1769,3 +1769,14 @@ def test_step_not_integer_raises():
17691769
def test_step_not_positive_raises():
17701770
with pytest.raises(ValueError, match="step must be >= 0"):
17711771
DataFrame(range(2)).rolling(1, step=-1)
1772+
1773+
1774+
def test_rolling_skew_kurt_floating_artifacts():
1775+
# GH 42064 46431
1776+
1777+
sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
1778+
r = sr.rolling(4)
1779+
result = r.skew()
1780+
assert (result[-2:] == 0).all()
1781+
result = r.kurt()
1782+
assert (result[-2:] == -3).all()

pandas/tests/window/test_rolling_skew_kurt.py

+13-9
Original file line numberDiff line numberDiff line change
@@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func):
178178

179179
def test_rolling_skew_edge_cases(step):
180180

181-
all_nan = Series([np.NaN] * 5)[::step]
182-
181+
expected = Series([np.NaN] * 4 + [0.0])[::step]
183182
# yields all NaN (0 variance)
184183
d = Series([1] * 5)
185184
x = d.rolling(window=5, step=step).skew()
186-
tm.assert_series_equal(all_nan, x)
185+
# index 4 should be 0 as it contains 5 same obs
186+
tm.assert_series_equal(expected, x)
187187

188+
expected = Series([np.NaN] * 5)[::step]
188189
# yields all NaN (window too small)
189190
d = Series(np.random.randn(5))
190191
x = d.rolling(window=2, step=step).skew()
191-
tm.assert_series_equal(all_nan, x)
192+
tm.assert_series_equal(expected, x)
192193

193194
# yields [NaN, NaN, NaN, 0.177994, 1.548824]
194195
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step):
199200

200201
def test_rolling_kurt_edge_cases(step):
201202

202-
all_nan = Series([np.NaN] * 5)[::step]
203+
expected = Series([np.NaN] * 4 + [-3.0])[::step]
203204

204205
# yields all NaN (0 variance)
205206
d = Series([1] * 5)
206207
x = d.rolling(window=5, step=step).kurt()
207-
tm.assert_series_equal(all_nan, x)
208+
tm.assert_series_equal(expected, x)
208209

209210
# yields all NaN (window too small)
211+
expected = Series([np.NaN] * 5)[::step]
210212
d = Series(np.random.randn(5))
211213
x = d.rolling(window=3, step=step).kurt()
212-
tm.assert_series_equal(all_nan, x)
214+
tm.assert_series_equal(expected, x)
213215

214216
# yields [NaN, NaN, NaN, 1.224307, 2.671499]
215217
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -221,10 +223,12 @@ def test_rolling_kurt_edge_cases(step):
221223
def test_rolling_skew_eq_value_fperr(step):
222224
# #18804 all rolling skew for all equal values should return Nan
223225
a = Series([1.1] * 15).rolling(window=10, step=step).skew()
224-
assert np.isnan(a).all()
226+
assert (a[a.index >= 9] == 0).all()
227+
assert a[a.index < 9].isna().all()
225228

226229

227230
def test_rolling_kurt_eq_value_fperr(step):
228231
# #18804 all rolling kurt for all equal values should return Nan
229232
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
230-
assert np.isnan(a).all()
233+
assert (a[a.index >= 9] == -3).all()
234+
assert a[a.index < 9].isna().all()

0 commit comments

Comments
 (0)