Skip to content

Commit 6f33c46

Browse files
audersonauderson
and
auderson
authored
EHN: add same value count for skew & kurt (#46717)
* add same value count for skew & kurt * add same value count for skew & kurt * update doc * merged * restore old comments Co-authored-by: auderson <[email protected]>
1 parent e8308ce commit 6f33c46

File tree

4 files changed

+113
-44
lines changed

4 files changed

+113
-44
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,7 @@ Groupby/resample/rolling
732732
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
733733
- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
734734
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
735+
- Bug in :meth:`Rolling.skew` and :meth:`Rolling.kurt` would give NaN with window of same values (:issue:`30993`)
735736
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
736737
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
737738
- Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)

pandas/_libs/window/aggregations.pyx

+86-35
Original file line numberDiff line numberDiff line change
@@ -455,8 +455,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
455455

456456

457457
cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
458-
float64_t x, float64_t xx,
459-
float64_t xxx) nogil:
458+
float64_t x, float64_t xx, float64_t xxx,
459+
int64_t num_consecutive_same_value
460+
) nogil:
460461
cdef:
461462
float64_t result, dnobs
462463
float64_t A, B, C, R
@@ -467,6 +468,12 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
467468
B = xx / dnobs - A * A
468469
C = xxx / dnobs - A * A * A - 3 * A * B
469470

471+
if nobs < 3:
472+
result = NaN
473+
# GH 42064 46431
474+
# uniform case, force result to be 0
475+
elif num_consecutive_same_value >= nobs:
476+
result = 0.0
470477
# #18044: with uniform distribution, floating issue will
471478
# cause B != 0. and cause the result is a very
472479
# large number.
@@ -476,7 +483,7 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
476483
# if the variance is less than 1e-14, it could be
477484
# treat as zero, here we follow the original
478485
# skew/kurt behaviour to check B <= 1e-14
479-
if B <= 1e-14 or nobs < 3:
486+
elif B <= 1e-14:
480487
result = NaN
481488
else:
482489
R = sqrt(B)
@@ -493,7 +500,10 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
493500
float64_t *xxx,
494501
float64_t *compensation_x,
495502
float64_t *compensation_xx,
496-
float64_t *compensation_xxx) nogil:
503+
float64_t *compensation_xxx,
504+
int64_t *num_consecutive_same_value,
505+
float64_t *prev_value,
506+
) nogil:
497507
""" add a value from the skew calc """
498508
cdef:
499509
float64_t y, t
@@ -515,6 +525,14 @@ cdef inline void add_skew(float64_t val, int64_t *nobs,
515525
compensation_xxx[0] = t - xxx[0] - y
516526
xxx[0] = t
517527

528+
# GH#42064, record num of same values to remove floating point artifacts
529+
if val == prev_value[0]:
530+
num_consecutive_same_value[0] += 1
531+
else:
532+
# reset to 1 (include current value itself)
533+
num_consecutive_same_value[0] = 1
534+
prev_value[0] = val
535+
518536

519537
cdef inline void remove_skew(float64_t val, int64_t *nobs,
520538
float64_t *x, float64_t *xx,
@@ -553,8 +571,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
553571
float64_t compensation_xx_add, compensation_xx_remove
554572
float64_t compensation_x_add, compensation_x_remove
555573
float64_t x, xx, xxx
574+
float64_t prev_value
556575
int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
557-
int64_t s, e
576+
int64_t s, e, num_consecutive_same_value
558577
ndarray[float64_t] output, mean_array, values_copy
559578
bint is_monotonic_increasing_bounds
560579

@@ -588,6 +607,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
588607
# never removed
589608
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
590609

610+
prev_value = values[s]
611+
num_consecutive_same_value = 0
612+
591613
compensation_xxx_add = compensation_xxx_remove = 0
592614
compensation_xx_add = compensation_xx_remove = 0
593615
compensation_x_add = compensation_x_remove = 0
@@ -596,7 +618,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
596618
for j in range(s, e):
597619
val = values_copy[j]
598620
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
599-
&compensation_xx_add, &compensation_xxx_add)
621+
&compensation_xx_add, &compensation_xxx_add,
622+
&num_consecutive_same_value, &prev_value)
600623

601624
else:
602625

@@ -612,9 +635,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
612635
for j in range(end[i - 1], e):
613636
val = values_copy[j]
614637
add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
615-
&compensation_xx_add, &compensation_xxx_add)
638+
&compensation_xx_add, &compensation_xxx_add,
639+
&num_consecutive_same_value, &prev_value)
616640

617-
output[i] = calc_skew(minp, nobs, x, xx, xxx)
641+
output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
618642

619643
if not is_monotonic_increasing_bounds:
620644
nobs = 0
@@ -630,35 +654,44 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
630654

631655
cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
632656
float64_t x, float64_t xx,
633-
float64_t xxx, float64_t xxxx) nogil:
657+
float64_t xxx, float64_t xxxx,
658+
int64_t num_consecutive_same_value,
659+
) nogil:
634660
cdef:
635661
float64_t result, dnobs
636662
float64_t A, B, C, D, R, K
637663

638664
if nobs >= minp:
639-
dnobs = <float64_t>nobs
640-
A = x / dnobs
641-
R = A * A
642-
B = xx / dnobs - R
643-
R = R * A
644-
C = xxx / dnobs - R - 3 * A * B
645-
R = R * A
646-
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
647-
648-
# #18044: with uniform distribution, floating issue will
649-
# cause B != 0. and cause the result is a very
650-
# large number.
651-
#
652-
# in core/nanops.py nanskew/nankurt call the function
653-
# _zero_out_fperr(m2) to fix floating error.
654-
# if the variance is less than 1e-14, it could be
655-
# treat as zero, here we follow the original
656-
# skew/kurt behaviour to check B <= 1e-14
657-
if B <= 1e-14 or nobs < 4:
665+
if nobs < 4:
658666
result = NaN
667+
# GH 42064 46431
668+
# uniform case, force result to be -3.
669+
elif num_consecutive_same_value >= nobs:
670+
result = -3.
659671
else:
660-
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
661-
result = K / ((dnobs - 2.) * (dnobs - 3.))
672+
dnobs = <float64_t>nobs
673+
A = x / dnobs
674+
R = A * A
675+
B = xx / dnobs - R
676+
R = R * A
677+
C = xxx / dnobs - R - 3 * A * B
678+
R = R * A
679+
D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
680+
681+
# #18044: with uniform distribution, floating issue will
682+
# cause B != 0. and cause the result is a very
683+
# large number.
684+
#
685+
# in core/nanops.py nanskew/nankurt call the function
686+
# _zero_out_fperr(m2) to fix floating error.
687+
# if the variance is less than 1e-14, it could be
688+
# treat as zero, here we follow the original
689+
# skew/kurt behaviour to check B <= 1e-14
690+
if B <= 1e-14:
691+
result = NaN
692+
else:
693+
K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
694+
result = K / ((dnobs - 2.) * (dnobs - 3.))
662695
else:
663696
result = NaN
664697

@@ -671,7 +704,10 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
671704
float64_t *compensation_x,
672705
float64_t *compensation_xx,
673706
float64_t *compensation_xxx,
674-
float64_t *compensation_xxxx) nogil:
707+
float64_t *compensation_xxxx,
708+
int64_t *num_consecutive_same_value,
709+
float64_t *prev_value
710+
) nogil:
675711
""" add a value from the kurotic calc """
676712
cdef:
677713
float64_t y, t
@@ -697,6 +733,14 @@ cdef inline void add_kurt(float64_t val, int64_t *nobs,
697733
compensation_xxxx[0] = t - xxxx[0] - y
698734
xxxx[0] = t
699735

736+
# GH#42064, record num of same values to remove floating point artifacts
737+
if val == prev_value[0]:
738+
num_consecutive_same_value[0] += 1
739+
else:
740+
# reset to 1 (include current value itself)
741+
num_consecutive_same_value[0] = 1
742+
prev_value[0] = val
743+
700744

701745
cdef inline void remove_kurt(float64_t val, int64_t *nobs,
702746
float64_t *x, float64_t *xx,
@@ -741,7 +785,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
741785
float64_t compensation_xx_remove, compensation_xx_add
742786
float64_t compensation_x_remove, compensation_x_add
743787
float64_t x, xx, xxx, xxxx
744-
int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
788+
float64_t prev_value
789+
int64_t nobs, s, e, num_consecutive_same_value
790+
int64_t N = len(start), V = len(values), nobs_mean = 0
745791
ndarray[float64_t] output, values_copy
746792
bint is_monotonic_increasing_bounds
747793

@@ -775,6 +821,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
775821
# never removed
776822
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
777823

824+
prev_value = values[s]
825+
num_consecutive_same_value = 0
826+
778827
compensation_xxxx_add = compensation_xxxx_remove = 0
779828
compensation_xxx_remove = compensation_xxx_add = 0
780829
compensation_xx_remove = compensation_xx_add = 0
@@ -784,7 +833,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
784833
for j in range(s, e):
785834
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
786835
&compensation_x_add, &compensation_xx_add,
787-
&compensation_xxx_add, &compensation_xxxx_add)
836+
&compensation_xxx_add, &compensation_xxxx_add,
837+
&num_consecutive_same_value, &prev_value)
788838

789839
else:
790840

@@ -800,9 +850,10 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
800850
for j in range(end[i - 1], e):
801851
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
802852
&compensation_x_add, &compensation_xx_add,
803-
&compensation_xxx_add, &compensation_xxxx_add)
853+
&compensation_xxx_add, &compensation_xxxx_add,
854+
&num_consecutive_same_value, &prev_value)
804855

805-
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
856+
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx, num_consecutive_same_value)
806857

807858
if not is_monotonic_increasing_bounds:
808859
nobs = 0

pandas/tests/window/test_rolling.py

+11
Original file line numberDiff line numberDiff line change
@@ -1860,3 +1860,14 @@ def test_rolling_mean_sum_floating_artifacts():
18601860
assert (result[-3:] == 0).all()
18611861
result = r.sum()
18621862
assert (result[-3:] == 0).all()
1863+
1864+
1865+
def test_rolling_skew_kurt_floating_artifacts():
1866+
# GH 42064 46431
1867+
1868+
sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
1869+
r = sr.rolling(4)
1870+
result = r.skew()
1871+
assert (result[-2:] == 0).all()
1872+
result = r.kurt()
1873+
assert (result[-2:] == -3).all()

pandas/tests/window/test_rolling_skew_kurt.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -178,17 +178,18 @@ def test_center_reindex_frame(frame, roll_func):
178178

179179
def test_rolling_skew_edge_cases(step):
180180

181-
all_nan = Series([np.NaN] * 5)[::step]
182-
181+
expected = Series([np.NaN] * 4 + [0.0])[::step]
183182
# yields all NaN (0 variance)
184183
d = Series([1] * 5)
185184
x = d.rolling(window=5, step=step).skew()
186-
tm.assert_series_equal(all_nan, x)
185+
# index 4 should be 0 as it contains 5 same obs
186+
tm.assert_series_equal(expected, x)
187187

188+
expected = Series([np.NaN] * 5)[::step]
188189
# yields all NaN (window too small)
189190
d = Series(np.random.randn(5))
190191
x = d.rolling(window=2, step=step).skew()
191-
tm.assert_series_equal(all_nan, x)
192+
tm.assert_series_equal(expected, x)
192193

193194
# yields [NaN, NaN, NaN, 0.177994, 1.548824]
194195
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -199,17 +200,18 @@ def test_rolling_skew_edge_cases(step):
199200

200201
def test_rolling_kurt_edge_cases(step):
201202

202-
all_nan = Series([np.NaN] * 5)[::step]
203+
expected = Series([np.NaN] * 4 + [-3.0])[::step]
203204

204205
# yields all NaN (0 variance)
205206
d = Series([1] * 5)
206207
x = d.rolling(window=5, step=step).kurt()
207-
tm.assert_series_equal(all_nan, x)
208+
tm.assert_series_equal(expected, x)
208209

209210
# yields all NaN (window too small)
211+
expected = Series([np.NaN] * 5)[::step]
210212
d = Series(np.random.randn(5))
211213
x = d.rolling(window=3, step=step).kurt()
212-
tm.assert_series_equal(all_nan, x)
214+
tm.assert_series_equal(expected, x)
213215

214216
# yields [NaN, NaN, NaN, 1.224307, 2.671499]
215217
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
@@ -220,11 +222,15 @@ def test_rolling_kurt_edge_cases(step):
220222

221223
def test_rolling_skew_eq_value_fperr(step):
222224
# #18804 all rolling skew for all equal values should return Nan
225+
# #46717 update: all equal values should return 0 instead of NaN
223226
a = Series([1.1] * 15).rolling(window=10, step=step).skew()
224-
assert np.isnan(a).all()
227+
assert (a[a.index >= 9] == 0).all()
228+
assert a[a.index < 9].isna().all()
225229

226230

227231
def test_rolling_kurt_eq_value_fperr(step):
228232
# #18804 all rolling kurt for all equal values should return Nan
233+
# #46717 update: all equal values should return -3 instead of NaN
229234
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
230-
assert np.isnan(a).all()
235+
assert (a[a.index >= 9] == -3).all()
236+
assert a[a.index < 9].isna().all()

0 commit comments

Comments
 (0)