Skip to content

Commit 419e9a2

Browse files
Joris Vankerschaverjreback
Joris Vankerschaver
authored andcommitted
BUG: Avoid cancellations in nanskew/nankurt.
Closes #11794 Closes #12121
1 parent 2c03c5e commit 419e9a2

File tree

3 files changed

+164
-26
lines changed

3 files changed

+164
-26
lines changed

doc/source/whatsnew/v0.18.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -541,3 +541,5 @@ of columns didn't match the number of series provided (:issue:`12039`).
541541

542542
- Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`)
543543
- Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`)
544+
545+
- Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`)

pandas/core/nanops.py

+62-26
Original file line numberDiff line numberDiff line change
@@ -474,6 +474,13 @@ def nanargmin(values, axis=None, skipna=True):
474474

475475
@disallow('M8', 'm8')
476476
def nanskew(values, axis=None, skipna=True):
477+
""" Compute the sample skewness.
478+
479+
The statistic computed here is the adjusted Fisher-Pearson standardized
480+
moment coefficient G1. The algorithm computes this coefficient directly
481+
from the second and third central moment.
482+
483+
"""
477484

478485
mask = isnull(values)
479486
if not is_float_dtype(values.dtype):
@@ -486,32 +493,48 @@ def nanskew(values, axis=None, skipna=True):
486493
values = values.copy()
487494
np.putmask(values, mask, 0)
488495

489-
typ = values.dtype.type
490-
A = values.sum(axis) / count
491-
B = (values**2).sum(axis) / count - A**typ(2)
492-
C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
496+
mean = values.sum(axis, dtype=np.float64) / count
497+
if axis is not None:
498+
mean = np.expand_dims(mean, axis)
499+
500+
adjusted = values - mean
501+
if skipna:
502+
np.putmask(adjusted, mask, 0)
503+
adjusted2 = adjusted ** 2
504+
adjusted3 = adjusted2 * adjusted
505+
m2 = adjusted2.sum(axis, dtype=np.float64)
506+
m3 = adjusted3.sum(axis, dtype=np.float64)
493507

494508
# floating point error
495-
B = _zero_out_fperr(B)
496-
C = _zero_out_fperr(C)
509+
m2 = _zero_out_fperr(m2)
510+
m3 = _zero_out_fperr(m3)
497511

498-
result = ((np.sqrt(count * count - count) * C) /
499-
((count - typ(2)) * np.sqrt(B)**typ(3)))
512+
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
513+
514+
dtype = values.dtype
515+
if is_float_dtype(dtype):
516+
result = result.astype(dtype)
500517

501518
if isinstance(result, np.ndarray):
502-
result = np.where(B == 0, 0, result)
519+
result = np.where(m2 == 0, 0, result)
503520
result[count < 3] = np.nan
504521
return result
505522
else:
506-
result = 0 if B == 0 else result
523+
result = 0 if m2 == 0 else result
507524
if count < 3:
508525
return np.nan
509526
return result
510527

511528

512529
@disallow('M8', 'm8')
513530
def nankurt(values, axis=None, skipna=True):
531+
""" Compute the sample skewness.
532+
533+
The statistic computed here is the adjusted Fisher-Pearson standardized
534+
moment coefficient G2, computed directly from the second and fourth
535+
central moment.
514536
537+
"""
515538
mask = isnull(values)
516539
if not is_float_dtype(values.dtype):
517540
values = values.astype('f8')
@@ -523,30 +546,43 @@ def nankurt(values, axis=None, skipna=True):
523546
values = values.copy()
524547
np.putmask(values, mask, 0)
525548

526-
typ = values.dtype.type
527-
A = values.sum(axis) / count
528-
B = (values**2).sum(axis) / count - A**typ(2)
529-
C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
530-
D = ((values**4).sum(axis) / count - A**typ(4) -
531-
typ(6) * B * A * A - typ(4) * C * A)
549+
mean = values.sum(axis, dtype=np.float64) / count
550+
if axis is not None:
551+
mean = np.expand_dims(mean, axis)
552+
553+
adjusted = values - mean
554+
if skipna:
555+
np.putmask(adjusted, mask, 0)
556+
adjusted2 = adjusted ** 2
557+
adjusted4 = adjusted2 ** 2
558+
m2 = adjusted2.sum(axis, dtype=np.float64)
559+
m4 = adjusted4.sum(axis, dtype=np.float64)
560+
561+
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
562+
numer = count * (count + 1) * (count - 1) * m4
563+
denom = (count - 2) * (count - 3) * m2**2
564+
result = numer / denom - adj
532565

533-
B = _zero_out_fperr(B)
534-
D = _zero_out_fperr(D)
566+
# floating point error
567+
numer = _zero_out_fperr(numer)
568+
denom = _zero_out_fperr(denom)
535569

536-
if not isinstance(B, np.ndarray):
537-
# if B is a scalar, check these corner cases first before doing
538-
# division
570+
if not isinstance(denom, np.ndarray):
571+
# if ``denom`` is a scalar, check these corner cases first before
572+
# doing division
539573
if count < 4:
540574
return np.nan
541-
if B == 0:
575+
if denom == 0:
542576
return 0
543577

544-
result = (((count * count - typ(1)) * D / (B * B) - typ(3) *
545-
((count - typ(1))**typ(2))) / ((count - typ(2)) *
546-
(count - typ(3))))
578+
result = numer / denom - adj
579+
580+
dtype = values.dtype
581+
if is_float_dtype(dtype):
582+
result = result.astype(dtype)
547583

548584
if isinstance(result, np.ndarray):
549-
result = np.where(B == 0, 0, result)
585+
result = np.where(denom == 0, 0, result)
550586
result[count < 4] = np.nan
551587

552588
return result

pandas/tests/test_nanops.py

+100
Original file line numberDiff line numberDiff line change
@@ -888,6 +888,106 @@ def prng(self):
888888
return np.random.RandomState(1234)
889889

890890

891+
class TestNanskewFixedValues(tm.TestCase):
892+
893+
# xref GH 11974
894+
895+
def setUp(self):
896+
# Test data + skewness value (computed with scipy.stats.skew)
897+
self.samples = np.sin(np.linspace(0, 1, 200))
898+
self.actual_skew = -0.1875895205961754
899+
900+
def test_constant_series(self):
901+
# xref GH 11974
902+
for val in [3075.2, 3075.3, 3075.5]:
903+
data = val * np.ones(300)
904+
skew = nanops.nanskew(data)
905+
self.assertEqual(skew, 0.0)
906+
907+
def test_all_finite(self):
908+
alpha, beta = 0.3, 0.1
909+
left_tailed = self.prng.beta(alpha, beta, size=100)
910+
self.assertLess(nanops.nanskew(left_tailed), 0)
911+
912+
alpha, beta = 0.1, 0.3
913+
right_tailed = self.prng.beta(alpha, beta, size=100)
914+
self.assertGreater(nanops.nanskew(right_tailed), 0)
915+
916+
def test_ground_truth(self):
917+
skew = nanops.nanskew(self.samples)
918+
self.assertAlmostEqual(skew, self.actual_skew)
919+
920+
def test_axis(self):
921+
samples = np.vstack([self.samples,
922+
np.nan * np.ones(len(self.samples))])
923+
skew = nanops.nanskew(samples, axis=1)
924+
tm.assert_almost_equal(skew, [self.actual_skew, np.nan])
925+
926+
def test_nans(self):
927+
samples = np.hstack([self.samples, np.nan])
928+
skew = nanops.nanskew(samples, skipna=False)
929+
self.assertTrue(np.isnan(skew))
930+
931+
def test_nans_skipna(self):
932+
samples = np.hstack([self.samples, np.nan])
933+
skew = nanops.nanskew(samples, skipna=True)
934+
tm.assert_almost_equal(skew, self.actual_skew)
935+
936+
@property
937+
def prng(self):
938+
return np.random.RandomState(1234)
939+
940+
941+
class TestNankurtFixedValues(tm.TestCase):
942+
943+
# xref GH 11974
944+
945+
def setUp(self):
946+
# Test data + kurtosis value (computed with scipy.stats.kurtosis)
947+
self.samples = np.sin(np.linspace(0, 1, 200))
948+
self.actual_kurt = -1.2058303433799713
949+
950+
def test_constant_series(self):
951+
# xref GH 11974
952+
for val in [3075.2, 3075.3, 3075.5]:
953+
data = val * np.ones(300)
954+
kurt = nanops.nankurt(data)
955+
self.assertEqual(kurt, 0.0)
956+
957+
def test_all_finite(self):
958+
alpha, beta = 0.3, 0.1
959+
left_tailed = self.prng.beta(alpha, beta, size=100)
960+
self.assertLess(nanops.nankurt(left_tailed), 0)
961+
962+
alpha, beta = 0.1, 0.3
963+
right_tailed = self.prng.beta(alpha, beta, size=100)
964+
self.assertGreater(nanops.nankurt(right_tailed), 0)
965+
966+
def test_ground_truth(self):
967+
kurt = nanops.nankurt(self.samples)
968+
self.assertAlmostEqual(kurt, self.actual_kurt)
969+
970+
def test_axis(self):
971+
samples = np.vstack([self.samples,
972+
np.nan * np.ones(len(self.samples))])
973+
kurt = nanops.nankurt(samples, axis=1)
974+
tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan])
975+
976+
def test_nans(self):
977+
samples = np.hstack([self.samples, np.nan])
978+
kurt = nanops.nankurt(samples, skipna=False)
979+
self.assertTrue(np.isnan(kurt))
980+
981+
def test_nans_skipna(self):
982+
samples = np.hstack([self.samples, np.nan])
983+
kurt = nanops.nankurt(samples, skipna=True)
984+
tm.assert_almost_equal(kurt, self.actual_kurt)
985+
986+
@property
987+
def prng(self):
988+
return np.random.RandomState(1234)
989+
990+
891991
if __name__ == '__main__':
892992
import nose
893993
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'

0 commit comments

Comments
 (0)