Skip to content

BUG: Avoid cancellations in nanskew/nankurt. #12121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,5 @@ of columns didn't match the number of series provided (:issue:`12039`).

- Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`)
- Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`)

- Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`)
88 changes: 62 additions & 26 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,13 @@ def nanargmin(values, axis=None, skipna=True):

@disallow('M8', 'm8')
def nanskew(values, axis=None, skipna=True):
""" Compute the sample skewness.

The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G1. The algorithm computes this coefficient directly
from the second and third central moment.

"""

mask = isnull(values)
if not is_float_dtype(values.dtype):
Expand All @@ -486,32 +493,48 @@ def nanskew(values, axis=None, skipna=True):
values = values.copy()
np.putmask(values, mask, 0)

typ = values.dtype.type
A = values.sum(axis) / count
B = (values**2).sum(axis) / count - A**typ(2)
C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)

adjusted = values - mean
if skipna:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted3 = adjusted2 * adjusted
m2 = adjusted2.sum(axis, dtype=np.float64)
m3 = adjusted3.sum(axis, dtype=np.float64)

# floating point error
B = _zero_out_fperr(B)
C = _zero_out_fperr(C)
m2 = _zero_out_fperr(m2)
m3 = _zero_out_fperr(m3)

result = ((np.sqrt(count * count - count) * C) /
((count - typ(2)) * np.sqrt(B)**typ(3)))
result = (count * (count-1) ** 0.5 / (count-2)) * (m3 / m2 ** 1.5)

dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)

if isinstance(result, np.ndarray):
result = np.where(B == 0, 0, result)
result = np.where(m2 == 0, 0, result)
result[count < 3] = np.nan
return result
else:
result = 0 if B == 0 else result
result = 0 if m2 == 0 else result
if count < 3:
return np.nan
return result


@disallow('M8', 'm8')
def nankurt(values, axis=None, skipna=True):
""" Compute the sample skewness.

The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G2, computed directly from the second and fourth
central moment.

"""
mask = isnull(values)
if not is_float_dtype(values.dtype):
values = values.astype('f8')
Expand All @@ -523,30 +546,43 @@ def nankurt(values, axis=None, skipna=True):
values = values.copy()
np.putmask(values, mask, 0)

typ = values.dtype.type
A = values.sum(axis) / count
B = (values**2).sum(axis) / count - A**typ(2)
C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B
D = ((values**4).sum(axis) / count - A**typ(4) -
typ(6) * B * A * A - typ(4) * C * A)
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)

adjusted = values - mean
if skipna:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted4 = adjusted2 ** 2
m2 = adjusted2.sum(axis, dtype=np.float64)
m4 = adjusted4.sum(axis, dtype=np.float64)

adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
numer = count * (count + 1) * (count - 1) * m4
denom = (count - 2) * (count - 3) * m2**2
result = numer / denom - adj

B = _zero_out_fperr(B)
D = _zero_out_fperr(D)
# floating point error
numer = _zero_out_fperr(numer)
denom = _zero_out_fperr(denom)

if not isinstance(B, np.ndarray):
# if B is a scalar, check these corner cases first before doing
# division
if not isinstance(denom, np.ndarray):
# if ``denom`` is a scalar, check these corner cases first before
# doing division
if count < 4:
return np.nan
if B == 0:
if denom == 0:
return 0

result = (((count * count - typ(1)) * D / (B * B) - typ(3) *
((count - typ(1))**typ(2))) / ((count - typ(2)) *
(count - typ(3))))
result = numer / denom - adj

dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)

if isinstance(result, np.ndarray):
result = np.where(B == 0, 0, result)
result = np.where(denom == 0, 0, result)
result[count < 4] = np.nan

return result
Expand Down
100 changes: 100 additions & 0 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,106 @@ def prng(self):
return np.random.RandomState(1234)


class TestNanskewFixedValues(tm.TestCase):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add the issue number here as a comment

# xref GH 11974

def setUp(self):
# Test data + skewness value (computed with scipy.stats.skew)
self.samples = np.sin(np.linspace(0, 1, 200))
self.actual_skew = -0.1875895205961754

def test_constant_series(self):
# xref GH 11974
for val in [3075.2, 3075.3, 3075.5]:
data = val * np.ones(300)
skew = nanops.nanskew(data)
self.assertEqual(skew, 0.0)

def test_all_finite(self):
alpha, beta = 0.3, 0.1
left_tailed = self.prng.beta(alpha, beta, size=100)
self.assertLess(nanops.nanskew(left_tailed), 0)

alpha, beta = 0.1, 0.3
right_tailed = self.prng.beta(alpha, beta, size=100)
self.assertGreater(nanops.nanskew(right_tailed), 0)

def test_ground_truth(self):
skew = nanops.nanskew(self.samples)
self.assertAlmostEqual(skew, self.actual_skew)

def test_axis(self):
samples = np.vstack([self.samples,
np.nan * np.ones(len(self.samples))])
skew = nanops.nanskew(samples, axis=1)
tm.assert_almost_equal(skew, [self.actual_skew, np.nan])

def test_nans(self):
samples = np.hstack([self.samples, np.nan])
skew = nanops.nanskew(samples, skipna=False)
self.assertTrue(np.isnan(skew))

def test_nans_skipna(self):
samples = np.hstack([self.samples, np.nan])
skew = nanops.nanskew(samples, skipna=True)
tm.assert_almost_equal(skew, self.actual_skew)

@property
def prng(self):
return np.random.RandomState(1234)


class TestNankurtFixedValues(tm.TestCase):

# xref GH 11974

def setUp(self):
# Test data + kurtosis value (computed with scipy.stats.kurtosis)
self.samples = np.sin(np.linspace(0, 1, 200))
self.actual_kurt = -1.2058303433799713

def test_constant_series(self):
# xref GH 11974
for val in [3075.2, 3075.3, 3075.5]:
data = val * np.ones(300)
kurt = nanops.nankurt(data)
self.assertEqual(kurt, 0.0)

def test_all_finite(self):
alpha, beta = 0.3, 0.1
left_tailed = self.prng.beta(alpha, beta, size=100)
self.assertLess(nanops.nankurt(left_tailed), 0)

alpha, beta = 0.1, 0.3
right_tailed = self.prng.beta(alpha, beta, size=100)
self.assertGreater(nanops.nankurt(right_tailed), 0)

def test_ground_truth(self):
kurt = nanops.nankurt(self.samples)
self.assertAlmostEqual(kurt, self.actual_kurt)

def test_axis(self):
samples = np.vstack([self.samples,
np.nan * np.ones(len(self.samples))])
kurt = nanops.nankurt(samples, axis=1)
tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan])

def test_nans(self):
samples = np.hstack([self.samples, np.nan])
kurt = nanops.nankurt(samples, skipna=False)
self.assertTrue(np.isnan(kurt))

def test_nans_skipna(self):
samples = np.hstack([self.samples, np.nan])
kurt = nanops.nankurt(samples, skipna=True)
tm.assert_almost_equal(kurt, self.actual_kurt)

@property
def prng(self):
return np.random.RandomState(1234)


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
Expand Down