diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 75a38544fb8eb..d7016af6014fc 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -540,3 +540,5 @@ of columns didn't match the number of series provided (:issue:`12039`). - Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`) - Big in ``.style`` indexes and multi-indexes not appearing (:issue:`11655`) + +- Bug in ``.skew`` and ``.kurt`` due to roundoff error for highly similar values (:issue:`11974`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 7764d6e1e1fa9..b5e4e45991a49 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -474,6 +474,13 @@ def nanargmin(values, axis=None, skipna=True): @disallow('M8', 'm8') def nanskew(values, axis=None, skipna=True): + """ Compute the sample skewness. + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G1. The algorithm computes this coefficient directly + from the second and third central moment. + + """ mask = isnull(values) if not is_float_dtype(values.dtype): @@ -486,24 +493,34 @@ def nanskew(values, axis=None, skipna=True): values = values.copy() np.putmask(values, mask, 0) - typ = values.dtype.type - A = values.sum(axis) / count - B = (values**2).sum(axis) / count - A**typ(2) - C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted ** 2 + adjusted3 = adjusted2 * adjusted + m2 = adjusted2.sum(axis, dtype=np.float64) + m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error - B = _zero_out_fperr(B) - C = _zero_out_fperr(C) + m2 = _zero_out_fperr(m2) + m3 = _zero_out_fperr(m3) - result = ((np.sqrt(count * count - count) * C) / - ((count - typ(2)) * np.sqrt(B)**typ(3))) + result = (count * (count-1) ** 0.5 / (count-2)) * (m3 / m2 ** 1.5) + + dtype = values.dtype + if is_float_dtype(dtype): + result = result.astype(dtype) if isinstance(result, np.ndarray): - result = np.where(B == 0, 0, result) + result = np.where(m2 == 0, 0, result) result[count < 3] = np.nan return result else: - result = 0 if B == 0 else result + result = 0 if m2 == 0 else result if count < 3: return np.nan return result @@ -511,7 +528,13 @@ def nanskew(values, axis=None, skipna=True): @disallow('M8', 'm8') def nankurt(values, axis=None, skipna=True): + """ Compute the sample skewness. + + The statistic computed here is the adjusted Fisher-Pearson standardized + moment coefficient G2, computed directly from the second and fourth + central moment. + """ mask = isnull(values) if not is_float_dtype(values.dtype): values = values.astype('f8') @@ -523,30 +546,43 @@ def nankurt(values, axis=None, skipna=True): values = values.copy() np.putmask(values, mask, 0) - typ = values.dtype.type - A = values.sum(axis) / count - B = (values**2).sum(axis) / count - A**typ(2) - C = (values**3).sum(axis) / count - A**typ(3) - typ(3) * A * B - D = ((values**4).sum(axis) / count - A**typ(4) - - typ(6) * B * A * A - typ(4) * C * A) + mean = values.sum(axis, dtype=np.float64) / count + if axis is not None: + mean = np.expand_dims(mean, axis) + + adjusted = values - mean + if skipna: + np.putmask(adjusted, mask, 0) + adjusted2 = adjusted ** 2 + adjusted4 = adjusted2 ** 2 + m2 = adjusted2.sum(axis, dtype=np.float64) + m4 = adjusted4.sum(axis, dtype=np.float64) + + adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + numer = count * (count + 1) * (count - 1) * m4 + denom = (count - 2) * (count - 3) * m2**2 + result = numer / denom - adj - B = _zero_out_fperr(B) - D = _zero_out_fperr(D) + # floating point error + numer = _zero_out_fperr(numer) + denom = _zero_out_fperr(denom) - if not isinstance(B, np.ndarray): - # if B is a scalar, check these corner cases first before doing - # division + if not isinstance(denom, np.ndarray): + # if ``denom`` is a scalar, check these corner cases first before + # doing division if count < 4: return np.nan - if B == 0: + if denom == 0: return 0 - result = (((count * count - typ(1)) * D / (B * B) - typ(3) * - ((count - typ(1))**typ(2))) / ((count - typ(2)) * - (count - typ(3)))) + result = numer / denom - adj + + dtype = values.dtype + if is_float_dtype(dtype): + result = result.astype(dtype) if isinstance(result, np.ndarray): - result = np.where(B == 0, 0, result) + result = np.where(denom == 0, 0, result) result[count < 4] = np.nan return result diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ecd3fa6ed53ee..ca5246ba98f89 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -888,6 +888,106 @@ def prng(self): return np.random.RandomState(1234) +class TestNanskewFixedValues(tm.TestCase): + + # xref GH 11974 + + def setUp(self): + # Test data + skewness value (computed with scipy.stats.skew) + self.samples = np.sin(np.linspace(0, 1, 200)) + self.actual_skew = -0.1875895205961754 + + def test_constant_series(self): + # xref GH 11974 + for val in [3075.2, 3075.3, 3075.5]: + data = val * np.ones(300) + skew = nanops.nanskew(data) + self.assertEqual(skew, 0.0) + + def test_all_finite(self): + alpha, beta = 0.3, 0.1 + left_tailed = self.prng.beta(alpha, beta, size=100) + self.assertLess(nanops.nanskew(left_tailed), 0) + + alpha, beta = 0.1, 0.3 + right_tailed = self.prng.beta(alpha, beta, size=100) + self.assertGreater(nanops.nanskew(right_tailed), 0) + + def test_ground_truth(self): + skew = nanops.nanskew(self.samples) + self.assertAlmostEqual(skew, self.actual_skew) + + def test_axis(self): + samples = np.vstack([self.samples, + np.nan * np.ones(len(self.samples))]) + skew = nanops.nanskew(samples, axis=1) + tm.assert_almost_equal(skew, [self.actual_skew, np.nan]) + + def test_nans(self): + samples = np.hstack([self.samples, np.nan]) + skew = nanops.nanskew(samples, skipna=False) + self.assertTrue(np.isnan(skew)) + + def test_nans_skipna(self): + samples = np.hstack([self.samples, np.nan]) + skew = nanops.nanskew(samples, skipna=True) + tm.assert_almost_equal(skew, self.actual_skew) + + @property + def prng(self): + return np.random.RandomState(1234) + + +class TestNankurtFixedValues(tm.TestCase): + + # xref GH 11974 + + def setUp(self): + # Test data + kurtosis value (computed with scipy.stats.kurtosis) + self.samples = np.sin(np.linspace(0, 1, 200)) + self.actual_kurt = -1.2058303433799713 + + def test_constant_series(self): + # xref GH 11974 + for val in [3075.2, 3075.3, 3075.5]: + data = val * np.ones(300) + kurt = nanops.nankurt(data) + self.assertEqual(kurt, 0.0) + + def test_all_finite(self): + alpha, beta = 0.3, 0.1 + left_tailed = self.prng.beta(alpha, beta, size=100) + self.assertLess(nanops.nankurt(left_tailed), 0) + + alpha, beta = 0.1, 0.3 + right_tailed = self.prng.beta(alpha, beta, size=100) + self.assertGreater(nanops.nankurt(right_tailed), 0) + + def test_ground_truth(self): + kurt = nanops.nankurt(self.samples) + self.assertAlmostEqual(kurt, self.actual_kurt) + + def test_axis(self): + samples = np.vstack([self.samples, + np.nan * np.ones(len(self.samples))]) + kurt = nanops.nankurt(samples, axis=1) + tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan]) + + def test_nans(self): + samples = np.hstack([self.samples, np.nan]) + kurt = nanops.nankurt(samples, skipna=False) + self.assertTrue(np.isnan(kurt)) + + def test_nans_skipna(self): + samples = np.hstack([self.samples, np.nan]) + kurt = nanops.nankurt(samples, skipna=True) + tm.assert_almost_equal(kurt, self.actual_kurt) + + @property + def prng(self): + return np.random.RandomState(1234) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'