-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Use stable algorithm for _nanvar. #10679
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -182,13 +182,13 @@ def check_fun_data(self, testfunc, targfunc, | |
**kwargs) | ||
self.check_results(targ, res, axis) | ||
if skipna: | ||
res = testfunc(testarval, axis=axis) | ||
res = testfunc(testarval, axis=axis, **kwargs) | ||
self.check_results(targ, res, axis) | ||
if axis is None: | ||
res = testfunc(testarval, skipna=skipna) | ||
res = testfunc(testarval, skipna=skipna, **kwargs) | ||
self.check_results(targ, res, axis) | ||
if skipna and axis is None: | ||
res = testfunc(testarval) | ||
res = testfunc(testarval, **kwargs) | ||
self.check_results(targ, res, axis) | ||
except BaseException as exc: | ||
exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1), | ||
|
@@ -291,12 +291,13 @@ def check_funs_ddof(self, testfunc, targfunc, | |
allow_date=False, allow_tdelta=False, allow_obj=True,): | ||
for ddof in range(3): | ||
try: | ||
self.check_funs(self, testfunc, targfunc, | ||
self.check_funs(testfunc, targfunc, | ||
allow_complex, allow_all_nan, allow_str, | ||
allow_date, allow_tdelta, allow_obj, | ||
ddof=ddof) | ||
except BaseException as exc: | ||
exc.args += ('ddof %s' % ddof,) | ||
raise | ||
|
||
def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): | ||
if value.dtype.kind == 'O': | ||
|
@@ -366,16 +367,29 @@ def test_nanmedian(self): | |
|
||
def test_nanvar(self): | ||
self.check_funs_ddof(nanops.nanvar, np.var, | ||
allow_complex=False, allow_date=False, allow_tdelta=False) | ||
allow_complex=False, | ||
allow_str=False, | ||
allow_date=False, | ||
allow_tdelta=True, | ||
allow_obj='convert') | ||
|
||
def test_nanstd(self): | ||
self.check_funs_ddof(nanops.nanstd, np.std, | ||
allow_complex=False, allow_date=False, allow_tdelta=True) | ||
allow_complex=False, | ||
allow_str=False, | ||
allow_date=False, | ||
allow_tdelta=True, | ||
allow_obj='convert') | ||
|
||
def test_nansem(self): | ||
tm.skip_if_no_package('scipy.stats') | ||
self.check_funs_ddof(nanops.nansem, np.var, | ||
allow_complex=False, allow_date=False, allow_tdelta=False) | ||
from scipy.stats import sem | ||
self.check_funs_ddof(nanops.nansem, sem, | ||
allow_complex=False, | ||
allow_str=False, | ||
allow_date=False, | ||
allow_tdelta=True, | ||
allow_obj='convert') | ||
|
||
def _minmax_wrap(self, value, axis=None, func=None): | ||
res = func(value, axis) | ||
|
@@ -817,6 +831,121 @@ def test_non_convertable_values(self): | |
lambda: nanops._ensure_numeric([])) | ||
|
||
|
||
class TestNanvarFixedValues(tm.TestCase): | ||
|
||
def setUp(self): | ||
# Samples from a normal distribution. | ||
self.variance = variance = 3.0 | ||
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000) | ||
|
||
def test_nanvar_all_finite(self): | ||
samples = self.samples | ||
actual_variance = nanops.nanvar(samples) | ||
np.testing.assert_almost_equal( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. use
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I resorted to using the Numpy variants because I needed to set the decimal precision to something low; is there a Pandas version that also allows the precision to be adjusted?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we have a less_precise option where you can set the decimal places to compare (or defaults to 3, I think).
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IIUC the number of decimal places is fixed to 5 or 3 (depending on whether |
||
actual_variance, self.variance, decimal=2) | ||
|
||
def test_nanvar_nans(self): | ||
samples = np.nan * np.ones(2 * self.samples.shape[0]) | ||
samples[::2] = self.samples | ||
|
||
actual_variance = nanops.nanvar(samples, skipna=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this should fail on numpy 1.7 because
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is using Pandas'
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. oh right ok |
||
np.testing.assert_almost_equal( | ||
actual_variance, self.variance, decimal=2) | ||
|
||
actual_variance = nanops.nanvar(samples, skipna=False) | ||
np.testing.assert_almost_equal( | ||
actual_variance, np.nan, decimal=2) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. test with |
||
def test_nanstd_nans(self): | ||
samples = np.nan * np.ones(2 * self.samples.shape[0]) | ||
samples[::2] = self.samples | ||
|
||
actual_std = nanops.nanstd(samples, skipna=True) | ||
np.testing.assert_almost_equal( | ||
actual_std, self.variance ** 0.5, decimal=2) | ||
|
||
actual_std = nanops.nanvar(samples, skipna=False) | ||
np.testing.assert_almost_equal( | ||
actual_std, np.nan, decimal=2) | ||
|
||
def test_nanvar_axis(self): | ||
# Generate some sample data. | ||
samples_norm = self.samples | ||
samples_unif = self.prng.uniform(size=samples_norm.shape[0]) | ||
samples = np.vstack([samples_norm, samples_unif]) | ||
|
||
actual_variance = nanops.nanvar(samples, axis=1) | ||
np.testing.assert_array_almost_equal( | ||
actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2) | ||
|
||
def test_nanvar_ddof(self): | ||
n = 5 | ||
samples = self.prng.uniform(size=(10000, n+1)) | ||
samples[:, -1] = np.nan # Force use of our own algorithm. | ||
|
||
variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean() | ||
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean() | ||
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean() | ||
|
||
# The unbiased estimate. | ||
var = 1.0 / 12 | ||
np.testing.assert_almost_equal(variance_1, var, decimal=2) | ||
# The underestimated variance. | ||
np.testing.assert_almost_equal( | ||
variance_0, (n - 1.0) / n * var, decimal=2) | ||
# The overestimated variance. | ||
np.testing.assert_almost_equal( | ||
variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2) | ||
|
||
def test_ground_truth(self): | ||
# Test against values that were precomputed with Numpy. | ||
samples = np.empty((4, 4)) | ||
samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287], | ||
[0.72980153, 0.03109364, 0.99155171], | ||
[0.09317602, 0.60078248, 0.15871292]]) | ||
samples[3] = samples[:, 3] = np.nan | ||
|
||
# Actual variances along axis=0, 1 for ddof=0, 1, 2 | ||
variance = np.array( | ||
[[[0.13762259, 0.05619224, 0.11568816], | ||
[0.20643388, 0.08428837, 0.17353224], | ||
[0.41286776, 0.16857673, 0.34706449]], | ||
[[0.09519783, 0.16435395, 0.05082054], | ||
[0.14279674, 0.24653093, 0.07623082], | ||
[0.28559348, 0.49306186, 0.15246163]]] | ||
) | ||
|
||
# Test nanvar. | ||
for axis in range(2): | ||
for ddof in range(3): | ||
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) | ||
np.testing.assert_array_almost_equal( | ||
var[:3], variance[axis, ddof] | ||
) | ||
np.testing.assert_equal(var[3], np.nan) | ||
|
||
# Test nanstd. | ||
for axis in range(2): | ||
for ddof in range(3): | ||
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) | ||
np.testing.assert_array_almost_equal( | ||
std[:3], variance[axis, ddof] ** 0.5 | ||
) | ||
np.testing.assert_equal(std[3], np.nan) | ||
|
||
def test_nanstd_roundoff(self): | ||
# Regression test for GH 10242 (test data taken from GH 10489). Ensure | ||
# that variance is stable. | ||
data = Series(766897346 * np.ones(10)) | ||
for ddof in range(3): | ||
result = data.std(ddof=ddof) | ||
self.assertEqual(result, 0.0) | ||
|
||
@property | ||
def prng(self): | ||
return np.random.RandomState(1234) | ||
|
||
|
||
if __name__ == '__main__': | ||
import nose | ||
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why are you changing this? No real need to go to numpy at all. Either it's hit in bottleneck, or we have our own algo which is modelled on `nanvar`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback I think I misunderstood one of your earlier suggestions. To clarify, are you suggesting that we don't go to Numpy at all in `nanvar`, and just have our own implementation (which used to be `_nanvar`)? I do think that would simplify matters a bit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback Done (and squashed)