BUG: Use stable algorithm for _nanvar. #10679


Closed: wants to merge 2 commits
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
@@ -923,3 +923,5 @@ Bug Fixes
- Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
- Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
- Bug in ``groupby`` causing incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (e.g. ``first``, ``last``, ``min``) (:issue:`10590`)
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)
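
For context, a minimal standalone sketch (not part of the diff) of the cancellation error behind :issue:`10242`. The repeated value mirrors the regression test added below, and the naive formula is essentially the one this PR removes (ignoring ddof):

import numpy as np

# Ten identical values: the true variance is exactly 0.
data = np.full(10, 766897346.0)
n = len(data)

# Naive one-pass formula, as in the removed _nanvar:
#   var = (sum(x**2) - sum(x)**2 / n) / n
# It subtracts two nearly equal quantities of order 1e18, so most
# significant digits cancel and only roundoff noise survives.
naive = (np.sum(data ** 2) - np.sum(data) ** 2 / n) / n

# Stable two-pass formula: center on the mean first, then sum the
# squared deviations; no catastrophic cancellation occurs.
stable = np.sum((data - data.mean()) ** 2) / n

print(naive)   # typically a small nonzero number (pure roundoff)
print(stable)  # 0.0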
54 changes: 32 additions & 22 deletions pandas/core/nanops.py
@@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
return count, d


def _nanvar(values, axis=None, skipna=True, ddof=1):
# private nanvar calculator
@disallow('M8')
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1):
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
return _wrap_results(result, values.dtype)


@disallow('M8')
@bottleneck_switch(ddof=1)
jreback (Contributor): why are you changing this? No real need to go to numpy at all. Either it's hit in bottleneck, or we have our own algo, which is modelled on nanvar.

Author: @jreback I think I misunderstood one of your earlier suggestions. To clarify, are you suggesting that we don't go to Numpy at all in nanvar, and just have our own implementation (which used to be _nanvar)? I do think that would simplify matters a bit.

jreback (Contributor): yep

Author: @jreback Done (and squashed)


def nanvar(values, axis=None, skipna=True, ddof=1):

dtype = values.dtype
mask = isnull(values)
if is_any_int_dtype(values):
values = values.astype('f8')
values[mask] = np.nan

if is_float_dtype(values):
count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -361,36 +372,35 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
values = values.copy()
np.putmask(values, mask, 0)

X = _ensure_numeric(values.sum(axis))
XX = _ensure_numeric((values ** 2).sum(axis))
result = np.fabs((XX - X * X / count) / d)
return result

@disallow('M8')
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1):

result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
# Compute variance via two-pass algorithm, which is stable against
# cancellation errors and relatively accurate for small numbers of
# observations.
#
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
if axis is not None:
avg = np.expand_dims(avg, axis)
sqr = _ensure_numeric((avg - values) ** 2)
np.putmask(sqr, mask, 0)
result = sqr.sum(axis=axis, dtype=np.float64) / d

# Return variance as np.float64 (the datatype used in the accumulator),
# unless we were dealing with a float array, in which case use the same
# precision as the original values array.
if is_float_dtype(dtype):
result = result.astype(dtype)
return _wrap_results(result, values.dtype)

@disallow('M8','m8')
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1):

# we are going to allow timedelta64[ns] here
# but NOT going to coerce them to the Timedelta type
# as this could cause overflow
# so var cannot be computed (but std can!)
return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof)

@disallow('M8','m8')
@disallow('M8', 'm8')
def nansem(values, axis=None, skipna=True, ddof=1):
var = nanvar(values, axis, skipna, ddof=ddof)

mask = isnull(values)
if not is_float_dtype(values.dtype):
values = values.astype('f8')
count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
var = nanvar(values, axis, skipna, ddof=ddof)

return np.sqrt(var) / np.sqrt(count)
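
A minimal sketch (toy data, illustrative only) of why the new nanvar reinserts the reduced axis before centering:

import numpy as np

values = np.arange(12, dtype=np.float64).reshape(3, 4)
axis = 1

avg = values.sum(axis=axis) / values.shape[axis]  # shape (3,)
# Summing along axis=1 drops that axis, so (avg - values) would fail
# with a shape mismatch; expand_dims restores avg to shape (3, 1) so
# each row is centered by its own mean.
avg = np.expand_dims(avg, axis)
sqr = (avg - values) ** 2
var = sqr.sum(axis=axis) / (values.shape[axis] - 1)  # ddof=1
print(var)  # [1.66666667 1.66666667 1.66666667]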

10 changes: 5 additions & 5 deletions pandas/tests/test_frame.py
@@ -12589,11 +12589,11 @@ def test_numeric_only_flag(self):
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
assert_series_equal(expected, result)

assertRaisesRegexp(TypeError, 'float',
getattr(df1, meth), axis=1, numeric_only=False)

assertRaisesRegexp(TypeError, 'float',
getattr(df2, meth), axis=1, numeric_only=False)
try:
getattr(df1, meth)(axis=1, numeric_only=False)
getattr(df2, meth)(axis=1, numeric_only=False)
except (TypeError, ValueError) as e:
self.assertIn('float', str(e))

def test_sem(self):
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
145 changes: 137 additions & 8 deletions pandas/tests/test_nanops.py
@@ -182,13 +182,13 @@ def check_fun_data(self, testfunc, targfunc,
**kwargs)
self.check_results(targ, res, axis)
if skipna:
res = testfunc(testarval, axis=axis)
res = testfunc(testarval, axis=axis, **kwargs)
self.check_results(targ, res, axis)
if axis is None:
res = testfunc(testarval, skipna=skipna)
res = testfunc(testarval, skipna=skipna, **kwargs)
self.check_results(targ, res, axis)
if skipna and axis is None:
res = testfunc(testarval)
res = testfunc(testarval, **kwargs)
self.check_results(targ, res, axis)
except BaseException as exc:
exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1),
@@ -291,12 +291,13 @@ def check_funs_ddof(self, testfunc, targfunc,
allow_date=False, allow_tdelta=False, allow_obj=True,):
for ddof in range(3):
try:
self.check_funs(self, testfunc, targfunc,
self.check_funs(testfunc, targfunc,
allow_complex, allow_all_nan, allow_str,
allow_date, allow_tdelta, allow_obj,
ddof=ddof)
except BaseException as exc:
exc.args += ('ddof %s' % ddof,)
raise

def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
if value.dtype.kind == 'O':
@@ -366,16 +367,29 @@ def test_nanmedian(self):

def test_nanvar(self):
self.check_funs_ddof(nanops.nanvar, np.var,
allow_complex=False, allow_date=False, allow_tdelta=False)
allow_complex=False,
allow_str=False,
allow_date=False,
allow_tdelta=True,
allow_obj='convert')

def test_nanstd(self):
self.check_funs_ddof(nanops.nanstd, np.std,
allow_complex=False, allow_date=False, allow_tdelta=True)
allow_complex=False,
allow_str=False,
allow_date=False,
allow_tdelta=True,
allow_obj='convert')

def test_nansem(self):
tm.skip_if_no_package('scipy.stats')
self.check_funs_ddof(nanops.nansem, np.var,
allow_complex=False, allow_date=False, allow_tdelta=False)
from scipy.stats import sem
self.check_funs_ddof(nanops.nansem, sem,
allow_complex=False,
allow_str=False,
allow_date=False,
allow_tdelta=True,
allow_obj='convert')

def _minmax_wrap(self, value, axis=None, func=None):
res = func(value, axis)
@@ -817,6 +831,121 @@ def test_non_convertable_values(self):
lambda: nanops._ensure_numeric([]))


class TestNanvarFixedValues(tm.TestCase):

def setUp(self):
# Samples from a normal distribution.
self.variance = variance = 3.0
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)

def test_nanvar_all_finite(self):
samples = self.samples
actual_variance = nanops.nanvar(samples)
np.testing.assert_almost_equal(
    actual_variance, self.variance, decimal=2)

jreback (Contributor): use tm.assert_almost_equal or tm.assert_numpy_array_equal as appropriate

Author: I resorted to using the Numpy variants because I needed to set the decimal precision to something low; is there a Pandas version that also allows the precision to be adjusted?

jreback (Contributor): I think we have a less_precise option where you can set the decimal places to compare (or it defaults to 3, I think)

Author: IIUC the number of decimal places is fixed to 5 or 3 (depending on whether check_less_precise is True), and I really need 2 ;-) I'll see if I can work around this.

def test_nanvar_nans(self):
samples = np.nan * np.ones(2 * self.samples.shape[0])
samples[::2] = self.samples

actual_variance = nanops.nanvar(samples, skipna=True)
jreback (Contributor): this should fail on numpy 1.7 because nanvar doesn't exist, and yet it doesn't. The travis 2.7 builds are all with 1.7.

Author: This is using Pandas' nanvar ;-)

jreback (Contributor): oh right, ok

np.testing.assert_almost_equal(
actual_variance, self.variance, decimal=2)

actual_variance = nanops.nanvar(samples, skipna=False)
np.testing.assert_almost_equal(
actual_variance, np.nan, decimal=2)

jreback (Contributor): test with nanstd as well for all-NaN

def test_nanstd_nans(self):
samples = np.nan * np.ones(2 * self.samples.shape[0])
samples[::2] = self.samples

actual_std = nanops.nanstd(samples, skipna=True)
np.testing.assert_almost_equal(
actual_std, self.variance ** 0.5, decimal=2)

actual_std = nanops.nanstd(samples, skipna=False)
np.testing.assert_almost_equal(
actual_std, np.nan, decimal=2)

def test_nanvar_axis(self):
# Generate some sample data.
samples_norm = self.samples
samples_unif = self.prng.uniform(size=samples_norm.shape[0])
samples = np.vstack([samples_norm, samples_unif])

actual_variance = nanops.nanvar(samples, axis=1)
np.testing.assert_array_almost_equal(
actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2)

def test_nanvar_ddof(self):
n = 5
samples = self.prng.uniform(size=(10000, n+1))
samples[:, -1] = np.nan # Force use of our own algorithm.

variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()

# The unbiased estimate.
var = 1.0 / 12
np.testing.assert_almost_equal(variance_1, var, decimal=2)
# The underestimated variance.
np.testing.assert_almost_equal(
variance_0, (n - 1.0) / n * var, decimal=2)
# The overestimated variance.
np.testing.assert_almost_equal(
variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2)
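
For reference, the three assertions above follow from the standard identity E[sum_i (x_i - xbar)^2] = (n - 1) * sigma^2, so

E\left[\hat{\sigma}^2_{\mathrm{ddof}}\right] = \frac{n - 1}{n - \mathrm{ddof}}\,\sigma^2

With n = 5 observations per row, ddof=1 is unbiased, ddof=0 yields (4/5) of the true variance, and ddof=2 yields (4/3) of it, matching the factors asserted above.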

def test_ground_truth(self):
# Test against values that were precomputed with Numpy.
samples = np.empty((4, 4))
samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287],
[0.72980153, 0.03109364, 0.99155171],
[0.09317602, 0.60078248, 0.15871292]])
samples[3] = samples[:, 3] = np.nan

# Actual variances along axis=0, 1 for ddof=0, 1, 2
variance = np.array(
[[[0.13762259, 0.05619224, 0.11568816],
[0.20643388, 0.08428837, 0.17353224],
[0.41286776, 0.16857673, 0.34706449]],
[[0.09519783, 0.16435395, 0.05082054],
[0.14279674, 0.24653093, 0.07623082],
[0.28559348, 0.49306186, 0.15246163]]]
)

# Test nanvar.
for axis in range(2):
for ddof in range(3):
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
np.testing.assert_array_almost_equal(
var[:3], variance[axis, ddof]
)
np.testing.assert_equal(var[3], np.nan)

# Test nanstd.
for axis in range(2):
for ddof in range(3):
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
np.testing.assert_array_almost_equal(
std[:3], variance[axis, ddof] ** 0.5
)
np.testing.assert_equal(std[3], np.nan)

def test_nanstd_roundoff(self):
# Regression test for GH 10242 (test data taken from GH 10489). Ensure
# that variance is stable.
data = Series(766897346 * np.ones(10))
for ddof in range(3):
result = data.std(ddof=ddof)
self.assertEqual(result, 0.0)

@property
def prng(self):
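# A fresh RandomState with a fixed seed on each access, so every test
# method sees the same reproducible sample draws.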
return np.random.RandomState(1234)


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
4 changes: 2 additions & 2 deletions pandas/tseries/tests/test_timedeltas.py
@@ -684,8 +684,8 @@ def test_timedelta_ops(self):
self.assertEqual(result[0], expected)

# invalid ops
for op in ['skew','kurt','sem','var','prod']:
self.assertRaises(TypeError, lambda : getattr(td,op)())
for op in ['skew','kurt','sem','prod']:
self.assertRaises(TypeError, getattr(td,op))

# GH 10040
# make sure NaT is properly handled by median()
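
Finally, a hedged sketch of what the test change above implies. Only the fact that ``var`` no longer raises is confirmed by the updated invalid-ops list; the exact return value is not:

import numpy as np
import pandas as pd

td = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))

# Before this PR, td.var() raised TypeError alongside skew/kurt/sem/prod.
# The updated test drops 'var' from the invalid-ops list, so this call
# is expected to succeed (std was already allowed).
result = td.var()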