Skip to content

Commit 5c942ea

Browse files
Joris VankerschaverNick Eubank
Joris Vankerschaver
authored and
Nick Eubank
committed
BUG: Use stable algorithm for _nanvar, pandas-dev#10242
1 parent 1afc1ab commit 5c942ea

File tree

5 files changed

+179
-36
lines changed

5 files changed

+179
-36
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -943,3 +943,5 @@ Bug Fixes
943943
- Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
944944
- Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
945945
- Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`)
946+
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
947+
- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)

pandas/core/nanops.py

+33-21
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
346346
return count, d
347347

348348

349-
def _nanvar(values, axis=None, skipna=True, ddof=1):
350-
# private nanvar calculator
349+
@disallow('M8')
350+
@bottleneck_switch(ddof=1)
351+
def nanstd(values, axis=None, skipna=True, ddof=1):
352+
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
353+
return _wrap_results(result, values.dtype)
354+
355+
356+
@disallow('M8')
357+
@bottleneck_switch(ddof=1)
358+
def nanvar(values, axis=None, skipna=True, ddof=1):
359+
360+
dtype = values.dtype
351361
mask = isnull(values)
352362
if is_any_int_dtype(values):
353363
values = values.astype('f8')
364+
values[mask] = np.nan
354365

355366
if is_float_dtype(values):
356367
count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -361,36 +372,37 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
361372
values = values.copy()
362373
np.putmask(values, mask, 0)
363374

364-
X = _ensure_numeric(values.sum(axis))
365-
XX = _ensure_numeric((values ** 2).sum(axis))
366-
result = np.fabs((XX - X * X / count) / d)
367-
return result
368-
369-
@disallow('M8')
370-
@bottleneck_switch(ddof=1)
371-
def nanstd(values, axis=None, skipna=True, ddof=1):
372375

373-
result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
376+
# xref GH10242
377+
# Compute variance via two-pass algorithm, which is stable against
378+
# cancellation errors and relatively accurate for small numbers of
379+
# observations.
380+
#
381+
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
382+
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
383+
if axis is not None:
384+
avg = np.expand_dims(avg, axis)
385+
sqr = _ensure_numeric((avg - values) ** 2)
386+
np.putmask(sqr, mask, 0)
387+
result = sqr.sum(axis=axis, dtype=np.float64) / d
388+
389+
# Return variance as np.float64 (the datatype used in the accumulator),
390+
# unless we were dealing with a float array, in which case use the same
391+
# precision as the original values array.
392+
if is_float_dtype(dtype):
393+
result = result.astype(dtype)
374394
return _wrap_results(result, values.dtype)
375395

376-
@disallow('M8','m8')
377-
@bottleneck_switch(ddof=1)
378-
def nanvar(values, axis=None, skipna=True, ddof=1):
379396

380-
# we are going to allow timedelta64[ns] here
381-
# but NOT going to coerce them to the Timedelta type
382-
# as this could cause overflow
383-
# so var cannot be computed (but std can!)
384-
return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
385-
386-
@disallow('M8','m8')
397+
@disallow('M8', 'm8')
387398
def nansem(values, axis=None, skipna=True, ddof=1):
388399
var = nanvar(values, axis, skipna, ddof=ddof)
389400

390401
mask = isnull(values)
391402
if not is_float_dtype(values.dtype):
392403
values = values.astype('f8')
393404
count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
405+
var = nanvar(values, axis, skipna, ddof=ddof)
394406

395407
return np.sqrt(var) / np.sqrt(count)
396408

pandas/tests/test_frame.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -12581,11 +12581,9 @@ def test_numeric_only_flag(self):
1258112581
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
1258212582
assert_series_equal(expected, result)
1258312583

12584-
assertRaisesRegexp(TypeError, 'float',
12585-
getattr(df1, meth), axis=1, numeric_only=False)
12586-
12587-
assertRaisesRegexp(TypeError, 'float',
12588-
getattr(df2, meth), axis=1, numeric_only=False)
12584+
# df1 has all numbers, df2 has a letter inside
12585+
self.assertRaises(TypeError, lambda : getattr(df1, meth)(axis=1, numeric_only=False))
12586+
self.assertRaises(ValueError, lambda : getattr(df2, meth)(axis=1, numeric_only=False))
1258912587

1259012588
def test_sem(self):
1259112589
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))

pandas/tests/test_nanops.py

+139-8
Original file line numberDiff line numberDiff line change
@@ -186,13 +186,13 @@ def check_fun_data(self, testfunc, targfunc,
186186
**kwargs)
187187
self.check_results(targ, res, axis)
188188
if skipna:
189-
res = testfunc(testarval, axis=axis)
189+
res = testfunc(testarval, axis=axis, **kwargs)
190190
self.check_results(targ, res, axis)
191191
if axis is None:
192-
res = testfunc(testarval, skipna=skipna)
192+
res = testfunc(testarval, skipna=skipna, **kwargs)
193193
self.check_results(targ, res, axis)
194194
if skipna and axis is None:
195-
res = testfunc(testarval)
195+
res = testfunc(testarval, **kwargs)
196196
self.check_results(targ, res, axis)
197197
except BaseException as exc:
198198
exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1),
@@ -295,12 +295,13 @@ def check_funs_ddof(self, testfunc, targfunc,
295295
allow_date=False, allow_tdelta=False, allow_obj=True,):
296296
for ddof in range(3):
297297
try:
298-
self.check_funs(self, testfunc, targfunc,
298+
self.check_funs(testfunc, targfunc,
299299
allow_complex, allow_all_nan, allow_str,
300300
allow_date, allow_tdelta, allow_obj,
301301
ddof=ddof)
302302
except BaseException as exc:
303303
exc.args += ('ddof %s' % ddof,)
304+
raise
304305

305306
def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
306307
if value.dtype.kind == 'O':
@@ -370,16 +371,29 @@ def test_nanmedian(self):
370371

371372
def test_nanvar(self):
372373
self.check_funs_ddof(nanops.nanvar, np.var,
373-
allow_complex=False, allow_date=False, allow_tdelta=False)
374+
allow_complex=False,
375+
allow_str=False,
376+
allow_date=False,
377+
allow_tdelta=True,
378+
allow_obj='convert')
374379

375380
def test_nanstd(self):
376381
self.check_funs_ddof(nanops.nanstd, np.std,
377-
allow_complex=False, allow_date=False, allow_tdelta=True)
382+
allow_complex=False,
383+
allow_str=False,
384+
allow_date=False,
385+
allow_tdelta=True,
386+
allow_obj='convert')
378387

379388
def test_nansem(self):
380389
tm.skip_if_no_package('scipy.stats')
381-
self.check_funs_ddof(nanops.nansem, np.var,
382-
allow_complex=False, allow_date=False, allow_tdelta=False)
390+
from scipy.stats import sem
391+
self.check_funs_ddof(nanops.nansem, sem,
392+
allow_complex=False,
393+
allow_str=False,
394+
allow_date=False,
395+
allow_tdelta=True,
396+
allow_obj='convert')
383397

384398
def _minmax_wrap(self, value, axis=None, func=None):
385399
res = func(value, axis)
@@ -821,6 +835,123 @@ def test_non_convertable_values(self):
821835
lambda: nanops._ensure_numeric([]))
822836

823837

838+
class TestNanvarFixedValues(tm.TestCase):
839+
840+
# xref GH10242
841+
842+
def setUp(self):
843+
# Samples from a normal distribution.
844+
self.variance = variance = 3.0
845+
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
846+
847+
def test_nanvar_all_finite(self):
848+
samples = self.samples
849+
actual_variance = nanops.nanvar(samples)
850+
np.testing.assert_almost_equal(
851+
actual_variance, self.variance, decimal=2)
852+
853+
def test_nanvar_nans(self):
854+
samples = np.nan * np.ones(2 * self.samples.shape[0])
855+
samples[::2] = self.samples
856+
857+
actual_variance = nanops.nanvar(samples, skipna=True)
858+
np.testing.assert_almost_equal(
859+
actual_variance, self.variance, decimal=2)
860+
861+
actual_variance = nanops.nanvar(samples, skipna=False)
862+
np.testing.assert_almost_equal(
863+
actual_variance, np.nan, decimal=2)
864+
865+
def test_nanstd_nans(self):
866+
samples = np.nan * np.ones(2 * self.samples.shape[0])
867+
samples[::2] = self.samples
868+
869+
actual_std = nanops.nanstd(samples, skipna=True)
870+
np.testing.assert_almost_equal(
871+
actual_std, self.variance ** 0.5, decimal=2)
872+
873+
actual_std = nanops.nanvar(samples, skipna=False)
874+
np.testing.assert_almost_equal(
875+
actual_std, np.nan, decimal=2)
876+
877+
def test_nanvar_axis(self):
878+
# Generate some sample data.
879+
samples_norm = self.samples
880+
samples_unif = self.prng.uniform(size=samples_norm.shape[0])
881+
samples = np.vstack([samples_norm, samples_unif])
882+
883+
actual_variance = nanops.nanvar(samples, axis=1)
884+
np.testing.assert_array_almost_equal(
885+
actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2)
886+
887+
def test_nanvar_ddof(self):
888+
n = 5
889+
samples = self.prng.uniform(size=(10000, n+1))
890+
samples[:, -1] = np.nan # Force use of our own algorithm.
891+
892+
variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
893+
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
894+
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
895+
896+
# The unbiased estimate.
897+
var = 1.0 / 12
898+
np.testing.assert_almost_equal(variance_1, var, decimal=2)
899+
# The underestimated variance.
900+
np.testing.assert_almost_equal(
901+
variance_0, (n - 1.0) / n * var, decimal=2)
902+
# The overestimated variance.
903+
np.testing.assert_almost_equal(
904+
variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2)
905+
906+
def test_ground_truth(self):
907+
# Test against values that were precomputed with Numpy.
908+
samples = np.empty((4, 4))
909+
samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287],
910+
[0.72980153, 0.03109364, 0.99155171],
911+
[0.09317602, 0.60078248, 0.15871292]])
912+
samples[3] = samples[:, 3] = np.nan
913+
914+
# Actual variances along axis=0, 1 for ddof=0, 1, 2
915+
variance = np.array(
916+
[[[0.13762259, 0.05619224, 0.11568816],
917+
[0.20643388, 0.08428837, 0.17353224],
918+
[0.41286776, 0.16857673, 0.34706449]],
919+
[[0.09519783, 0.16435395, 0.05082054],
920+
[0.14279674, 0.24653093, 0.07623082],
921+
[0.28559348, 0.49306186, 0.15246163]]]
922+
)
923+
924+
# Test nanvar.
925+
for axis in range(2):
926+
for ddof in range(3):
927+
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
928+
np.testing.assert_array_almost_equal(
929+
var[:3], variance[axis, ddof]
930+
)
931+
np.testing.assert_equal(var[3], np.nan)
932+
933+
# Test nanstd.
934+
for axis in range(2):
935+
for ddof in range(3):
936+
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
937+
np.testing.assert_array_almost_equal(
938+
std[:3], variance[axis, ddof] ** 0.5
939+
)
940+
np.testing.assert_equal(std[3], np.nan)
941+
942+
def test_nanstd_roundoff(self):
943+
# Regression test for GH 10242 (test data taken from GH 10489). Ensure
944+
# that variance is stable.
945+
data = Series(766897346 * np.ones(10))
946+
for ddof in range(3):
947+
result = data.std(ddof=ddof)
948+
self.assertEqual(result, 0.0)
949+
950+
@property
951+
def prng(self):
952+
return np.random.RandomState(1234)
953+
954+
824955
if __name__ == '__main__':
825956
import nose
826957
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',

pandas/tseries/tests/test_timedeltas.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -684,8 +684,8 @@ def test_timedelta_ops(self):
684684
self.assertEqual(result[0], expected)
685685

686686
# invalid ops
687-
for op in ['skew','kurt','sem','var','prod']:
688-
self.assertRaises(TypeError, lambda : getattr(td,op)())
687+
for op in ['skew','kurt','sem','prod']:
688+
self.assertRaises(TypeError, getattr(td,op))
689689

690690
# GH 10040
691691
# make sure NaT is properly handled by median()

0 commit comments

Comments
 (0)