Skip to content

Commit 2d0d58c

Browse files
Joris Vankerschaver authored and jreback committed
BUG: Use stable algorithm for _nanvar, #10242
1 parent aa40dd0 commit 2d0d58c

File tree

5 files changed

+179
-36
lines changed

5 files changed

+179
-36
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -923,3 +923,5 @@ Bug Fixes
923923
- Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
924924
- Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
925925
- Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`)
926+
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
927+
- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)

pandas/core/nanops.py

+33-21
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
346346
return count, d
347347

348348

349-
def _nanvar(values, axis=None, skipna=True, ddof=1):
350-
# private nanvar calculator
349+
@disallow('M8')
350+
@bottleneck_switch(ddof=1)
351+
def nanstd(values, axis=None, skipna=True, ddof=1):
352+
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
353+
return _wrap_results(result, values.dtype)
354+
355+
356+
@disallow('M8')
357+
@bottleneck_switch(ddof=1)
358+
def nanvar(values, axis=None, skipna=True, ddof=1):
359+
360+
dtype = values.dtype
351361
mask = isnull(values)
352362
if is_any_int_dtype(values):
353363
values = values.astype('f8')
364+
values[mask] = np.nan
354365

355366
if is_float_dtype(values):
356367
count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -361,36 +372,37 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
361372
values = values.copy()
362373
np.putmask(values, mask, 0)
363374

364-
X = _ensure_numeric(values.sum(axis))
365-
XX = _ensure_numeric((values ** 2).sum(axis))
366-
result = np.fabs((XX - X * X / count) / d)
367-
return result
368-
369-
@disallow('M8')
370-
@bottleneck_switch(ddof=1)
371-
def nanstd(values, axis=None, skipna=True, ddof=1):
372375

373-
result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
376+
# xref GH10242
377+
# Compute variance via two-pass algorithm, which is stable against
378+
# cancellation errors and relatively accurate for small numbers of
379+
# observations.
380+
#
381+
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
382+
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
383+
if axis is not None:
384+
avg = np.expand_dims(avg, axis)
385+
sqr = _ensure_numeric((avg - values) ** 2)
386+
np.putmask(sqr, mask, 0)
387+
result = sqr.sum(axis=axis, dtype=np.float64) / d
388+
389+
# Return variance as np.float64 (the datatype used in the accumulator),
390+
# unless we were dealing with a float array, in which case use the same
391+
# precision as the original values array.
392+
if is_float_dtype(dtype):
393+
result = result.astype(dtype)
374394
return _wrap_results(result, values.dtype)
375395

376-
@disallow('M8','m8')
377-
@bottleneck_switch(ddof=1)
378-
def nanvar(values, axis=None, skipna=True, ddof=1):
379396

380-
# we are going to allow timedelta64[ns] here
381-
# but NOT going to coerce them to the Timedelta type
382-
# as this could cause overflow
383-
# so var cannot be computed (but std can!)
384-
return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
385-
386-
@disallow('M8','m8')
397+
@disallow('M8', 'm8')
387398
def nansem(values, axis=None, skipna=True, ddof=1):
388399
var = nanvar(values, axis, skipna, ddof=ddof)
389400

390401
mask = isnull(values)
391402
if not is_float_dtype(values.dtype):
392403
values = values.astype('f8')
393404
count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
405+
var = nanvar(values, axis, skipna, ddof=ddof)
394406

395407
return np.sqrt(var) / np.sqrt(count)
396408

pandas/tests/test_frame.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -12589,11 +12589,9 @@ def test_numeric_only_flag(self):
1258912589
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
1259012590
assert_series_equal(expected, result)
1259112591

12592-
assertRaisesRegexp(TypeError, 'float',
12593-
getattr(df1, meth), axis=1, numeric_only=False)
12594-
12595-
assertRaisesRegexp(TypeError, 'float',
12596-
getattr(df2, meth), axis=1, numeric_only=False)
12592+
# df1 has all numbers, df2 has a letter inside
12593+
self.assertRaises(TypeError, lambda : getattr(df1, meth)(axis=1, numeric_only=False))
12594+
self.assertRaises(ValueError, lambda : getattr(df2, meth)(axis=1, numeric_only=False))
1259712595

1259812596
def test_sem(self):
1259912597
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))

pandas/tests/test_nanops.py

+139-8
Original file line numberDiff line numberDiff line change
@@ -182,13 +182,13 @@ def check_fun_data(self, testfunc, targfunc,
182182
**kwargs)
183183
self.check_results(targ, res, axis)
184184
if skipna:
185-
res = testfunc(testarval, axis=axis)
185+
res = testfunc(testarval, axis=axis, **kwargs)
186186
self.check_results(targ, res, axis)
187187
if axis is None:
188-
res = testfunc(testarval, skipna=skipna)
188+
res = testfunc(testarval, skipna=skipna, **kwargs)
189189
self.check_results(targ, res, axis)
190190
if skipna and axis is None:
191-
res = testfunc(testarval)
191+
res = testfunc(testarval, **kwargs)
192192
self.check_results(targ, res, axis)
193193
except BaseException as exc:
194194
exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1),
@@ -291,12 +291,13 @@ def check_funs_ddof(self, testfunc, targfunc,
291291
allow_date=False, allow_tdelta=False, allow_obj=True,):
292292
for ddof in range(3):
293293
try:
294-
self.check_funs(self, testfunc, targfunc,
294+
self.check_funs(testfunc, targfunc,
295295
allow_complex, allow_all_nan, allow_str,
296296
allow_date, allow_tdelta, allow_obj,
297297
ddof=ddof)
298298
except BaseException as exc:
299299
exc.args += ('ddof %s' % ddof,)
300+
raise
300301

301302
def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
302303
if value.dtype.kind == 'O':
@@ -366,16 +367,29 @@ def test_nanmedian(self):
366367

367368
def test_nanvar(self):
368369
self.check_funs_ddof(nanops.nanvar, np.var,
369-
allow_complex=False, allow_date=False, allow_tdelta=False)
370+
allow_complex=False,
371+
allow_str=False,
372+
allow_date=False,
373+
allow_tdelta=True,
374+
allow_obj='convert')
370375

371376
def test_nanstd(self):
372377
self.check_funs_ddof(nanops.nanstd, np.std,
373-
allow_complex=False, allow_date=False, allow_tdelta=True)
378+
allow_complex=False,
379+
allow_str=False,
380+
allow_date=False,
381+
allow_tdelta=True,
382+
allow_obj='convert')
374383

375384
def test_nansem(self):
376385
tm.skip_if_no_package('scipy.stats')
377-
self.check_funs_ddof(nanops.nansem, np.var,
378-
allow_complex=False, allow_date=False, allow_tdelta=False)
386+
from scipy.stats import sem
387+
self.check_funs_ddof(nanops.nansem, sem,
388+
allow_complex=False,
389+
allow_str=False,
390+
allow_date=False,
391+
allow_tdelta=True,
392+
allow_obj='convert')
379393

380394
def _minmax_wrap(self, value, axis=None, func=None):
381395
res = func(value, axis)
@@ -817,6 +831,123 @@ def test_non_convertable_values(self):
817831
lambda: nanops._ensure_numeric([]))
818832

819833

834+
class TestNanvarFixedValues(tm.TestCase):
835+
836+
# xref GH10242
837+
838+
def setUp(self):
839+
# Samples from a normal distribution.
840+
self.variance = variance = 3.0
841+
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
842+
843+
def test_nanvar_all_finite(self):
844+
samples = self.samples
845+
actual_variance = nanops.nanvar(samples)
846+
np.testing.assert_almost_equal(
847+
actual_variance, self.variance, decimal=2)
848+
849+
def test_nanvar_nans(self):
850+
samples = np.nan * np.ones(2 * self.samples.shape[0])
851+
samples[::2] = self.samples
852+
853+
actual_variance = nanops.nanvar(samples, skipna=True)
854+
np.testing.assert_almost_equal(
855+
actual_variance, self.variance, decimal=2)
856+
857+
actual_variance = nanops.nanvar(samples, skipna=False)
858+
np.testing.assert_almost_equal(
859+
actual_variance, np.nan, decimal=2)
860+
861+
def test_nanstd_nans(self):
862+
samples = np.nan * np.ones(2 * self.samples.shape[0])
863+
samples[::2] = self.samples
864+
865+
actual_std = nanops.nanstd(samples, skipna=True)
866+
np.testing.assert_almost_equal(
867+
actual_std, self.variance ** 0.5, decimal=2)
868+
869+
actual_std = nanops.nanvar(samples, skipna=False)
870+
np.testing.assert_almost_equal(
871+
actual_std, np.nan, decimal=2)
872+
873+
def test_nanvar_axis(self):
874+
# Generate some sample data.
875+
samples_norm = self.samples
876+
samples_unif = self.prng.uniform(size=samples_norm.shape[0])
877+
samples = np.vstack([samples_norm, samples_unif])
878+
879+
actual_variance = nanops.nanvar(samples, axis=1)
880+
np.testing.assert_array_almost_equal(
881+
actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2)
882+
883+
def test_nanvar_ddof(self):
884+
n = 5
885+
samples = self.prng.uniform(size=(10000, n+1))
886+
samples[:, -1] = np.nan # Force use of our own algorithm.
887+
888+
variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
889+
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
890+
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
891+
892+
# The unbiased estimate.
893+
var = 1.0 / 12
894+
np.testing.assert_almost_equal(variance_1, var, decimal=2)
895+
# The underestimated variance.
896+
np.testing.assert_almost_equal(
897+
variance_0, (n - 1.0) / n * var, decimal=2)
898+
# The overestimated variance.
899+
np.testing.assert_almost_equal(
900+
variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2)
901+
902+
def test_ground_truth(self):
903+
# Test against values that were precomputed with Numpy.
904+
samples = np.empty((4, 4))
905+
samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287],
906+
[0.72980153, 0.03109364, 0.99155171],
907+
[0.09317602, 0.60078248, 0.15871292]])
908+
samples[3] = samples[:, 3] = np.nan
909+
910+
# Actual variances along axis=0, 1 for ddof=0, 1, 2
911+
variance = np.array(
912+
[[[0.13762259, 0.05619224, 0.11568816],
913+
[0.20643388, 0.08428837, 0.17353224],
914+
[0.41286776, 0.16857673, 0.34706449]],
915+
[[0.09519783, 0.16435395, 0.05082054],
916+
[0.14279674, 0.24653093, 0.07623082],
917+
[0.28559348, 0.49306186, 0.15246163]]]
918+
)
919+
920+
# Test nanvar.
921+
for axis in range(2):
922+
for ddof in range(3):
923+
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
924+
np.testing.assert_array_almost_equal(
925+
var[:3], variance[axis, ddof]
926+
)
927+
np.testing.assert_equal(var[3], np.nan)
928+
929+
# Test nanstd.
930+
for axis in range(2):
931+
for ddof in range(3):
932+
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
933+
np.testing.assert_array_almost_equal(
934+
std[:3], variance[axis, ddof] ** 0.5
935+
)
936+
np.testing.assert_equal(std[3], np.nan)
937+
938+
def test_nanstd_roundoff(self):
939+
# Regression test for GH 10242 (test data taken from GH 10489). Ensure
940+
# that variance is stable.
941+
data = Series(766897346 * np.ones(10))
942+
for ddof in range(3):
943+
result = data.std(ddof=ddof)
944+
self.assertEqual(result, 0.0)
945+
946+
@property
947+
def prng(self):
948+
return np.random.RandomState(1234)
949+
950+
820951
if __name__ == '__main__':
821952
import nose
822953
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',

pandas/tseries/tests/test_timedeltas.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -684,8 +684,8 @@ def test_timedelta_ops(self):
684684
self.assertEqual(result[0], expected)
685685

686686
# invalid ops
687-
for op in ['skew','kurt','sem','var','prod']:
688-
self.assertRaises(TypeError, lambda : getattr(td,op)())
687+
for op in ['skew','kurt','sem','prod']:
688+
self.assertRaises(TypeError, getattr(td,op))
689689

690690
# GH 10040
691691
# make sure NaT is properly handled by median()

0 commit comments

Comments (0)