BUG: Use stable algorithm for _nanvar, pandas-dev#10242

Joris Vankerschaver · Nick Eubank · commit 5c942ea45089 · 2015-09-29T12:04:42.000-07:00
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -943,3 +943,5 @@ Bug Fixes
 - Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
 - Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
 - Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`)
+- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
+- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
     return count, d
 
 
-def _nanvar(values, axis=None, skipna=True, ddof=1):
-    # private nanvar calculator
+@disallow('M8')
+@bottleneck_switch(ddof=1)
+def nanstd(values, axis=None, skipna=True, ddof=1):
+    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
+    return _wrap_results(result, values.dtype)
+
+
+@disallow('M8')
+@bottleneck_switch(ddof=1)
+def nanvar(values, axis=None, skipna=True, ddof=1):
+
+    dtype = values.dtype
     mask = isnull(values)
     if is_any_int_dtype(values):
         values = values.astype('f8')
+        values[mask] = np.nan
 
     if is_float_dtype(values):
         count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -361,36 +372,37 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
         values = values.copy()
         np.putmask(values, mask, 0)
 
-    X = _ensure_numeric(values.sum(axis))
-    XX = _ensure_numeric((values ** 2).sum(axis))
-    result = np.fabs((XX - X * X / count) / d)
-    return result
-
-@disallow('M8')
-@bottleneck_switch(ddof=1)
-def nanstd(values, axis=None, skipna=True, ddof=1):
 
-    result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
+    # xref GH10242
+    # Compute variance via two-pass algorithm, which is stable against
+    # cancellation errors and relatively accurate for small numbers of
+    # observations.
+    #
+    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
+    if axis is not None:
+        avg = np.expand_dims(avg, axis)
+    sqr = _ensure_numeric((avg - values) ** 2)
+    np.putmask(sqr, mask, 0)
+    result = sqr.sum(axis=axis, dtype=np.float64) / d
+
+    # Return variance as np.float64 (the datatype used in the accumulator),
+    # unless we were dealing with a float array, in which case use the same
+    # precision as the original values array.
+    if is_float_dtype(dtype):
+        result = result.astype(dtype)
     return _wrap_results(result, values.dtype)
 
-@disallow('M8','m8')
-@bottleneck_switch(ddof=1)
-def nanvar(values, axis=None, skipna=True, ddof=1):
 
-    # we are going to allow timedelta64[ns] here
-    # but NOT going to coerce them to the Timedelta type
-    # as this could cause overflow
-    # so var cannot be computed (but std can!)
-    return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
-
-@disallow('M8','m8')
+@disallow('M8', 'm8')
 def nansem(values, axis=None, skipna=True, ddof=1):
     var = nanvar(values, axis, skipna, ddof=ddof)
 
     mask = isnull(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
     count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
+    var = nanvar(values, axis, skipna, ddof=ddof)
 
     return np.sqrt(var) / np.sqrt(count)
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -12581,11 +12581,9 @@ def test_numeric_only_flag(self):
             expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
             assert_series_equal(expected, result)
 
-            assertRaisesRegexp(TypeError, 'float',
-                               getattr(df1, meth), axis=1, numeric_only=False)
-
-            assertRaisesRegexp(TypeError, 'float',
-                               getattr(df2, meth), axis=1, numeric_only=False)
+            # df1 has all numbers, df2 has a letter inside
+            self.assertRaises(TypeError, lambda : getattr(df1, meth)(axis=1, numeric_only=False))
+            self.assertRaises(ValueError, lambda : getattr(df2, meth)(axis=1, numeric_only=False))
 
     def test_sem(self):
         alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -186,13 +186,13 @@ def check_fun_data(self, testfunc, targfunc,
                                    **kwargs)
                     self.check_results(targ, res, axis)
                     if skipna:
-                        res = testfunc(testarval, axis=axis)
+                        res = testfunc(testarval, axis=axis, **kwargs)
                         self.check_results(targ, res, axis)
                     if axis is None:
-                        res = testfunc(testarval, skipna=skipna)
+                        res = testfunc(testarval, skipna=skipna, **kwargs)
                         self.check_results(targ, res, axis)
                     if skipna and axis is None:
-                        res = testfunc(testarval)
+                        res = testfunc(testarval, **kwargs)
                         self.check_results(targ, res, axis)
                 except BaseException as exc:
                     exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1),
@@ -295,12 +295,13 @@ def check_funs_ddof(self, testfunc, targfunc,
                         allow_date=False, allow_tdelta=False, allow_obj=True,):
         for ddof in range(3):
             try:
-                self.check_funs(self, testfunc, targfunc,
+                self.check_funs(testfunc, targfunc,
                                 allow_complex, allow_all_nan, allow_str,
                                 allow_date, allow_tdelta, allow_obj,
                                 ddof=ddof)
             except BaseException as exc:
                 exc.args += ('ddof %s' % ddof,)
+                raise
 
     def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
         if value.dtype.kind == 'O':
@@ -370,16 +371,29 @@ def test_nanmedian(self):
 
     def test_nanvar(self):
         self.check_funs_ddof(nanops.nanvar, np.var,
-                             allow_complex=False, allow_date=False, allow_tdelta=False)
+                             allow_complex=False,
+                             allow_str=False,
+                             allow_date=False,
+                             allow_tdelta=True,
+                             allow_obj='convert')
 
     def test_nanstd(self):
         self.check_funs_ddof(nanops.nanstd, np.std,
-                             allow_complex=False, allow_date=False, allow_tdelta=True)
+                             allow_complex=False,
+                             allow_str=False,
+                             allow_date=False,
+                             allow_tdelta=True,
+                             allow_obj='convert')
 
     def test_nansem(self):
         tm.skip_if_no_package('scipy.stats')
-        self.check_funs_ddof(nanops.nansem, np.var,
-                             allow_complex=False, allow_date=False, allow_tdelta=False)
+        from scipy.stats import sem
+        self.check_funs_ddof(nanops.nansem, sem,
+                             allow_complex=False,
+                             allow_str=False,
+                             allow_date=False,
+                             allow_tdelta=True,
+                             allow_obj='convert')
 
     def _minmax_wrap(self, value, axis=None, func=None):
         res = func(value, axis)
@@ -821,6 +835,123 @@ def test_non_convertable_values(self):
                           lambda: nanops._ensure_numeric([]))
 
 
+class TestNanvarFixedValues(tm.TestCase):
+
+    # xref GH10242
+
+    def setUp(self):
+        # Samples from a normal distribution.
+        self.variance = variance = 3.0
+        self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
+
+    def test_nanvar_all_finite(self):
+        samples = self.samples
+        actual_variance = nanops.nanvar(samples)
+        np.testing.assert_almost_equal(
+            actual_variance, self.variance, decimal=2)
+
+    def test_nanvar_nans(self):
+        samples = np.nan * np.ones(2 * self.samples.shape[0])
+        samples[::2] = self.samples
+
+        actual_variance = nanops.nanvar(samples, skipna=True)
+        np.testing.assert_almost_equal(
+            actual_variance, self.variance, decimal=2)
+
+        actual_variance = nanops.nanvar(samples, skipna=False)
+        np.testing.assert_almost_equal(
+            actual_variance, np.nan, decimal=2)
+
+    def test_nanstd_nans(self):
+        samples = np.nan * np.ones(2 * self.samples.shape[0])
+        samples[::2] = self.samples
+
+        actual_std = nanops.nanstd(samples, skipna=True)
+        np.testing.assert_almost_equal(
+            actual_std, self.variance ** 0.5, decimal=2)
+
+        actual_std = nanops.nanvar(samples, skipna=False)
+        np.testing.assert_almost_equal(
+            actual_std, np.nan, decimal=2)
+
+    def test_nanvar_axis(self):
+        # Generate some sample data.
+        samples_norm = self.samples
+        samples_unif = self.prng.uniform(size=samples_norm.shape[0])
+        samples = np.vstack([samples_norm, samples_unif])
+
+        actual_variance = nanops.nanvar(samples, axis=1)
+        np.testing.assert_array_almost_equal(
+            actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2)
+
+    def test_nanvar_ddof(self):
+        n = 5
+        samples = self.prng.uniform(size=(10000, n+1))
+        samples[:, -1] = np.nan  # Force use of our own algorithm.
+
+        variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
+        variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
+        variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
+
+        # The unbiased estimate.
+        var = 1.0 / 12
+        np.testing.assert_almost_equal(variance_1, var, decimal=2)
+        # The underestimated variance.
+        np.testing.assert_almost_equal(
+            variance_0,  (n - 1.0) / n * var, decimal=2)
+        # The overestimated variance.
+        np.testing.assert_almost_equal(
+            variance_2,  (n - 1.0) / (n - 2.0) * var, decimal=2)
+
+    def test_ground_truth(self):
+        # Test against values that were precomputed with Numpy.
+        samples = np.empty((4, 4))
+        samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287],
+                                    [0.72980153, 0.03109364, 0.99155171],
+                                    [0.09317602, 0.60078248, 0.15871292]])
+        samples[3] = samples[:, 3] = np.nan
+
+        # Actual variances along axis=0, 1 for ddof=0, 1, 2
+        variance = np.array(
+            [[[0.13762259, 0.05619224, 0.11568816],
+              [0.20643388, 0.08428837, 0.17353224],
+              [0.41286776, 0.16857673, 0.34706449]],
+             [[0.09519783, 0.16435395, 0.05082054],
+              [0.14279674, 0.24653093, 0.07623082],
+              [0.28559348, 0.49306186, 0.15246163]]]
+        )
+
+        # Test nanvar.
+        for axis in range(2):
+            for ddof in range(3):
+                var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
+                np.testing.assert_array_almost_equal(
+                    var[:3], variance[axis, ddof]
+                )
+                np.testing.assert_equal(var[3], np.nan)
+
+        # Test nanstd.
+        for axis in range(2):
+            for ddof in range(3):
+                std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
+                np.testing.assert_array_almost_equal(
+                    std[:3], variance[axis, ddof] ** 0.5
+                )
+                np.testing.assert_equal(std[3], np.nan)
+
+    def test_nanstd_roundoff(self):
+        # Regression test for GH 10242 (test data taken from GH 10489). Ensure
+        # that variance is stable.
+        data = Series(766897346 * np.ones(10))
+        for ddof in range(3):
+            result = data.std(ddof=ddof)
+            self.assertEqual(result, 0.0)
+
+    @property
+    def prng(self):
+        return np.random.RandomState(1234)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
@@ -684,8 +684,8 @@ def test_timedelta_ops(self):
         self.assertEqual(result[0], expected)
 
         # invalid ops
-        for op in ['skew','kurt','sem','var','prod']:
-            self.assertRaises(TypeError, lambda : getattr(td,op)())
+        for op in ['skew','kurt','sem','prod']:
+            self.assertRaises(TypeError, getattr(td,op))
 
         # GH 10040
         # make sure NaT is properly handled by median()