diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 3f4a13a09e5e6..cafb7eb1d8446 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -923,3 +923,5 @@ Bug Fixes - Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`) - Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`) - Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`) +- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`) +- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 447a273a1e171..e5ef5456e4977 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): return count, d -def _nanvar(values, axis=None, skipna=True, ddof=1): - # private nanvar calculator +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanstd(values, axis=None, skipna=True, ddof=1): + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + return _wrap_results(result, values.dtype) + + +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): + + dtype = values.dtype mask = isnull(values) if is_any_int_dtype(values): values = values.astype('f8') + values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) @@ -361,29 +372,27 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): values = values.copy() np.putmask(values, mask, 0) - X = _ensure_numeric(values.sum(axis)) - XX = _ensure_numeric((values ** 2).sum(axis)) - result = np.fabs((XX - X * X / count) / d) - return result - -@disallow('M8') -@bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1): - - result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + # Compute variance via two-pass algorithm, which is stable against + # cancellation errors and relatively accurate for small numbers of + # observations. + # + # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count + if axis is not None: + avg = np.expand_dims(avg, axis) + sqr = _ensure_numeric((avg - values) ** 2) + np.putmask(sqr, mask, 0) + result = sqr.sum(axis=axis, dtype=np.float64) / d + + # Return variance as np.float64 (the datatype used in the accumulator), + # unless we were dealing with a float array, in which case use the same + # precision as the original values array. + if is_float_dtype(dtype): + result = result.astype(dtype) return _wrap_results(result, values.dtype) -@disallow('M8','m8') -@bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): - # we are going to allow timedelta64[ns] here - # but NOT going to coerce them to the Timedelta type - # as this could cause overflow - # so var cannot be computed (but std can!) - return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof) - -@disallow('M8','m8') +@disallow('M8', 'm8') def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) @@ -391,6 +400,7 @@ def nansem(values, axis=None, skipna=True, ddof=1): if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) + var = nanvar(values, axis, skipna, ddof=ddof) return np.sqrt(var) / np.sqrt(count) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aea165b907c05..16620671f71f0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12589,11 +12589,11 @@ def test_numeric_only_flag(self): expected = getattr(df2[['bar', 'baz']], meth)(axis=1) assert_series_equal(expected, result) - assertRaisesRegexp(TypeError, 'float', - getattr(df1, meth), axis=1, numeric_only=False) - - assertRaisesRegexp(TypeError, 'float', - getattr(df2, meth), axis=1, numeric_only=False) + try: + getattr(df1, meth)(axis=1, numeric_only=False) + getattr(df2, meth)(axis=1, numeric_only=False) + except (TypeError, ValueError) as e: + self.assertIn('float', str(e)) def test_sem(self): alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a903b76b3ac7f..8dc5f886eb4c3 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -182,13 +182,13 @@ def check_fun_data(self, testfunc, targfunc, **kwargs) self.check_results(targ, res, axis) if skipna: - res = testfunc(testarval, axis=axis) + res = testfunc(testarval, axis=axis, **kwargs) self.check_results(targ, res, axis) if axis is None: - res = testfunc(testarval, skipna=skipna) + res = testfunc(testarval, skipna=skipna, **kwargs) self.check_results(targ, res, axis) if skipna and axis is None: - res = testfunc(testarval) + res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis) except BaseException as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1), @@ -291,12 +291,13 @@ def check_funs_ddof(self, testfunc, targfunc, allow_date=False, allow_tdelta=False, allow_obj=True,): for ddof in range(3): try: - self.check_funs(self, testfunc, targfunc, + self.check_funs(testfunc, targfunc, allow_complex, allow_all_nan, allow_str, allow_date, allow_tdelta, allow_obj, ddof=ddof) except BaseException as exc: exc.args += ('ddof %s' % ddof,) + raise def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): if value.dtype.kind == 'O': @@ -366,16 +367,29 @@ def test_nanmedian(self): def test_nanvar(self): self.check_funs_ddof(nanops.nanvar, np.var, - allow_complex=False, allow_date=False, allow_tdelta=False) + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj='convert') def test_nanstd(self): self.check_funs_ddof(nanops.nanstd, np.std, - allow_complex=False, allow_date=False, allow_tdelta=True) + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj='convert') def test_nansem(self): tm.skip_if_no_package('scipy.stats') - self.check_funs_ddof(nanops.nansem, np.var, - allow_complex=False, allow_date=False, allow_tdelta=False) + from scipy.stats import sem + self.check_funs_ddof(nanops.nansem, sem, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj='convert') def _minmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -817,6 +831,121 @@ def test_non_convertable_values(self): lambda: nanops._ensure_numeric([])) +class TestNanvarFixedValues(tm.TestCase): + + def setUp(self): + # Samples from a normal distribution. + self.variance = variance = 3.0 + self.samples = self.prng.normal(scale=variance ** 0.5, size=100000) + + def test_nanvar_all_finite(self): + samples = self.samples + actual_variance = nanops.nanvar(samples) + np.testing.assert_almost_equal( + actual_variance, self.variance, decimal=2) + + def test_nanvar_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_variance = nanops.nanvar(samples, skipna=True) + np.testing.assert_almost_equal( + actual_variance, self.variance, decimal=2) + + actual_variance = nanops.nanvar(samples, skipna=False) + np.testing.assert_almost_equal( + actual_variance, np.nan, decimal=2) + + def test_nanstd_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_std = nanops.nanstd(samples, skipna=True) + np.testing.assert_almost_equal( + actual_std, self.variance ** 0.5, decimal=2) + + actual_std = nanops.nanvar(samples, skipna=False) + np.testing.assert_almost_equal( + actual_std, np.nan, decimal=2) + + def test_nanvar_axis(self): + # Generate some sample data. + samples_norm = self.samples + samples_unif = self.prng.uniform(size=samples_norm.shape[0]) + samples = np.vstack([samples_norm, samples_unif]) + + actual_variance = nanops.nanvar(samples, axis=1) + np.testing.assert_array_almost_equal( + actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2) + + def test_nanvar_ddof(self): + n = 5 + samples = self.prng.uniform(size=(10000, n+1)) + samples[:, -1] = np.nan # Force use of our own algorithm. + + variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean() + variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean() + variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean() + + # The unbiased estimate. + var = 1.0 / 12 + np.testing.assert_almost_equal(variance_1, var, decimal=2) + # The underestimated variance. + np.testing.assert_almost_equal( + variance_0, (n - 1.0) / n * var, decimal=2) + # The overestimated variance. + np.testing.assert_almost_equal( + variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2) + + def test_ground_truth(self): + # Test against values that were precomputed with Numpy. + samples = np.empty((4, 4)) + samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287], + [0.72980153, 0.03109364, 0.99155171], + [0.09317602, 0.60078248, 0.15871292]]) + samples[3] = samples[:, 3] = np.nan + + # Actual variances along axis=0, 1 for ddof=0, 1, 2 + variance = np.array( + [[[0.13762259, 0.05619224, 0.11568816], + [0.20643388, 0.08428837, 0.17353224], + [0.41286776, 0.16857673, 0.34706449]], + [[0.09519783, 0.16435395, 0.05082054], + [0.14279674, 0.24653093, 0.07623082], + [0.28559348, 0.49306186, 0.15246163]]] + ) + + # Test nanvar. + for axis in range(2): + for ddof in range(3): + var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) + np.testing.assert_array_almost_equal( + var[:3], variance[axis, ddof] + ) + np.testing.assert_equal(var[3], np.nan) + + # Test nanstd. + for axis in range(2): + for ddof in range(3): + std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) + np.testing.assert_array_almost_equal( + std[:3], variance[axis, ddof] ** 0.5 + ) + np.testing.assert_equal(std[3], np.nan) + + def test_nanstd_roundoff(self): + # Regression test for GH 10242 (test data taken from GH 10489). Ensure + # that variance is stable. + data = Series(766897346 * np.ones(10)) + for ddof in range(3): + result = data.std(ddof=ddof) + self.assertEqual(result, 0.0) + + @property + def prng(self): + return np.random.RandomState(1234) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 06533a4ef85ff..a2a8f1484f70e 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -684,8 +684,8 @@ def test_timedelta_ops(self): self.assertEqual(result[0], expected) # invalid ops - for op in ['skew','kurt','sem','var','prod']: - self.assertRaises(TypeError, lambda : getattr(td,op)()) + for op in ['skew','kurt','sem','prod']: + self.assertRaises(TypeError, getattr(td,op)) # GH 10040 # make sure NaT is properly handled by median()