From 57ba8dc002883a3b4d2dec98c05fa58b0c7a3bf9 Mon Sep 17 00:00:00 2001 From: Joris Vankerschaver Date: Tue, 28 Jul 2015 13:52:39 +0100 Subject: [PATCH 1/2] BUG: Allow exceptions to propagate to top level. A missing 'raise' caused any exception in check_funs_ddof to be ignored. Restoring the raise revealed some issues with the existing tests: 1. Optional keyword arguments were not passed into the function to be tested. 2. nanvar, nanstd, nansem were being tested on string data, but are not able to handle string inputs. 3. test_nansem compared the output of nanops.nansem to numpy.var, which probably should have been scipy.stats.sem, judging from the conditional at the top of the test. This commit also replaces all BaseExceptions by checks for AssertionError. --- pandas/tests/test_nanops.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index a903b76b3ac7f..30d99cd8fb184 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -182,15 +182,15 @@ def check_fun_data(self, testfunc, targfunc, **kwargs) self.check_results(targ, res, axis) if skipna: - res = testfunc(testarval, axis=axis) + res = testfunc(testarval, axis=axis, **kwargs) self.check_results(targ, res, axis) if axis is None: - res = testfunc(testarval, skipna=skipna) + res = testfunc(testarval, skipna=skipna, **kwargs) self.check_results(targ, res, axis) if skipna and axis is None: - res = testfunc(testarval) + res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis) - except BaseException as exc: + except AssertionError as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1), 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) @@ -222,7 +222,7 @@ def check_fun(self, testfunc, targfunc, try: self.check_fun_data(testfunc, targfunc, testarval, targarval, targarnanval, **kwargs) - except BaseException as exc: + except AssertionError as exc: exc.args += 
('testar: %s' % testar, 'targar: %s' % targar, 'targarnan: %s' % targarnan) @@ -291,12 +291,13 @@ def check_funs_ddof(self, testfunc, targfunc, allow_date=False, allow_tdelta=False, allow_obj=True,): for ddof in range(3): try: - self.check_funs(self, testfunc, targfunc, + self.check_funs(testfunc, targfunc, allow_complex, allow_all_nan, allow_str, allow_date, allow_tdelta, allow_obj, ddof=ddof) - except BaseException as exc: + except AssertionError as exc: exc.args += ('ddof %s' % ddof,) + raise def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): if value.dtype.kind == 'O': @@ -366,16 +367,28 @@ def test_nanmedian(self): def test_nanvar(self): self.check_funs_ddof(nanops.nanvar, np.var, - allow_complex=False, allow_date=False, allow_tdelta=False) + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + allow_obj='convert') def test_nanstd(self): self.check_funs_ddof(nanops.nanstd, np.std, - allow_complex=False, allow_date=False, allow_tdelta=True) + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj='convert') def test_nansem(self): tm.skip_if_no_package('scipy.stats') - self.check_funs_ddof(nanops.nansem, np.var, - allow_complex=False, allow_date=False, allow_tdelta=False) + from scipy.stats import sem + self.check_funs_ddof(nanops.nansem, sem, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False) def _minmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -663,7 +676,7 @@ def check_bool(self, func, value, correct, *args, **kwargs): self.assertTrue(res0) else: self.assertFalse(res0) - except BaseException as exc: + except AssertionError as exc: exc.args += ('dim: %s' % getattr(value, 'ndim', value),) raise if not hasattr(value, 'ndim'): @@ -700,7 +713,7 @@ def test__has_infs(self): val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) - except BaseException as exc: + except AssertionError as exc: exc.args 
+= (arr,) raise @@ -710,7 +723,7 @@ def test__has_infs(self): self.check_bool(nanops._has_infs, val, correct) self.check_bool(nanops._has_infs, val.astype('f4'), correct) self.check_bool(nanops._has_infs, val.astype('f2'), correct) - except BaseException as exc: + except AssertionError as exc: exc.args += (arr,) raise @@ -743,7 +756,7 @@ def test__isfinite(self): val = getattr(self, arr) try: self.check_bool(func1, val, correct) - except BaseException as exc: + except AssertionError as exc: exc.args += (arr,) raise @@ -753,7 +766,7 @@ def test__isfinite(self): self.check_bool(func1, val, correct) self.check_bool(func1, val.astype('f4'), correct) self.check_bool(func1, val.astype('f2'), correct) - except BaseException as exc: + except AssertionError as exc: exc.args += (arr,) raise From 86317419263bfe6541d6ec94e6fbef566e440eb5 Mon Sep 17 00:00:00 2001 From: Joris Vankerschaver Date: Sun, 26 Jul 2015 17:46:47 +0100 Subject: [PATCH 2/2] BUG: Use stable algorithm for _nanvar. --- doc/source/whatsnew/v0.17.0.txt | 2 + pandas/core/nanops.py | 54 ++++++---- pandas/tests/test_frame.py | 10 +- pandas/tests/test_nanops.py | 136 ++++++++++++++++++++++-- pandas/tseries/tests/test_timedeltas.py | 4 +- 5 files changed, 167 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 3f4a13a09e5e6..cafb7eb1d8446 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -923,3 +923,5 @@ Bug Fixes - Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`) - Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`) - Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). 
(:issue:`10590`) +- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`) +- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 447a273a1e171..e5ef5456e4977 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): return count, d -def _nanvar(values, axis=None, skipna=True, ddof=1): - # private nanvar calculator +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanstd(values, axis=None, skipna=True, ddof=1): + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + return _wrap_results(result, values.dtype) + + +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): + + dtype = values.dtype mask = isnull(values) if is_any_int_dtype(values): values = values.astype('f8') + values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) @@ -361,29 +372,27 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): values = values.copy() np.putmask(values, mask, 0) - X = _ensure_numeric(values.sum(axis)) - XX = _ensure_numeric((values ** 2).sum(axis)) - result = np.fabs((XX - X * X / count) / d) - return result - -@disallow('M8') -@bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1): - - result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) + # Compute variance via two-pass algorithm, which is stable against + # cancellation errors and relatively accurate for small numbers of + # observations. 
+ # + # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count + if axis is not None: + avg = np.expand_dims(avg, axis) + sqr = _ensure_numeric((avg - values) ** 2) + np.putmask(sqr, mask, 0) + result = sqr.sum(axis=axis, dtype=np.float64) / d + + # Return variance as np.float64 (the datatype used in the accumulator), + # unless we were dealing with a float array, in which case use the same + # precision as the original values array. + if is_float_dtype(dtype): + result = result.astype(dtype) return _wrap_results(result, values.dtype) -@disallow('M8','m8') -@bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): - # we are going to allow timedelta64[ns] here - # but NOT going to coerce them to the Timedelta type - # as this could cause overflow - # so var cannot be computed (but std can!) - return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof) - -@disallow('M8','m8') +@disallow('M8', 'm8') def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) @@ -391,6 +400,7 @@ def nansem(values, axis=None, skipna=True, ddof=1): if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) + var = nanvar(values, axis, skipna, ddof=ddof) return np.sqrt(var) / np.sqrt(count) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index aea165b907c05..16620671f71f0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12589,11 +12589,11 @@ def test_numeric_only_flag(self): expected = getattr(df2[['bar', 'baz']], meth)(axis=1) assert_series_equal(expected, result) - assertRaisesRegexp(TypeError, 'float', - getattr(df1, meth), axis=1, numeric_only=False) - - assertRaisesRegexp(TypeError, 'float', - getattr(df2, meth), axis=1, numeric_only=False) + try: + getattr(df1, meth)(axis=1, numeric_only=False) + getattr(df2, 
meth)(axis=1, numeric_only=False) + except (TypeError, ValueError) as e: + self.assertIn('float', str(e)) def test_sem(self): alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x)) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 30d99cd8fb184..8dc5f886eb4c3 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -190,7 +190,7 @@ def check_fun_data(self, testfunc, targfunc, if skipna and axis is None: res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis) - except AssertionError as exc: + except BaseException as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1), 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) @@ -222,7 +222,7 @@ def check_fun(self, testfunc, targfunc, try: self.check_fun_data(testfunc, targfunc, testarval, targarval, targarnanval, **kwargs) - except AssertionError as exc: + except BaseException as exc: exc.args += ('testar: %s' % testar, 'targar: %s' % targar, 'targarnan: %s' % targarnan) @@ -295,7 +295,7 @@ def check_funs_ddof(self, testfunc, targfunc, allow_complex, allow_all_nan, allow_str, allow_date, allow_tdelta, allow_obj, ddof=ddof) - except AssertionError as exc: + except BaseException as exc: exc.args += ('ddof %s' % ddof,) raise @@ -370,7 +370,7 @@ def test_nanvar(self): allow_complex=False, allow_str=False, allow_date=False, - allow_tdelta=False, + allow_tdelta=True, allow_obj='convert') def test_nanstd(self): @@ -388,7 +388,8 @@ def test_nansem(self): allow_complex=False, allow_str=False, allow_date=False, - allow_tdelta=False) + allow_tdelta=True, + allow_obj='convert') def _minmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -676,7 +677,7 @@ def check_bool(self, func, value, correct, *args, **kwargs): self.assertTrue(res0) else: self.assertFalse(res0) - except AssertionError as exc: + except BaseException as exc: exc.args += ('dim: %s' % getattr(value, 'ndim', value),) raise if not hasattr(value, 'ndim'): @@ -713,7 +714,7 @@ def 
test__has_infs(self): val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) - except AssertionError as exc: + except BaseException as exc: exc.args += (arr,) raise @@ -723,7 +724,7 @@ def test__has_infs(self): self.check_bool(nanops._has_infs, val, correct) self.check_bool(nanops._has_infs, val.astype('f4'), correct) self.check_bool(nanops._has_infs, val.astype('f2'), correct) - except AssertionError as exc: + except BaseException as exc: exc.args += (arr,) raise @@ -756,7 +757,7 @@ def test__isfinite(self): val = getattr(self, arr) try: self.check_bool(func1, val, correct) - except AssertionError as exc: + except BaseException as exc: exc.args += (arr,) raise @@ -766,7 +767,7 @@ def test__isfinite(self): self.check_bool(func1, val, correct) self.check_bool(func1, val.astype('f4'), correct) self.check_bool(func1, val.astype('f2'), correct) - except AssertionError as exc: + except BaseException as exc: exc.args += (arr,) raise @@ -830,6 +831,121 @@ def test_non_convertable_values(self): lambda: nanops._ensure_numeric([])) +class TestNanvarFixedValues(tm.TestCase): + + def setUp(self): + # Samples from a normal distribution. 
+ self.variance = variance = 3.0 + self.samples = self.prng.normal(scale=variance ** 0.5, size=100000) + + def test_nanvar_all_finite(self): + samples = self.samples + actual_variance = nanops.nanvar(samples) + np.testing.assert_almost_equal( + actual_variance, self.variance, decimal=2) + + def test_nanvar_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_variance = nanops.nanvar(samples, skipna=True) + np.testing.assert_almost_equal( + actual_variance, self.variance, decimal=2) + + actual_variance = nanops.nanvar(samples, skipna=False) + np.testing.assert_almost_equal( + actual_variance, np.nan, decimal=2) + + def test_nanstd_nans(self): + samples = np.nan * np.ones(2 * self.samples.shape[0]) + samples[::2] = self.samples + + actual_std = nanops.nanstd(samples, skipna=True) + np.testing.assert_almost_equal( + actual_std, self.variance ** 0.5, decimal=2) + + actual_std = nanops.nanvar(samples, skipna=False) + np.testing.assert_almost_equal( + actual_std, np.nan, decimal=2) + + def test_nanvar_axis(self): + # Generate some sample data. + samples_norm = self.samples + samples_unif = self.prng.uniform(size=samples_norm.shape[0]) + samples = np.vstack([samples_norm, samples_unif]) + + actual_variance = nanops.nanvar(samples, axis=1) + np.testing.assert_array_almost_equal( + actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2) + + def test_nanvar_ddof(self): + n = 5 + samples = self.prng.uniform(size=(10000, n+1)) + samples[:, -1] = np.nan # Force use of our own algorithm. + + variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean() + variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean() + variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean() + + # The unbiased estimate. + var = 1.0 / 12 + np.testing.assert_almost_equal(variance_1, var, decimal=2) + # The underestimated variance. 
+ np.testing.assert_almost_equal( + variance_0, (n - 1.0) / n * var, decimal=2) + # The overestimated variance. + np.testing.assert_almost_equal( + variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2) + + def test_ground_truth(self): + # Test against values that were precomputed with Numpy. + samples = np.empty((4, 4)) + samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287], + [0.72980153, 0.03109364, 0.99155171], + [0.09317602, 0.60078248, 0.15871292]]) + samples[3] = samples[:, 3] = np.nan + + # Actual variances along axis=0, 1 for ddof=0, 1, 2 + variance = np.array( + [[[0.13762259, 0.05619224, 0.11568816], + [0.20643388, 0.08428837, 0.17353224], + [0.41286776, 0.16857673, 0.34706449]], + [[0.09519783, 0.16435395, 0.05082054], + [0.14279674, 0.24653093, 0.07623082], + [0.28559348, 0.49306186, 0.15246163]]] + ) + + # Test nanvar. + for axis in range(2): + for ddof in range(3): + var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) + np.testing.assert_array_almost_equal( + var[:3], variance[axis, ddof] + ) + np.testing.assert_equal(var[3], np.nan) + + # Test nanstd. + for axis in range(2): + for ddof in range(3): + std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) + np.testing.assert_array_almost_equal( + std[:3], variance[axis, ddof] ** 0.5 + ) + np.testing.assert_equal(std[3], np.nan) + + def test_nanstd_roundoff(self): + # Regression test for GH 10242 (test data taken from GH 10489). Ensure + # that variance is stable. 
+ data = Series(766897346 * np.ones(10)) + for ddof in range(3): + result = data.std(ddof=ddof) + self.assertEqual(result, 0.0) + + @property + def prng(self): + return np.random.RandomState(1234) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 06533a4ef85ff..a2a8f1484f70e 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -684,8 +684,8 @@ def test_timedelta_ops(self): self.assertEqual(result[0], expected) # invalid ops - for op in ['skew','kurt','sem','var','prod']: - self.assertRaises(TypeError, lambda : getattr(td,op)()) + for op in ['skew','kurt','sem','prod']: + self.assertRaises(TypeError, getattr(td,op)) # GH 10040 # make sure NaT is properly handled by median()