diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 2dbed08aa02f3..08222ef06d21f 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -135,8 +135,11 @@ Bug Fixes - Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`) - Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`) + - Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`) - Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`) - Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`) + +- Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 4cffb663748a4..53fcdb61bd1ae 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -1147,58 +1147,43 @@ def group_prod_bin_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, group_var_template = """@cython.wraparound(False) @cython.boundscheck(False) +@cython.cdivision(True) def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, ndarray[int64_t] counts, ndarray[%(dest_type2)s, ndim=2] values, ndarray[int64_t] labels): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, ct - ndarray[%(dest_type2)s, ndim=2] nobs, sumx, sumxx + %(dest_type2)s val, ct, oldmean + ndarray[%(dest_type2)s, ndim=2] nobs, mean if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + mean = np.zeros_like(out) N, K = ( values).shape - with nogil: - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue + out[:, :] = 0.0 - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - lab = labels[i] - if lab < 0: - continue + counts[lab] += 1 - counts[lab] += 1 - val = values[i, 0] + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) for i in range(ncounts): for j in range(K): @@ -1206,8 +1191,8 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, if ct < 2: out[i, j] = NAN else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] /= (ct - 1) + """ group_var_bin_template = """@cython.wraparound(False) diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index c76838c4a49c9..db0e96d158f0c 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -7232,58 +7232,43 @@ def group_prod_bin_float32(ndarray[float32_t, ndim=2] out, @cython.wraparound(False) @cython.boundscheck(False) +@cython.cdivision(True) def group_var_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, ndarray[int64_t] labels): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, ct - ndarray[float64_t, ndim=2] nobs, sumx, sumxx + float64_t val, ct, oldmean + ndarray[float64_t, ndim=2] nobs, mean if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + mean = np.zeros_like(out) N, K = ( values).shape - with nogil: - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue + out[:, :] = 0.0 - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - lab = labels[i] - if lab < 0: - continue + counts[lab] += 1 - counts[lab] += 1 - val = values[i, 0] + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) for i in range(ncounts): for j in range(K): @@ -7291,63 +7276,48 @@ def group_var_float64(ndarray[float64_t, ndim=2] out, if ct < 2: out[i, j] = NAN else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] /= (ct - 1) + @cython.wraparound(False) @cython.boundscheck(False) +@cython.cdivision(True) def group_var_float32(ndarray[float32_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float32_t, ndim=2] values, ndarray[int64_t] labels): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, ct - ndarray[float32_t, ndim=2] nobs, sumx, sumxx + float32_t val, ct, oldmean + ndarray[float32_t, ndim=2] nobs, mean if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - sumxx = np.zeros_like(out) + mean = np.zeros_like(out) N, K = ( values).shape - with nogil: - if K > 1: - for i in range(N): - - lab = labels[i] - if lab < 0: - continue + out[:, :] = 0.0 - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - sumxx[lab, j] += val * val - else: - for i in range(N): + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue - lab = labels[i] - if lab < 0: - continue + counts[lab] += 1 - counts[lab] += 1 - val = values[i, 0] + for j in range(K): + val = values[i, j] # not nan if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - sumxx[lab, 0] += val * val - + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) for i in range(ncounts): for j in range(K): @@ -7355,8 +7325,8 @@ def group_var_float32(ndarray[float32_t, ndim=2] out, if ct < 2: out[i, j] = NAN else: - out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / - (ct * ct - ct)) + out[i, j] /= (ct - 1) + @cython.wraparound(False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8192f6e99116b..138ef92831b2a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2,6 +2,7 @@ from pandas.compat import range import numpy as np +from numpy.random import RandomState from pandas.core.api import Series, Categorical import pandas as pd @@ -10,6 +11,7 @@ import pandas.util.testing as tm import pandas.hashtable as hashtable + class TestMatch(tm.TestCase): _multiprocess_can_split_ = True @@ -285,6 +287,125 @@ def test_dropna(self): pd.Series([10.3, 5., 5., None]).value_counts(dropna=False), pd.Series([2, 1, 1], index=[5., 10.3, np.nan])) + +class GroupVarTestMixin(object): + + def test_group_var_generic_1d(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 1))).astype(self.dtype) + counts = np.zeros(5, dtype=int) + values = 10 * prng.rand(15, 1).astype(self.dtype) + labels = np.tile(np.arange(5), (3, )) + + expected_out = (np.squeeze(values) + .reshape((5, 3), order='F') + .std(axis=1, ddof=1) ** 2)[:, np.newaxis] + expected_counts = counts + 3 + + self.algo(out, counts, values, labels) + np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_array_equal(counts, expected_counts) + + def test_group_var_generic_1d_flat_labels(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((1, 1))).astype(self.dtype) + counts = np.zeros(1, dtype=int) + values = 10 * prng.rand(5, 1).astype(self.dtype) + labels = np.zeros(5, dtype=int) + + expected_out = np.array([[values.std(ddof=1) ** 2]]) + expected_counts = counts + 5 + + self.algo(out, counts, values, labels) + + np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_all_finite(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype=int) + values = 10 * prng.rand(10, 2).astype(self.dtype) + labels = np.tile(np.arange(5), (2, )) + + expected_out = np.std( + values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_array_equal(counts, expected_counts) + + def test_group_var_generic_2d_some_nan(self): + prng = RandomState(1234) + + out = (np.nan * np.ones((5, 2))).astype(self.dtype) + counts = np.zeros(5, dtype=int) + values = 10 * prng.rand(10, 2).astype(self.dtype) + values[:, 1] = np.nan + labels = np.tile(np.arange(5), (2, )) + + expected_out = np.vstack([ + values[:, 0].reshape(5, 2, order='F').std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5) + ]).T + expected_counts = counts + 2 + + self.algo(out, counts, values, labels) + np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_array_equal(counts, expected_counts) + + def test_group_var_constant(self): + # Regression test from GH 10448. + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0]) + values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) + labels = np.zeros(3, dtype=np.int) + + self.algo(out, counts, values, labels) + + self.assertEqual(counts[0], 3) + self.assertTrue(out[0, 0] >= 0) # Python 2.6 has no assertGreaterEqual + tm.assert_almost_equal(out[0, 0], 0.0) + + +class TestGroupVarFloat64(tm.TestCase, GroupVarTestMixin): + __test__ = True + _multiprocess_can_split_ = True + + algo = algos.algos.group_var_float64 + dtype = np.float64 + rtol = 1e-5 + + def test_group_var_large_inputs(self): + + prng = RandomState(1234) + + out = np.array([[np.nan]], dtype=self.dtype) + counts = np.array([0]) + values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) + values.shape = (10 ** 6, 1) + labels = np.zeros(10 ** 6, dtype=np.int) + + self.algo(out, counts, values, labels) + + self.assertEqual(counts[0], 10 ** 6) + tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True) + + +class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin): + __test__ = True + _multiprocess_can_split_ = True + + algo = algos.algos.group_var_float32 + dtype = np.float32 + rtol = 1e-2 + + def test_quantile(): s = Series(np.random.randn(100))