diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1f24ab7a101b..53c56fe3f6909 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -219,6 +219,7 @@ Other enhancements - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) +- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`) - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`) - :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`) - :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index da5ae97bb067b..620889ad39889 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -268,7 +268,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): ndarray[float64_t, ndim=2] result ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 - float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor + float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx + float64_t ssqdmy, covxy N, K = (mat).shape @@ -283,37 +284,29 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): with nogil: for xi in range(K): for yi in range(xi + 1): - nobs = sumxx = sumyy = sumx = sumy = 0 + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 for i in range(N): if mask[i, xi] and mask[i, yi]: vx = mat[i, xi] vy = mat[i, yi] nobs += 1 - sumx += vx - sumy += vy + prev_meanx = meanx + prev_meany = meany + meanx = meanx + 1 / nobs * (vx - meanx) + meany = meany + 1 / nobs * (vy - meany) + ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx) + ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany) + covxy = covxy + (vx - meanx) * (vy - prev_meany) if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN else: - meanx = sumx / nobs - meany = sumy / nobs - - # now the cov numerator - sumx = 0 - - for i in range(N): - if mask[i, xi] and mask[i, yi]: - vx = mat[i, xi] - meanx - vy = mat[i, yi] - meany - - sumx += vx * vy - sumxx += vx * vx - sumyy += vy * vy - - divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy) + divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) if divisor != 0: - result[xi, yi] = result[yi, xi] = sumx / divisor + result[xi, yi] = result[yi, xi] = covxy / divisor else: result[xi, yi] = result[yi, xi] = NaN diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 7eeeb245534f5..6cea5abcac6d0 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -208,6 +208,25 @@ def test_corr_item_cache(self): assert df["A"] is ser assert df.values[0, 0] == 99 + @pytest.mark.parametrize("length", [2, 20, 200, 2000]) + def test_corr_for_constant_columns(self, length): + # GH: 37448 + df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"]) + result = df.corr() + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + def test_calc_corr_small_numbers(self): + # GH: 37452 + df = DataFrame( + {"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]} + ) + result = df.corr() + expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"]) + tm.assert_frame_equal(result, expected) + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame):