diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index ddd3a8cf1ecb7..70d8728a67695 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) - Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`) - Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`) +- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c2b9c723b7c72..f619f09f85e66 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -481,100 +481,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr return result -# ---------------------------------------------------------------------- -# Kendall correlation -# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient - -@cython.boundscheck(False) -@cython.wraparound(False) -def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: - """ - Perform kendall correlation on a 2d array - - Parameters - ---------- - mat : np.ndarray[float64_t, ndim=2] - Array to compute kendall correlation on - minp : int, default 1 - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - numpy.ndarray[float64_t, ndim=2] - Correlation matrix - """ - cdef: - Py_ssize_t i, j, k, xi, yi, N, K - ndarray[float64_t, ndim=2] result - ndarray[float64_t, ndim=2] ranked_mat - ndarray[uint8_t, ndim=2] mask - float64_t currj - ndarray[uint8_t, ndim=1] valid - ndarray[int64_t] sorted_idxs - ndarray[float64_t, ndim=1] col - int64_t n_concordant - int64_t total_concordant = 0 - int64_t total_discordant = 0 - float64_t kendall_tau - int64_t n_obs - const intp_t[:] labels_n - - N, K = (mat).shape - - result = np.empty((K, K), dtype=np.float64) - mask = np.isfinite(mat) - - ranked_mat = np.empty((N, K), dtype=np.float64) - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.intp) - - for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) - - for xi in range(K): - sorted_idxs = ranked_mat[:, xi].argsort() - ranked_mat = ranked_mat[sorted_idxs] - mask = mask[sorted_idxs] - for yi in range(xi + 1, K): - valid = mask[:, xi] & mask[:, yi] - if valid.sum() < minp: - result[xi, yi] = NaN - result[yi, xi] = NaN - else: - # Get columns and order second column using 1st column ranks - if not valid.all(): - col = ranked_mat[valid.nonzero()][:, yi] - else: - col = ranked_mat[:, yi] - n_obs = col.shape[0] - total_concordant = 0 - total_discordant = 0 - for j in range(n_obs - 1): - currj = col[j] - # Count num concordant and discordant pairs - n_concordant = 0 - for k in range(j, n_obs): - if col[k] > currj: - n_concordant += 1 - total_concordant += n_concordant - total_discordant += (n_obs - 1 - j - n_concordant) - # Note: we do total_concordant+total_discordant here which is - # equivalent to the C(n, 2), the total # of pairs, - # listed on wikipedia - kendall_tau = (total_concordant - total_discordant) / \ - (total_concordant + total_discordant) - result[xi, yi] = kendall_tau - result[yi, xi] = kendall_tau - - if mask[:, xi].sum() > minp: - result[xi, xi] = 1 - else: - result[xi, xi] = NaN - - return result - - # ---------------------------------------------------------------------- ctypedef fused algos_t: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4d4b90be908e9..da37f3c73d68d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9342,7 +9342,8 @@ def corr( regardless of the callable's behavior. min_periods : int, optional Minimum number of observations required per pair of columns - to have a valid result. + to have a valid result. Currently only available for Pearson + and Spearman correlation. Returns ------- @@ -9376,9 +9377,7 @@ def corr( correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall": - correl = libalgos.nancorr_kendall(mat, minp=min_periods) - elif callable(method): + elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = mat.T diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 352d95156bf98..c259902ec2498 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -92,6 +92,7 @@ class TestDataFrameCorr: def test_corr_scipy_method(self, float_frame, method): float_frame["A"][:5] = np.nan float_frame["B"][5:10] = np.nan + float_frame["A"][:10] = float_frame["A"][10:20] correls = float_frame.corr(method=method) expected = float_frame["A"].corr(float_frame["C"], method=method)