Skip to content

Commit b7082dd

Browse files
authored
Backport PR pandas-dev#43403: Revert Cythonized Kendall implementation and improve test case to prevent regressions (pandas-dev#43431)
1 parent 6d8b38b commit b7082dd

File tree

4 files changed

+5 -98 lines changed

4 files changed

+5 -98 lines changed

doc/source/whatsnew/v1.3.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Fixed regressions
2222
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
2323
- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
2424
- Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`)
25+
- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`)
2526

2627
.. ---------------------------------------------------------------------------
2728

pandas/_libs/algos.pyx

-94
Original file line numberDiff line numberDiff line change
@@ -481,100 +481,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
481481
return result
482482

483483

484-
# ----------------------------------------------------------------------
485-
# Kendall correlation
486-
# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
487-
488-
@cython.boundscheck(False)
489-
@cython.wraparound(False)
490-
def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
491-
"""
492-
Perform kendall correlation on a 2d array
493-
494-
Parameters
495-
----------
496-
mat : np.ndarray[float64_t, ndim=2]
497-
Array to compute kendall correlation on
498-
minp : int, default 1
499-
Minimum number of observations required per pair of columns
500-
to have a valid result.
501-
502-
Returns
503-
-------
504-
numpy.ndarray[float64_t, ndim=2]
505-
Correlation matrix
506-
"""
507-
cdef:
508-
Py_ssize_t i, j, k, xi, yi, N, K
509-
ndarray[float64_t, ndim=2] result
510-
ndarray[float64_t, ndim=2] ranked_mat
511-
ndarray[uint8_t, ndim=2] mask
512-
float64_t currj
513-
ndarray[uint8_t, ndim=1] valid
514-
ndarray[int64_t] sorted_idxs
515-
ndarray[float64_t, ndim=1] col
516-
int64_t n_concordant
517-
int64_t total_concordant = 0
518-
int64_t total_discordant = 0
519-
float64_t kendall_tau
520-
int64_t n_obs
521-
const intp_t[:] labels_n
522-
523-
N, K = (<object>mat).shape
524-
525-
result = np.empty((K, K), dtype=np.float64)
526-
mask = np.isfinite(mat)
527-
528-
ranked_mat = np.empty((N, K), dtype=np.float64)
529-
# For compatibility when calling rank_1d
530-
labels_n = np.zeros(N, dtype=np.intp)
531-
532-
for i in range(K):
533-
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
534-
535-
for xi in range(K):
536-
sorted_idxs = ranked_mat[:, xi].argsort()
537-
ranked_mat = ranked_mat[sorted_idxs]
538-
mask = mask[sorted_idxs]
539-
for yi in range(xi + 1, K):
540-
valid = mask[:, xi] & mask[:, yi]
541-
if valid.sum() < minp:
542-
result[xi, yi] = NaN
543-
result[yi, xi] = NaN
544-
else:
545-
# Get columns and order second column using 1st column ranks
546-
if not valid.all():
547-
col = ranked_mat[valid.nonzero()][:, yi]
548-
else:
549-
col = ranked_mat[:, yi]
550-
n_obs = col.shape[0]
551-
total_concordant = 0
552-
total_discordant = 0
553-
for j in range(n_obs - 1):
554-
currj = col[j]
555-
# Count num concordant and discordant pairs
556-
n_concordant = 0
557-
for k in range(j, n_obs):
558-
if col[k] > currj:
559-
n_concordant += 1
560-
total_concordant += n_concordant
561-
total_discordant += (n_obs - 1 - j - n_concordant)
562-
# Note: we do total_concordant+total_discordant here which is
563-
# equivalent to the C(n, 2), the total # of pairs,
564-
# listed on wikipedia
565-
kendall_tau = (total_concordant - total_discordant) / \
566-
(total_concordant + total_discordant)
567-
result[xi, yi] = kendall_tau
568-
result[yi, xi] = kendall_tau
569-
570-
if mask[:, xi].sum() > minp:
571-
result[xi, xi] = 1
572-
else:
573-
result[xi, xi] = NaN
574-
575-
return result
576-
577-
578484
# ----------------------------------------------------------------------
579485

580486
ctypedef fused algos_t:

pandas/core/frame.py

+3 -4
Original file line numberDiff line numberDiff line change
@@ -9342,7 +9342,8 @@ def corr(
93429342
regardless of the callable's behavior.
93439343
min_periods : int, optional
93449344
Minimum number of observations required per pair of columns
9345-
to have a valid result.
9345+
to have a valid result. Currently only available for Pearson
9346+
and Spearman correlation.
93469347
93479348
Returns
93489349
-------
@@ -9376,9 +9377,7 @@ def corr(
93769377
correl = libalgos.nancorr(mat, minp=min_periods)
93779378
elif method == "spearman":
93789379
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
9379-
elif method == "kendall":
9380-
correl = libalgos.nancorr_kendall(mat, minp=min_periods)
9381-
elif callable(method):
9380+
elif method == "kendall" or callable(method):
93829381
if min_periods is None:
93839382
min_periods = 1
93849383
mat = mat.T

pandas/tests/frame/methods/test_cov_corr.py

+1
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class TestDataFrameCorr:
9292
def test_corr_scipy_method(self, float_frame, method):
9393
float_frame["A"][:5] = np.nan
9494
float_frame["B"][5:10] = np.nan
95+
float_frame["A"][:10] = float_frame["A"][10:20]
9596

9697
correls = float_frame.corr(method=method)
9798
expected = float_frame["A"].corr(float_frame["C"], method=method)

0 commit comments

Comments
 (0)