Skip to content

Commit 50f384b

Browse files
zraitfeefladder
authored and committed
Revert Cythonized Kendall implementation and improve test case to prevent regressions (pandas-dev#43403)
1 parent 4867cf9 commit 50f384b

File tree

4 files changed

+5
-95
lines changed

4 files changed

+5
-95
lines changed

doc/source/whatsnew/v1.3.3.rst

+1
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ Fixed regressions
2323
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
2424
- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
2525
- Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`)
26+
- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`)
2627

2728
.. ---------------------------------------------------------------------------
2829

pandas/_libs/algos.pyx

-91
Original file line number | Diff line number | Diff line change
@@ -518,97 +518,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
518518
return result
519519

520520

521-
# ----------------------------------------------------------------------
522-
# Kendall correlation
523-
# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
524-
525-
@cython.boundscheck(False)
526-
@cython.wraparound(False)
527-
def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
528-
"""
529-
Perform kendall correlation on a 2d array
530-
531-
Parameters
532-
----------
533-
mat : np.ndarray[float64_t, ndim=2]
534-
Array to compute kendall correlation on
535-
minp : int, default 1
536-
Minimum number of observations required per pair of columns
537-
to have a valid result.
538-
539-
Returns
540-
-------
541-
numpy.ndarray[float64_t, ndim=2]
542-
Correlation matrix
543-
"""
544-
cdef:
545-
Py_ssize_t i, j, k, xi, yi, N, K
546-
ndarray[float64_t, ndim=2] result
547-
ndarray[float64_t, ndim=2] ranked_mat
548-
ndarray[uint8_t, ndim=2] mask
549-
float64_t currj
550-
ndarray[uint8_t, ndim=1] valid
551-
ndarray[int64_t] sorted_idxs
552-
ndarray[float64_t, ndim=1] col
553-
int64_t n_concordant
554-
int64_t total_concordant = 0
555-
int64_t total_discordant = 0
556-
float64_t kendall_tau
557-
int64_t n_obs
558-
559-
N, K = (<object>mat).shape
560-
561-
result = np.empty((K, K), dtype=np.float64)
562-
mask = np.isfinite(mat)
563-
564-
ranked_mat = np.empty((N, K), dtype=np.float64)
565-
566-
for i in range(K):
567-
ranked_mat[:, i] = rank_1d(mat[:, i])
568-
569-
for xi in range(K):
570-
sorted_idxs = ranked_mat[:, xi].argsort()
571-
ranked_mat = ranked_mat[sorted_idxs]
572-
mask = mask[sorted_idxs]
573-
for yi in range(xi + 1, K):
574-
valid = mask[:, xi] & mask[:, yi]
575-
if valid.sum() < minp:
576-
result[xi, yi] = NaN
577-
result[yi, xi] = NaN
578-
else:
579-
# Get columns and order second column using 1st column ranks
580-
if not valid.all():
581-
col = ranked_mat[valid.nonzero()][:, yi]
582-
else:
583-
col = ranked_mat[:, yi]
584-
n_obs = col.shape[0]
585-
total_concordant = 0
586-
total_discordant = 0
587-
for j in range(n_obs - 1):
588-
currj = col[j]
589-
# Count num concordant and discordant pairs
590-
n_concordant = 0
591-
for k in range(j, n_obs):
592-
if col[k] > currj:
593-
n_concordant += 1
594-
total_concordant += n_concordant
595-
total_discordant += (n_obs - 1 - j - n_concordant)
596-
# Note: we do total_concordant+total_discordant here which is
597-
# equivalent to the C(n, 2), the total # of pairs,
598-
# listed on wikipedia
599-
kendall_tau = (total_concordant - total_discordant) / \
600-
(total_concordant + total_discordant)
601-
result[xi, yi] = kendall_tau
602-
result[yi, xi] = kendall_tau
603-
604-
if mask[:, xi].sum() > minp:
605-
result[xi, xi] = 1
606-
else:
607-
result[xi, xi] = NaN
608-
609-
return result
610-
611-
612521
# ----------------------------------------------------------------------
613522

614523
ctypedef fused algos_t:

pandas/core/frame.py

+3 -4
Original file line number | Diff line number | Diff line change
@@ -9390,7 +9390,8 @@ def corr(
93909390
regardless of the callable's behavior.
93919391
min_periods : int, optional
93929392
Minimum number of observations required per pair of columns
9393-
to have a valid result.
9393+
to have a valid result. Currently only available for Pearson
9394+
and Spearman correlation.
93949395
93959396
Returns
93969397
-------
@@ -9424,9 +9425,7 @@ def corr(
94249425
correl = libalgos.nancorr(mat, minp=min_periods)
94259426
elif method == "spearman":
94269427
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
9427-
elif method == "kendall":
9428-
correl = libalgos.nancorr_kendall(mat, minp=min_periods)
9429-
elif callable(method):
9428+
elif method == "kendall" or callable(method):
94309429
if min_periods is None:
94319430
min_periods = 1
94329431
mat = mat.T

pandas/tests/frame/methods/test_cov_corr.py

+1
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@ class TestDataFrameCorr:
9292
def test_corr_scipy_method(self, float_frame, method):
9393
float_frame["A"][:5] = np.nan
9494
float_frame["B"][5:10] = np.nan
95+
float_frame["A"][:10] = float_frame["A"][10:20]
9596

9697
correls = float_frame.corr(method=method)
9798
expected = float_frame["A"].corr(float_frame["C"], method=method)

0 commit comments

Comments
 (0)