Skip to content

Commit d61a518

Browse files
lithomas1nofarmish
authored andcommitted
PERF: cythonize kendall correlation (pandas-dev#39132)
1 parent 75a485c commit d61a518

File tree

3 files changed

+99
-4
lines changed

3 files changed

+99
-4
lines changed

doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ Performance improvements
205205
- Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`)
206206
- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
207207
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
208-
-
208+
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
209209

210210
.. ---------------------------------------------------------------------------
211211

pandas/_libs/algos.pyx

+94
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,100 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
393393
return result
394394

395395

396+
# ----------------------------------------------------------------------
397+
# Kendall correlation
398+
# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient
399+
400+
@cython.boundscheck(False)
401+
@cython.wraparound(False)
402+
def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
403+
"""
404+
Perform kendall correlation on a 2d array
405+
406+
Parameters
407+
----------
408+
mat : np.ndarray[float64_t, ndim=2]
409+
Array to compute kendall correlation on
410+
minp : int, default 1
411+
Minimum number of observations required per pair of columns
412+
to have a valid result.
413+
414+
Returns
415+
-------
416+
numpy.ndarray[float64_t, ndim=2]
417+
Correlation matrix
418+
"""
419+
cdef:
420+
Py_ssize_t i, j, k, xi, yi, N, K
421+
ndarray[float64_t, ndim=2] result
422+
ndarray[float64_t, ndim=2] ranked_mat
423+
ndarray[uint8_t, ndim=2] mask
424+
float64_t currj
425+
ndarray[uint8_t, ndim=1] valid
426+
ndarray[int64_t] sorted_idxs
427+
ndarray[float64_t, ndim=1] col
428+
int64_t n_concordant
429+
int64_t total_concordant = 0
430+
int64_t total_discordant = 0
431+
float64_t kendall_tau
432+
int64_t n_obs
433+
const int64_t[:] labels_n
434+
435+
N, K = (<object>mat).shape
436+
437+
result = np.empty((K, K), dtype=np.float64)
438+
mask = np.isfinite(mat)
439+
440+
ranked_mat = np.empty((N, K), dtype=np.float64)
441+
# For compatibility when calling rank_1d
442+
labels_n = np.zeros(N, dtype=np.int64)
443+
444+
for i in range(K):
445+
ranked_mat[:, i] = rank_1d(mat[:, i], labels_n)
446+
447+
for xi in range(K):
448+
sorted_idxs = ranked_mat[:, xi].argsort()
449+
ranked_mat = ranked_mat[sorted_idxs]
450+
mask = mask[sorted_idxs]
451+
for yi in range(xi + 1, K):
452+
valid = mask[:, xi] & mask[:, yi]
453+
if valid.sum() < minp:
454+
result[xi, yi] = NaN
455+
result[yi, xi] = NaN
456+
else:
457+
# Get columns and order second column using 1st column ranks
458+
if not valid.all():
459+
col = ranked_mat[valid.nonzero()][:, yi]
460+
else:
461+
col = ranked_mat[:, yi]
462+
n_obs = col.shape[0]
463+
total_concordant = 0
464+
total_discordant = 0
465+
for j in range(n_obs - 1):
466+
currj = col[j]
467+
# Count num concordant and discordant pairs
468+
n_concordant = 0
469+
for k in range(j, n_obs):
470+
if col[k] > currj:
471+
n_concordant += 1
472+
total_concordant += n_concordant
473+
total_discordant += (n_obs - 1 - j - n_concordant)
474+
# Note: we do total_concordant+total_discordant here which is
475+
# equivalent to the C(n, 2), the total # of pairs,
476+
# listed on wikipedia
477+
kendall_tau = (total_concordant - total_discordant) / \
478+
(total_concordant + total_discordant)
479+
result[xi, yi] = kendall_tau
480+
result[yi, xi] = kendall_tau
481+
482+
if mask[:, xi].sum() > minp:
483+
result[xi, xi] = 1
484+
else:
485+
result[xi, xi] = NaN
486+
487+
return result
488+
489+
396490
# ----------------------------------------------------------------------
397491

398492
ctypedef fused algos_t:

pandas/core/frame.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -8463,8 +8463,7 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame:
84638463
84648464
min_periods : int, optional
84658465
Minimum number of observations required per pair of columns
8466-
to have a valid result. Currently only available for Pearson
8467-
and Spearman correlation.
8466+
to have a valid result.
84688467
84698468
Returns
84708469
-------
@@ -8498,7 +8497,9 @@ def corr(self, method="pearson", min_periods=1) -> DataFrame:
84988497
correl = libalgos.nancorr(mat, minp=min_periods)
84998498
elif method == "spearman":
85008499
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
8501-
elif method == "kendall" or callable(method):
8500+
elif method == "kendall":
8501+
correl = libalgos.nancorr_kendall(mat, minp=min_periods)
8502+
elif callable(method):
85028503
if min_periods is None:
85038504
min_periods = 1
85048505
mat = mat.T

0 commit comments

Comments
 (0)