Skip to content

Commit f1ff315

Browse files
committed
Merge branch 'correlations' of https://github.com/jniznan/pandas into jniznan-correlations
2 parents fe03b01 + 0a42ae3 commit f1ff315

File tree

4 files changed

+80
-3
lines changed

4 files changed

+80
-3
lines changed

doc/source/v0.11.1.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@ Enhancements
295295
- DatetimeIndexes no longer try to convert mixed-integer indexes during join
296296
operations (GH3877_)
297297

298+
- ``DataFrame.corr`` with ``method='spearman'`` is now cythonized and supports ``min_periods``.
299+
298300

299301
Bug Fixes
300302
~~~~~~~~~

pandas/algos.pyx

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,69 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
997997

998998
return result
999999

1000+
#----------------------------------------------------------------------
1001+
# Pairwise Spearman correlation
1002+
1003+
@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_spearman(ndarray[float64_t, ndim=2] mat, minp=1):
    """
    Compute the pairwise Spearman correlation between the columns of ``mat``.

    For every pair of columns only the rows in which *both* entries are
    finite (``np.isfinite``) are kept; the kept values of each column are
    ranked with ``rank_1d_float64`` and the correlation of the two rank
    vectors is stored symmetrically in the result.

    Parameters
    ----------
    mat : ndarray[float64_t, ndim=2]
        Input of shape (N, K); each column is one variable.
    minp : int or None, default 1
        Minimum number of jointly finite observations a pair of columns
        needs in order to get a value. ``None`` is treated as 1 so that
        callers may pass ``min_periods=None`` straight through, as they
        can with ``nancorr``.

    Returns
    -------
    result : ndarray[float64_t, ndim=2]
        Symmetric (K, K) matrix. An entry is NaN when the pair has fewer
        than ``minp`` joint observations or when either rank vector has
        zero variance (the divisor is 0).
    """
    cdef:
        Py_ssize_t i, j, xi, yi, N, K, minpv
        ndarray[float64_t, ndim=2] result
        ndarray[float64_t, ndim=1] maskedx
        ndarray[float64_t, ndim=1] maskedy
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor, nan_value

    N, K = (<object> mat).shape

    # Accept None like nancorr's signature does; coerce anything else to a
    # C integer once, up front.
    if minp is None:
        minpv = 1
    else:
        minpv = minp

    # Read NaN into a C double once: avoids a Python attribute lookup per
    # column pair and avoids the ``np.NaN`` alias removed in NumPy 2.0.
    nan_value = np.nan

    result = np.empty((K, K), dtype=np.float64)
    # 1 where the value is finite, 0 otherwise (bool buffer viewed as uint8).
    mask = np.isfinite(mat).view(np.uint8)

    for xi in range(K):
        # Only the lower triangle (including the diagonal) is computed; the
        # upper triangle is filled by symmetry.
        for yi in range(xi + 1):
            # First pass: count rows where both columns are finite so the
            # scratch buffers can be allocated at exactly the right size.
            nobs = 0
            for i in range(N):
                if mask[i, xi] and mask[i, yi]:
                    nobs += 1

            if nobs < minpv:
                result[xi, yi] = result[yi, xi] = nan_value
            else:
                # Second pass: gather the jointly finite values.
                maskedx = np.empty(nobs, dtype=np.float64)
                maskedy = np.empty(nobs, dtype=np.float64)
                j = 0
                for i in range(N):
                    if mask[i, xi] and mask[i, yi]:
                        maskedx[j] = mat[i, xi]
                        maskedy[j] = mat[i, yi]
                        j += 1
                maskedx = rank_1d_float64(maskedx)
                maskedy = rank_1d_float64(maskedy)

                # NOTE(review): (nobs + 1) / 2 is the mean of the ranks
                # assuming rank_1d_float64 returns 1-based (average) ranks
                # -- confirm against its definition.
                mean = (nobs + 1) / 2.

                # now the cov numerator
                sumx = sumxx = sumyy = 0

                for i in range(nobs):
                    vx = maskedx[i] - mean
                    vy = maskedy[i] - mean

                    sumx += vx * vy
                    sumxx += vx * vx
                    sumyy += vy * vy

                divisor = sqrt(sumxx * sumyy)

                if divisor != 0:
                    result[xi, yi] = result[yi, xi] = sumx / divisor
                else:
                    # Zero variance in at least one rank vector: the
                    # correlation is undefined.
                    result[xi, yi] = result[yi, xi] = nan_value

    return result
1062+
10001063
#----------------------------------------------------------------------
10011064
# Rolling variance
10021065

pandas/core/frame.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,7 +1528,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-
15281528
from pandas.io.stata import StataWriter
15291529
writer = StataWriter(fname,self,convert_dates=convert_dates, encoding=encoding, byteorder=byteorder)
15301530
writer.write_file()
1531-
1531+
15321532
def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs):
15331533
"""
15341534
Write records stored in a DataFrame to a SQL database.
@@ -4711,7 +4711,7 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
47114711
#----------------------------------------------------------------------
47124712
# Statistical methods, etc.
47134713

4714-
def corr(self, method='pearson', min_periods=None):
4714+
def corr(self, method='pearson', min_periods=1):
47154715
"""
47164716
Compute pairwise correlation of columns, excluding NA/null values
47174717
@@ -4724,7 +4724,7 @@ def corr(self, method='pearson', min_periods=None):
47244724
min_periods : int, optional
47254725
Minimum number of observations required per pair of columns
47264726
to have a valid result. Currently only available for pearson
4727-
correlation
4727+
and spearman correlation
47284728
47294729
Returns
47304730
-------
@@ -4737,6 +4737,9 @@ def corr(self, method='pearson', min_periods=None):
47374737
if method == 'pearson':
47384738
correl = _algos.nancorr(com._ensure_float64(mat),
47394739
minp=min_periods)
4740+
elif method == 'spearman':
4741+
correl = _algos.nancorr_spearman(com._ensure_float64(mat),
4742+
minp=min_periods)
47404743
else:
47414744
if min_periods is None:
47424745
min_periods = 1

vb_suite/stat_ops.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,12 @@
8282

8383
stats_rolling_mean = Benchmark('rolling_mean(arr, 100)', setup,
8484
start_date=datetime(2011, 6, 1))
85+
86+
# spearman correlation
# Benchmarks DataFrame.corr(method='spearman') on a 1000 x 300 frame filled
# with np.random.randn values; the frame is built in the setup string so its
# construction is not part of the timed statement.

setup = common_setup + """
df = DataFrame(np.random.randn(1000, 300))
"""

stats_corr_spearman = Benchmark("df.corr(method='spearman')", setup,
                                start_date=datetime(2011, 12, 4))

0 commit comments

Comments
 (0)