Skip to content

ENH: DataFrame.corr(method='spearman') is cythonized. #3823

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 17, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@ Enhancements
- DatetimeIndexes no longer try to convert mixed-integer indexes during join
operations (GH3877_)

- ``DataFrame.corr(method='spearman')`` is now cythonized and honors ``min_periods``.


Bug Fixes
~~~~~~~~~
Expand Down
63 changes: 63 additions & 0 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,69 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):

return result

#----------------------------------------------------------------------
# Pairwise Spearman correlation

@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1):
    """
    Compute the pairwise Spearman rank correlation between columns of ``mat``.

    For every pair of columns only the rows in which both values are finite
    are used; NaN and +/-inf entries are treated as missing.

    Parameters
    ----------
    mat : ndarray[float64, ndim=2]
        Observations in rows, variables in columns.
    minp : Py_ssize_t, default 1
        Minimum number of jointly finite rows a pair of columns needs in
        order to get a value; pairs with fewer rows are set to NaN.

    Returns
    -------
    result : ndarray[float64, ndim=2]
        Symmetric (K, K) matrix, K being the number of columns of ``mat``.
        An entry is NaN when the pair has fewer than ``minp`` usable rows or
        when either ranked column has zero variance.
    """
    cdef:
        Py_ssize_t i, j, xi, yi, N, K
        ndarray[float64_t, ndim=2] result
        ndarray[float64_t, ndim=1] maskedx
        ndarray[float64_t, ndim=1] maskedy
        ndarray[uint8_t, ndim=2] mask
        int64_t nobs = 0
        float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor, nan_value

    N, K = (<object> mat).shape

    # Use ``np.nan``: the ``np.NaN`` alias was removed in NumPy 2.0.  Fetching
    # it once also avoids a Python attribute lookup for every column pair.
    nan_value = np.nan

    result = np.empty((K, K), dtype=np.float64)
    # 1 where the value is finite, 0 for NaN / inf
    mask = np.isfinite(mat).view(np.uint8)

    for xi in range(K):
        # only the lower triangle is computed; the result is mirrored
        for yi in range(xi + 1):
            # first pass: count rows usable for this pair so that the
            # scratch buffers can be allocated with the exact size
            nobs = 0
            for i in range(N):
                if mask[i, xi] and mask[i, yi]:
                    nobs += 1

            if nobs < minp:
                result[xi, yi] = result[yi, xi] = nan_value
            else:
                maskedx = np.empty(nobs, dtype=np.float64)
                maskedy = np.empty(nobs, dtype=np.float64)

                # second pass: gather the jointly finite observations
                j = 0
                for i in range(N):
                    if mask[i, xi] and mask[i, yi]:
                        maskedx[j] = mat[i, xi]
                        maskedy[j] = mat[i, yi]
                        j += 1

                # Ranking has to happen per pair because the set of usable
                # rows depends on both columns.
                maskedx = rank_1d_float64(maskedx)
                maskedy = rank_1d_float64(maskedy)

                # NOTE(review): assumes rank_1d_float64 yields ranks whose
                # sum is nobs * (nobs + 1) / 2 (e.g. 1..nobs with ties
                # averaged), so both rank vectors share this mean -- confirm
                # against the helper's definition.
                mean = (nobs + 1) / 2.

                # now the cov numerator
                sumx = sumxx = sumyy = 0

                for i in range(nobs):
                    vx = maskedx[i] - mean
                    vy = maskedy[i] - mean

                    sumx += vx * vy
                    sumxx += vx * vx
                    sumyy += vy * vy

                divisor = sqrt(sumxx * sumyy)

                # a zero divisor means at least one rank vector is constant
                # (or there are no observations): correlation is undefined
                if divisor != 0:
                    result[xi, yi] = result[yi, xi] = sumx / divisor
                else:
                    result[xi, yi] = result[yi, xi] = nan_value

    return result

#----------------------------------------------------------------------
# Rolling variance

Expand Down
9 changes: 6 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1528,7 +1528,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-
from pandas.io.stata import StataWriter
writer = StataWriter(fname,self,convert_dates=convert_dates, encoding=encoding, byteorder=byteorder)
writer.write_file()

def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs):
"""
Write records stored in a DataFrame to a SQL database.
Expand Down Expand Up @@ -4711,7 +4711,7 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
#----------------------------------------------------------------------
# Statistical methods, etc.

def corr(self, method='pearson', min_periods=None):
def corr(self, method='pearson', min_periods=1):
"""
Compute pairwise correlation of columns, excluding NA/null values

Expand All @@ -4724,7 +4724,7 @@ def corr(self, method='pearson', min_periods=None):
min_periods : int, optional
Minimum number of observations required per pair of columns
to have a valid result. Currently only available for pearson
correlation
and spearman correlation

Returns
-------
Expand All @@ -4737,6 +4737,9 @@ def corr(self, method='pearson', min_periods=None):
if method == 'pearson':
correl = _algos.nancorr(com._ensure_float64(mat),
minp=min_periods)
elif method == 'spearman':
correl = _algos.nancorr_spearman(com._ensure_float64(mat),
minp=min_periods)
else:
if min_periods is None:
min_periods = 1
Expand Down
9 changes: 9 additions & 0 deletions vb_suite/stat_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,12 @@

# Benchmark for the statement ``rolling_mean(arr, 100)``; uses the ``setup``
# string defined earlier in this file.
# NOTE(review): start_date is presumably the earliest commit date the
# benchmark is run against -- confirm against the vbench documentation.
stats_rolling_mean = Benchmark('rolling_mean(arr, 100)', setup,
start_date=datetime(2011, 6, 1))

# spearman correlation
# Times DataFrame.corr(method='spearman') on a 1000 x 300 frame filled with
# np.random.randn values.

# Rebinds ``setup``: common_setup plus construction of the frame under test.
setup = common_setup + """
df = DataFrame(np.random.randn(1000, 300))
"""

stats_corr_spearman = Benchmark("df.corr(method='spearman')", setup,
start_date=datetime(2011, 12, 4))