diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 76ae85a53102b..8dbf2e86e972e 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -295,6 +295,8 @@ Enhancements - DatetimeIndexes no longer try to convert mixed-integer indexes during join operations (GH3877_) + - DataFrame corr method (spearman) is now cythonized. + Bug Fixes ~~~~~~~~~ diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 836101ecafa2d..08ec707b0d96d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -997,6 +997,69 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None): return result +#---------------------------------------------------------------------- +# Pairwise Spearman correlation + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): + cdef: + Py_ssize_t i, j, xi, yi, N, K + ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=1] maskedx + ndarray[float64_t, ndim=1] maskedy + ndarray[uint8_t, ndim=2] mask + int64_t nobs = 0 + float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor + + N, K = (<object> mat).shape + + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + for xi in range(K): + for yi in range(xi + 1): + nobs = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + nobs += 1 + + if nobs < minp: + result[xi, yi] = result[yi, xi] = np.NaN + else: + maskedx = np.empty(nobs, dtype=np.float64) + maskedy = np.empty(nobs, dtype=np.float64) + j = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + maskedx[j] = mat[i, xi] + maskedy[j] = mat[i, yi] + j += 1 + maskedx = rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) + + mean = (nobs + 1) / 2. 
+ + # now the cov numerator + sumx = sumxx = sumyy = 0 + + for i in range(nobs): + vx = maskedx[i] - mean + vy = maskedy[i] - mean + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = np.NaN + + return result + #---------------------------------------------------------------------- # Rolling variance diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5e3d3e95d8e56..f0145364363ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1528,7 +1528,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin- from pandas.io.stata import StataWriter writer = StataWriter(fname,self,convert_dates=convert_dates, encoding=encoding, byteorder=byteorder) writer.write_file() - + def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs): """ Write records stored in a DataFrame to a SQL database. @@ -4711,7 +4711,7 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, #---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=None): + def corr(self, method='pearson', min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values @@ -4724,7 +4724,7 @@ def corr(self, method='pearson', min_periods=None): min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. 
Currently only available for pearson - correlation + and spearman correlation Returns ------- @@ -4737,6 +4737,9 @@ def corr(self, method='pearson', min_periods=None): if method == 'pearson': correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) + elif method == 'spearman': + correl = _algos.nancorr_spearman(com._ensure_float64(mat), + minp=min_periods) else: if min_periods is None: min_periods = 1 diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py index 86e879d0be523..f01a867ea2893 100644 --- a/vb_suite/stat_ops.py +++ b/vb_suite/stat_ops.py @@ -82,3 +82,12 @@ stats_rolling_mean = Benchmark('rolling_mean(arr, 100)', setup, start_date=datetime(2011, 6, 1)) + +# spearman correlation + +setup = common_setup + """ +df = DataFrame(np.random.randn(1000, 300)) +""" + +stats_corr_spearman = Benchmark("df.corr(method='spearman')", setup, + start_date=datetime(2011, 12, 4))