Commit abac90d

ENH: handle Series input in DataFrame.corrwith, close pandas-dev#1367
1 parent f1fd50b commit abac90d

File tree

6 files changed: +329, -267 lines


pandas/core/frame.py

+3
@@ -3876,6 +3876,9 @@ def corrwith(self, other, axis=0, drop=False):
         -------
         correls : Series
         """
+        if isinstance(other, Series):
+            return self.apply(other.corr, axis=axis)
+
         this = self._get_numeric_data()
         other = other._get_numeric_data()

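The hunk above is the whole enhancement: when `other` is a Series, each column of the frame is correlated with that Series via a column-wise apply of `Series.corr`. A minimal sketch of the resulting behaviour (the example frame and column names are illustrative, not from the commit):

```python
import numpy as np
import pandas as pd

# illustrative data, not part of the commit
df = pd.DataFrame(np.random.randn(100, 3), columns=['A', 'B', 'C'])

# each column of df correlated with the Series df['A']
result = df.corrwith(df['A'])

# the patch implements this as a column-wise apply of Series.corr
expected = df.apply(df['A'].corr)

assert np.allclose(result, expected)
```
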
pandas/stats/misc.py

+3 -266
@@ -5,12 +5,14 @@
 from pandas.core.series import remove_na
 from pandas.compat.scipy import scoreatpercentile

+from pandas.tools.tile import (bucket, bucketcat, bucketpanel,
+                               quantileTS, makeQuantiles)

-__all__ = ['bucket', 'bucketpanel']

 def zscore(series):
     return (series - series.mean()) / np.std(series, ddof = 0)

+
 def correl_ts(frame1, frame2):
     """
     Pairwise correlation of columns of two DataFrame objects
@@ -41,271 +43,6 @@ def correl_ts(frame1, frame2):
 def correl_xs(frame1, frame2):
     return correl_ts(frame1.T, frame2.T)

-#-------------------------------------------------------------------------------
-# Quantilization functions
-
-def bucket(series, k, by=None):
-    """
-    Produce DataFrame representing quantiles of a Series
-
-    Parameters
-    ----------
-    series : Series
-    k : int
-        number of quantiles
-    by : Series or same-length array
-        bucket by value
-
-    Returns
-    -------
-    DataFrame
-    """
-    if by is None:
-        by = series
-    else:
-        by = by.reindex(series.index)
-
-    split = _split_quantile(by, k)
-    mat = np.empty((len(series), k), dtype=float) * np.NaN
-
-    for i, v in enumerate(split):
-        mat[:, i][v] = series.take(v)
-
-    return DataFrame(mat, index=series.index, columns=np.arange(k) + 1)
-
-def _split_quantile(arr, k):
-    arr = np.asarray(arr)
-    mask = np.isfinite(arr)
-    order = arr[mask].argsort()
-    n = len(arr)
-
-    return np.array_split(np.arange(n)[mask].take(order), k)
-
-def bucketcat(series, cats):
-    """
-    Produce DataFrame representing quantiles of a Series
-
-    Parameters
-    ----------
-    series : Series
-    cat : Series or same-length array
-        bucket by category; mutually exxlusive with 'by'
-
-    Returns
-    -------
-    DataFrame
-    """
-    if not isinstance(series, Series):
-        series = Series(series, index=np.arange(len(series)))
-
-    cats = np.asarray(cats)
-
-    unique_labels = np.unique(cats)
-    unique_labels = unique_labels[notnull(unique_labels)]
-
-    # group by
-    data = {}
-
-    for i, label in enumerate(unique_labels):
-        data[label] = series[cats == label]
-
-    return DataFrame(data, columns=unique_labels)
-
-def bucketpanel(series, bins=None, by=None, cat=None):
-    """
-    Bucket data by two Series to create summary panel
-
-    Parameters
-    ----------
-    series : Series
-    bins : tuple (length-2)
-        e.g. (2, 2)
-    by : tuple of Series
-        bucket by value
-    cat : tuple of Series
-        bucket by category; mutually exxlusive with 'by'
-
-    Returns
-    -------
-    DataFrame
-    """
-    use_by = by is not None
-    use_cat = cat is not None
-
-    if use_by and use_cat:
-        raise Exception('must specify by or cat, but not both')
-    elif use_by:
-        if len(by) != 2:
-            raise Exception('must provide two bucketing series')
-
-        xby, yby = by
-        xbins, ybins = bins
-
-        return _bucketpanel_by(series, xby, yby, xbins, ybins)
-
-    elif use_cat:
-        xcat, ycat = cat
-        return _bucketpanel_cat(series, xcat, ycat)
-    else:
-        raise Exception('must specify either values or categories to bucket by')
-
-def _bucketpanel_by(series, xby, yby, xbins, ybins):
-    xby = xby.reindex(series.index)
-    yby = yby.reindex(series.index)
-
-    n = len(series)
-    # indices = np.arange(n)
-
-    xlabels = _bucket_labels(xby.reindex(series.index), xbins)
-    ylabels = _bucket_labels(yby.reindex(series.index), ybins)
-
-    labels = _uniquify(xlabels, ylabels, xbins, ybins)
-
-    mask = isnull(labels)
-    labels[mask] = -1
-
-    unique_labels = np.unique(labels)
-    bucketed = bucketcat(series, labels)
-
-    _ulist = list(labels)
-    index_map = dict((x, _ulist.index(x)) for x in unique_labels)
-
-    def relabel(key):
-        pos = index_map[key]
-
-        xlab = xlabels[pos]
-        ylab = ylabels[pos]
-
-        return '%sx%s' % (int(xlab) if notnull(xlab) else 'NULL',
-                          int(ylab) if notnull(ylab) else 'NULL')
-
-    return bucketed.rename(columns=relabel)
-
-def _bucketpanel_cat(series, xcat, ycat):
-    xlabels, xmapping = _intern(xcat)
-    ylabels, ymapping = _intern(ycat)
-
-    shift = 10 ** (np.ceil(np.log10(ylabels.max())))
-    labels = xlabels * shift + ylabels
-
-    sorter = labels.argsort()
-    sorted_labels = labels.take(sorter)
-    sorted_xlabels = xlabels.take(sorter)
-    sorted_ylabels = ylabels.take(sorter)
-
-    unique_labels = np.unique(labels)
-    unique_labels = unique_labels[notnull(unique_labels)]
-
-    locs = sorted_labels.searchsorted(unique_labels)
-    xkeys = sorted_xlabels.take(locs)
-    ykeys = sorted_ylabels.take(locs)
-
-    stringified = ['(%s, %s)' % arg
-                   for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]
-
-    result = bucketcat(series, labels)
-    result.columns = stringified
-
-    return result
-
-def _intern(values):
-    # assumed no NaN values
-    values = np.asarray(values)
-
-    uniqued = np.unique(values)
-    labels = uniqued.searchsorted(values)
-    return labels, uniqued
-
-def _intern_fast(values):
-    pass
-
-def _uniquify(xlabels, ylabels, xbins, ybins):
-    # encode the stuff, create unique label
-    shifter = 10 ** max(xbins, ybins)
-    _xpiece = xlabels * shifter
-    _ypiece = ylabels
-
-    return _xpiece + _ypiece
-
-def _cat_labels(labels):
-    # group by
-    data = {}
-
-    unique_labels = np.unique(labels)
-    unique_labels = unique_labels[notnull(unique_labels)]
-
-    for label in unique_labels:
-        mask = labels == label
-        data[stringified] = series[mask]
-
-    return DataFrame(data, index=series.index)
-
-def _bucket_labels(series, k):
-    arr = np.asarray(series)
-    mask = np.isfinite(arr)
-    order = arr[mask].argsort()
-    n = len(series)
-
-    split = np.array_split(np.arange(n)[mask].take(order), k)
-
-    bucketsize = n / k
-
-    mat = np.empty(n, dtype=float) * np.NaN
-    for i, v in enumerate(split):
-        mat[v] = i
-
-    return mat + 1
-
-def makeQuantiles(series, n):
-    """
-    Compute quantiles of input series.
-
-    Parameters
-    ----------
-    series: Series
-        Must have 'order' method and index
-    n: int
-        Number of quantile buckets
-
-    Returns
-    -------
-    (edges, quantiles)
-       edges: ith bucket --> (left edge, right edge)
-       quantiles: ith bucket --> set of values
-    """
-    series = remove_na(series).copy()
-    series = series.order()
-    quantiles = {}
-    edges = {}
-    T = float(len(series))
-    inc = T / n
-    for i in range(n):
-        theSlice = series[inc*i:(i+1)*inc]
-        quantiles[i+1] = theSlice
-        edges[i+1] = theSlice[0], theSlice[-1]
-    return edges, quantiles
-
-def quantileTS(frame, percentile):
-    """
-    Return score at percentile for each point in time (cross-section)
-
-    Parameters
-    ----------
-    frame: DataFrame
-    percentile: int
-        nth percentile
-
-    Returns
-    -------
-    Series (or TimeSeries)
-    """
-    def func(x):
-        x = np.asarray(x.valid())
-        if x.any():
-            return scoreatpercentile(x, percentile)
-        else:
-            return NaN
-    return frame.apply(func, axis=1)

 def percentileRank(frame, column=None, kind='mean'):
     """

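The large removal above is a relocation, not a deletion: the quantile-bucketing helpers move out of pandas/stats/misc.py into the new pandas.tools.tile module and are re-imported at the top of the file for backward compatibility. A minimal sketch of calling the relocated bucket helper as it existed at this point in pandas' history (the input Series is illustrative):

```python
import numpy as np
from pandas import Series
from pandas.tools.tile import bucket  # new location as of this commit

s = Series(np.random.randn(50))

# Split s into 5 quantile buckets; per the docstring above, the result is a
# DataFrame with one column per bucket (labelled 1..5), NaN outside each bucket.
buckets = bucket(s, 5)
```
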
pandas/tests/test_frame.py

+6
@@ -3500,6 +3500,12 @@ def test_corrwith_with_objects(self):
         expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1)
         assert_series_equal(result, expected)

+    def test_corrwith_series(self):
+        result = self.tsframe.corrwith(self.tsframe['A'])
+        expected = self.tsframe.apply(self.tsframe['A'].corr)
+
+        assert_series_equal(result, expected)
+
     def test_dropEmptyRows(self):
         N = len(self.frame.index)
         mat = randn(N)

pandas/tools/tests/test_tile.py

+17
@@ -0,0 +1,17 @@
+import nose
+import unittest
+
+import numpy as np
+
+from pandas import DataFrame, Series
+import pandas.util.testing as tm
+
+from pandas.tools.tile import cut
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
+
+

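The new test module above is only a nose scaffold at this point: it imports cut from pandas.tools.tile but defines no test cases yet. For context, cut bins values into discrete intervals. A minimal sketch using the modern pandas.cut entry point (assumed here to behave like the function imported above; the data is illustrative):

```python
import numpy as np
import pandas as pd

# bin 10 random values into 4 equal-width intervals
values = np.random.randn(10)
binned = pd.cut(values, 4)
print(binned)
```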