|
5 | 5 | from pandas.core.series import remove_na
|
6 | 6 | from pandas.compat.scipy import scoreatpercentile
|
7 | 7 |
|
| 8 | +from pandas.tools.tile import (bucket, bucketcat, bucketpanel, |
| 9 | + quantileTS, makeQuantiles) |
8 | 10 |
|
9 |
| -__all__ = ['bucket', 'bucketpanel'] |
10 | 11 |
|
def zscore(series):
    """Standardize *series* to zero mean and unit (population) variance.

    Uses np.std (ddof=0), so NaNs propagate rather than being skipped.
    """
    centered = series - series.mean()
    return centered / np.std(series, ddof=0)
|
13 | 14 |
|
| 15 | + |
14 | 16 | def correl_ts(frame1, frame2):
|
15 | 17 | """
|
16 | 18 | Pairwise correlation of columns of two DataFrame objects
|
@@ -41,271 +43,6 @@ def correl_ts(frame1, frame2):
|
def correl_xs(frame1, frame2):
    """Pairwise correlation of the rows (cross-sections) of two DataFrames.

    Delegates to correl_ts on the transposed frames.
    """
    transposed1 = frame1.T
    transposed2 = frame2.T
    return correl_ts(transposed1, transposed2)
|
43 | 45 |
|
44 |
| -#------------------------------------------------------------------------------- |
45 |
| -# Quantilization functions |
46 |
| - |
47 |
def bucket(series, k, by=None):
    """
    Produce DataFrame representing quantiles of a Series

    Parameters
    ----------
    series : Series
    k : int
        number of quantiles
    by : Series or same-length array
        bucket by value

    Returns
    -------
    DataFrame
        One column per quantile (labeled 1..k); cells outside a
        quantile are NaN.
    """
    if by is None:
        by = series
    else:
        # Align the bucketing values with the series being bucketed.
        by = by.reindex(series.index)

    split = _split_quantile(by, k)
    # np.nan (np.NaN was removed in NumPy 2.0); np.full replaces the
    # np.empty(...) * NaN trick.
    mat = np.full((len(series), k), np.nan)

    for i, v in enumerate(split):
        mat[:, i][v] = series.take(v)

    return DataFrame(mat, index=series.index, columns=np.arange(k) + 1)
75 |
| - |
76 |
| -def _split_quantile(arr, k): |
77 |
| - arr = np.asarray(arr) |
78 |
| - mask = np.isfinite(arr) |
79 |
| - order = arr[mask].argsort() |
80 |
| - n = len(arr) |
81 |
| - |
82 |
| - return np.array_split(np.arange(n)[mask].take(order), k) |
83 |
| - |
84 |
def bucketcat(series, cats):
    """
    Produce DataFrame grouping the values of a Series by category

    Parameters
    ----------
    series : Series
    cats : Series or same-length array
        category label for each entry of `series`

    Returns
    -------
    DataFrame
        One column per (non-null) category; cells whose entry belongs
        to a different category are NaN.
    """
    if not isinstance(series, Series):
        series = Series(series, index=np.arange(len(series)))

    cats = np.asarray(cats)

    unique_labels = np.unique(cats)
    # Drop null category labels entirely.
    unique_labels = unique_labels[notnull(unique_labels)]

    # group by category label
    data = {label: series[cats == label] for label in unique_labels}

    return DataFrame(data, columns=unique_labels)
113 |
| - |
114 |
def bucketpanel(series, bins=None, by=None, cat=None):
    """
    Bucket data by two Series to create summary panel

    Parameters
    ----------
    series : Series
    bins : tuple (length-2)
        e.g. (2, 2); number of quantile buckets per axis (used with 'by')
    by : tuple of Series
        bucket by value
    cat : tuple of Series
        bucket by category; mutually exclusive with 'by'

    Returns
    -------
    DataFrame

    Raises
    ------
    Exception
        If both or neither of 'by' and 'cat' are given, or 'by' does
        not contain exactly two series.
    """
    use_by = by is not None
    use_cat = cat is not None

    if use_by and use_cat:
        raise Exception('must specify by or cat, but not both')
    elif use_by:
        if len(by) != 2:
            raise Exception('must provide two bucketing series')

        xby, yby = by
        xbins, ybins = bins

        return _bucketpanel_by(series, xby, yby, xbins, ybins)

    elif use_cat:
        xcat, ycat = cat
        return _bucketpanel_cat(series, xcat, ycat)
    else:
        raise Exception('must specify either values or categories to bucket by')
151 |
| - |
152 |
def _bucketpanel_by(series, xby, yby, xbins, ybins):
    """Bucket `series` by quantiles of two value series.

    Columns of the result are labeled '<xquantile>x<yquantile>'
    (quantile numbers start at 1; missing buckets show as 'NULL').
    """
    # Align both bucketing series once; the originals passed the
    # already-reindexed series through a second redundant reindex.
    xby = xby.reindex(series.index)
    yby = yby.reindex(series.index)

    xlabels = _bucket_labels(xby, xbins)
    ylabels = _bucket_labels(yby, ybins)

    # Encode each (x, y) quantile pair as a single numeric label.
    labels = _uniquify(xlabels, ylabels, xbins, ybins)

    # Entries that fell in no bucket get the sentinel -1.
    mask = isnull(labels)
    labels[mask] = -1

    unique_labels = np.unique(labels)
    bucketed = bucketcat(series, labels)

    # Map each combined label back to one representative position so
    # the original (x, y) quantile pair can be recovered for naming.
    _ulist = list(labels)
    index_map = dict((x, _ulist.index(x)) for x in unique_labels)

    def relabel(key):
        pos = index_map[key]

        xlab = xlabels[pos]
        ylab = ylabels[pos]

        return '%sx%s' % (int(xlab) if notnull(xlab) else 'NULL',
                          int(ylab) if notnull(ylab) else 'NULL')

    return bucketed.rename(columns=relabel)
183 |
| - |
184 |
def _bucketpanel_cat(series, xcat, ycat):
    # Bucket `series` by the cross product of two categorical series;
    # result columns are labeled '(xcategory, ycategory)'.
    xlabels, xmapping = _intern(xcat)
    ylabels, ymapping = _intern(ycat)

    # Encode each (x, y) label pair as one number by shifting the x
    # label past the largest y label in decimal.
    shift = 10 ** (np.ceil(np.log10(ylabels.max())))
    labels = xlabels * shift + ylabels

    sorter = labels.argsort()
    sorted_labels = labels.take(sorter)
    sorted_xlabels = xlabels.take(sorter)
    sorted_ylabels = ylabels.take(sorter)

    unique_labels = np.unique(labels)
    unique_labels = unique_labels[notnull(unique_labels)]

    # Find one representative position per combined label so the
    # original (x, y) pair can be recovered for column naming.
    locs = sorted_labels.searchsorted(unique_labels)
    xkeys = sorted_xlabels.take(locs)
    ykeys = sorted_ylabels.take(locs)

    stringified = ['(%s, %s)' % arg
                   for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]

    result = bucketcat(series, labels)
    result.columns = stringified

    return result
210 |
| - |
211 |
| -def _intern(values): |
212 |
| - # assumed no NaN values |
213 |
| - values = np.asarray(values) |
214 |
| - |
215 |
| - uniqued = np.unique(values) |
216 |
| - labels = uniqued.searchsorted(values) |
217 |
| - return labels, uniqued |
218 |
| - |
219 |
def _intern_fast(values):
    # Unimplemented stub — presumably intended as an optimized
    # replacement for _intern, but never written and not referenced by
    # the visible code. Always returns None.
    pass
221 |
| - |
222 |
| -def _uniquify(xlabels, ylabels, xbins, ybins): |
223 |
| - # encode the stuff, create unique label |
224 |
| - shifter = 10 ** max(xbins, ybins) |
225 |
| - _xpiece = xlabels * shifter |
226 |
| - _ypiece = ylabels |
227 |
| - |
228 |
| - return _xpiece + _ypiece |
229 |
| - |
230 |
def _cat_labels(labels):
    # NOTE(review): this function appears to be dead/broken code —
    # `stringified` and `series` are not defined in this scope (the only
    # parameter is `labels`), so any call would raise NameError. It is
    # not referenced by the visible code; confirm intent before fixing
    # or removing.
    # group by
    data = {}

    unique_labels = np.unique(labels)
    unique_labels = unique_labels[notnull(unique_labels)]

    for label in unique_labels:
        mask = labels == label
        data[stringified] = series[mask]

    return DataFrame(data, index=series.index)
242 |
| - |
243 |
| -def _bucket_labels(series, k): |
244 |
| - arr = np.asarray(series) |
245 |
| - mask = np.isfinite(arr) |
246 |
| - order = arr[mask].argsort() |
247 |
| - n = len(series) |
248 |
| - |
249 |
| - split = np.array_split(np.arange(n)[mask].take(order), k) |
250 |
| - |
251 |
| - bucketsize = n / k |
252 |
| - |
253 |
| - mat = np.empty(n, dtype=float) * np.NaN |
254 |
| - for i, v in enumerate(split): |
255 |
| - mat[v] = i |
256 |
| - |
257 |
| - return mat + 1 |
258 |
| - |
259 |
def makeQuantiles(series, n):
    """
    Compute quantiles of input series.

    Parameters
    ----------
    series: Series
        Must have 'order' method and index
    n: int
        Number of quantile buckets

    Returns
    -------
    (edges, quantiles)
    edges: ith bucket --> (left edge, right edge)
    quantiles: ith bucket --> set of values
    """
    # Drop missing values, then sort ascending so contiguous slices are
    # quantile buckets. NOTE(review): Series.order() was removed from
    # pandas long ago (sort_values is the modern equivalent) — this code
    # predates that change.
    series = remove_na(series).copy()
    series = series.order()
    quantiles = {}
    edges = {}
    T = float(len(series))
    inc = T / n
    for i in range(n):
        # NOTE(review): `inc` is a float, so this positional slice relies
        # on legacy behavior accepting float slice bounds — confirm
        # against the pandas version this targets.
        theSlice = series[inc*i:(i+1)*inc]
        quantiles[i+1] = theSlice
        # (first value, last value) of the sorted bucket.
        edges[i+1] = theSlice[0], theSlice[-1]
    return edges, quantiles
287 |
| - |
288 |
def quantileTS(frame, percentile):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    percentile: int
        nth percentile

    Returns
    -------
    Series (or TimeSeries)
    """
    def func(x):
        # Drop missing values from the row (cross-section) before
        # scoring. NOTE(review): Series.valid() is a legacy API
        # (dropna() is the modern equivalent).
        x = np.asarray(x.valid())
        # NOTE(review): `x.any()` is falsy for an all-zero cross-section,
        # not just an empty one — confirm returning NaN there is intended.
        if x.any():
            return scoreatpercentile(x, percentile)
        else:
            # NaN is expected to be in scope at module level
            # (presumably from numpy) — not visible in this chunk.
            return NaN
    return frame.apply(func, axis=1)
309 | 46 |
|
310 | 47 | def percentileRank(frame, column=None, kind='mean'):
|
311 | 48 | """
|
|
0 commit comments