Skip to content

Commit 9bfdc3c

Browse files
committed
BUG: test coverage, groupby bug fixes
1 parent ec221c6 commit 9bfdc3c

13 files changed

+347
-350
lines changed

pandas/core/frame.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def na_op(x, y):
240240
mask = notnull(xrav)
241241
result[mask] = op(np.array(list(xrav[mask])), y)
242242

243-
if op == operator.ne:
243+
if op == operator.ne: # pragma: no cover
244244
np.putmask(result, -mask, True)
245245
else:
246246
np.putmask(result, -mask, False)
@@ -1869,12 +1869,12 @@ def xs(self, key, axis=0, level=None, copy=True):
18691869
if np.isscalar(loc):
18701870
new_values = self._data.fast_2d_xs(loc, copy=copy)
18711871
return Series(new_values, index=self.columns, name=key)
1872-
elif isinstance(loc, slice) or loc.dtype == np.bool_:
1872+
else: # isinstance(loc, slice) or loc.dtype == np.bool_:
18731873
result = self[loc]
18741874
result.index = new_index
18751875
return result
1876-
else:
1877-
return self.take(loc)
1876+
# else:
1877+
# return self.take(loc)
18781878

18791879
def lookup(self, row_labels, col_labels):
18801880
"""

pandas/core/groupby.py

+23-37
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,11 @@ def ohlc(self):
345345

346346
def nth(self, n):
    """
    Pick the n-th non-null value from each group.

    Parameters
    ----------
    n : int
        Position handed to ``iget`` after nulls have been dropped from the
        group. Negative values are passed through unchanged (presumably
        counting from the end of the group -- confirm against ``iget``).

    Returns
    -------
    Whatever ``self.agg`` returns for the per-group picker; a group that
    has no value at position ``n`` contributes NaN.
    """
    def picker(arr):
        arr = arr[com.notnull(arr)]
        size = len(arr)
        # Bounds-check in both directions. A plain ``len(arr) >= n + 1``
        # test is always true for negative n, which lets iget run off the
        # front of a short (or empty) group instead of yielding NaN.
        if -size <= n < size:
            return arr.iget(n)
        return np.nan
    return self.agg(picker)
355354

356355
def _cython_agg_general(self, how):
@@ -656,11 +655,13 @@ def aggregate(self, values, how, axis=0):
656655
arity = self._cython_arity.get(how, 1)
657656

658657
vdim = values.ndim
658+
swapped = False
659659
if vdim == 1:
660660
values = values[:, None]
661661
out_shape = (self.ngroups, arity)
662662
else:
663663
if axis > 0:
664+
swapped = True
664665
values = values.swapaxes(0, axis)
665666
if arity > 1:
666667
raise NotImplementedError
@@ -673,8 +674,11 @@ def aggregate(self, values, how, axis=0):
673674
result = self._aggregate(result, counts, values, how)
674675

675676
if self._filter_empty_groups:
676-
result = lib.row_bool_subset(result,
677-
(counts > 0).view(np.uint8))
677+
if result.ndim == 2:
678+
result = lib.row_bool_subset(result,
679+
(counts > 0).view(np.uint8))
680+
else:
681+
result = result[counts > 0]
678682

679683
if vdim == 1 and arity == 1:
680684
result = result[:, 0]
@@ -685,7 +689,7 @@ def aggregate(self, values, how, axis=0):
685689
else:
686690
names = None
687691

688-
if axis > 0:
692+
if swapped:
689693
result = result.swapaxes(0, axis)
690694

691695
return result, names
@@ -700,7 +704,8 @@ def _aggregate(self, result, counts, values, how):
700704
raise NotImplementedError
701705
elif values.ndim > 2:
702706
for i, chunk in enumerate(values.transpose(2, 0, 1)):
703-
agg_func(result[:, :, i], counts, chunk, comp_ids)
707+
agg_func(result[:, :, i], counts, chunk.squeeze(),
708+
comp_ids)
704709
else:
705710
agg_func(result, counts, values, comp_ids)
706711

@@ -942,10 +947,6 @@ def __init__(self, index, grouper=None, name=None, level=None,
942947
# pre-computed
943948
self._was_factor = False
944949

945-
# did we pass a custom grouper object? Do nothing
946-
if isinstance(grouper, Grouper):
947-
return
948-
949950
if level is not None:
950951
if not isinstance(level, int):
951952
assert(level in index.names)
@@ -1349,7 +1350,13 @@ def _wrap_agged_blocks(self, blocks):
13491350
obj = self._obj_with_exclusions
13501351

13511352
new_axes = list(obj._data.axes)
1352-
new_axes[self.axis] = self.grouper.result_index
1353+
1354+
# more kludge
1355+
if self.axis == 0:
1356+
new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index
1357+
else:
1358+
new_axes[self.axis] = self.grouper.result_index
1359+
13531360
mgr = BlockManager(blocks, new_axes)
13541361

13551362
new_obj = type(obj)(mgr)
@@ -1693,7 +1700,7 @@ def _transform_item_by_item(self, obj, wrapper):
16931700
except Exception:
16941701
pass
16951702

1696-
if len(output) == 0:
1703+
if len(output) == 0: # pragma: no cover
16971704
raise TypeError('Transform function invalid for data types')
16981705

16991706
columns = obj.columns
@@ -1769,12 +1776,6 @@ def _wrap_aggregated_output(self, output, names=None):
17691776

17701777
return result
17711778

1772-
def _post_process_cython_aggregate(self, obj):
1773-
# undoing kludge from below
1774-
if self.axis == 0:
1775-
obj = obj.T
1776-
return obj
1777-
17781779
def _wrap_agged_blocks(self, blocks):
17791780
obj = self._obj_with_exclusions
17801781

@@ -1827,8 +1828,6 @@ def _iterate_slices(self):
18271828
slicer = lambda x: self.obj[x]
18281829
else:
18291830
raise NotImplementedError
1830-
# slice_axis = self.obj.index
1831-
# slicer = lambda x: self.obj.xs(x, axis=self.axis)
18321831

18331832
for val in slice_axis:
18341833
if val in self.exclusions:
@@ -1857,7 +1856,6 @@ def aggregate(self, arg, *args, **kwargs):
18571856
return self._aggregate_generic(arg, *args, **kwargs)
18581857

18591858
def _wrap_generic_output(self, result, obj):
1860-
18611859
new_axes = list(obj.axes)
18621860
new_axes[self.axis] = self.grouper.result_index
18631861

@@ -1882,8 +1880,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs):
18821880
result[item] = itemg.aggregate(func, *args, **kwargs)
18831881
except (ValueError, TypeError):
18841882
raise
1885-
# cannot_agg.append(item)
1886-
# continue
18871883
new_axes = list(obj.axes)
18881884
new_axes[self.axis] = self.grouper.result_index
18891885
return Panel._from_axes(result, new_axes)
@@ -1892,16 +1888,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs):
18921888

18931889
def _wrap_aggregated_output(self, output, names=None):
18941890
raise NotImplementedError
1895-
new_axes = list(self._obj_with_exclusions.axes)
1896-
new_axes[self.axis] = self.grouper.result_index
1897-
1898-
result = Panel(output, index=self.grouper.result_index,
1899-
columns=output_keys)
1900-
1901-
if self.axis > 0:
1902-
result = result.swapaxes(0, self.axis)
1903-
1904-
return result
19051891

19061892

19071893
class NDArrayGroupBy(GroupBy):

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2736,7 +2736,7 @@ def _resolve_offset(freq, kwds):
27362736
offset = freq
27372737
warn = False
27382738

2739-
if warn and _SHOW_WARNINGS:
2739+
if warn and _SHOW_WARNINGS: # pragma: no cover
27402740
import warnings
27412741
warnings.warn("'timeRule' and 'offset' parameters are deprecated,"
27422742
" please use 'freq' instead",

pandas/stats/misc.py

+194-3
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
from pandas.core.api import Series, DataFrame, isnull, notnull
55
from pandas.core.series import remove_na
66

7-
from pandas.tools.tile import (bucket, bucketcat, bucketpanel,
8-
quantileTS)
9-
7+
from pandas.tools.tile import quantileTS
108

119
def zscore(series):
1210
return (series - series.mean()) / np.std(series, ddof = 0)
@@ -96,3 +94,196 @@ def percentileRank(frame, column=None, kind='mean'):
9694
results.setdefault(date, {})[column] = fun(xs, xs[column])
9795
results = DataFrame(results).T
9896
return results
97+
98+
99+
def bucket(series, k, by=None):
    """
    Produce DataFrame representing quantiles of a Series

    Each value of ``series`` is placed in exactly one of ``k`` columns
    (labelled 1..k) according to the quantile of its ``by`` value; every
    other cell in that row is NaN. Rows whose ``by`` value is not finite
    are left entirely NaN.

    Parameters
    ----------
    series : Series
    k : int
        number of quantiles
    by : Series or same-length array, default None
        bucket by value; defaults to ``series`` itself. Objects with a
        ``reindex`` method are aligned to ``series.index``; anything else
        is used positionally.

    Returns
    -------
    DataFrame
    """
    if by is None:
        by = series
    elif hasattr(by, 'reindex'):
        # Only label-aligned inputs can be reindexed; a plain array (which
        # the docstring allows) is already positional.
        by = by.reindex(series.index)

    values = np.asarray(series)

    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    mat = np.empty((len(series), k), dtype=float)
    mat.fill(np.nan)

    for i, locs in enumerate(_split_quantile(by, k)):
        mat[locs, i] = values[locs]

    return DataFrame(mat, index=series.index, columns=np.arange(k) + 1)
127+
128+
def _split_quantile(arr, k):
    """
    Split the positions of the finite entries of ``arr`` into ``k`` chunks.

    Positions are ordered by ascending value before being divided with
    ``np.array_split``, so the first chunk holds the positions of the
    smallest values. Non-finite entries are left out of every chunk.

    Returns
    -------
    list of ndarray
        ``k`` arrays of integer positions into ``arr``
    """
    values = np.asarray(arr)
    finite = np.isfinite(values)

    # positions of the finite entries, then reordered by their values
    positions = np.arange(len(values))[finite]
    ranked = positions.take(values[finite].argsort())

    return np.array_split(ranked, k)
135+
136+
def bucketcat(series, cats):
    """
    Produce DataFrame with one column per category of a Series

    Parameters
    ----------
    series : Series or array-like
        values to split; a non-Series input is wrapped in a Series with a
        0..n-1 integer index
    cats : Series or same-length array
        category label for each element of ``series``; elements whose
        label is null are dropped

    Returns
    -------
    DataFrame
        columns are the distinct non-null labels in sorted order; each
        column holds the values of ``series`` carrying that label
    """
    if not isinstance(series, Series):
        series = Series(series, index=np.arange(len(series)))

    cats = np.asarray(cats)

    unique_labels = np.unique(cats)
    # notnull is imported at module level from pandas.core.api
    unique_labels = unique_labels[notnull(unique_labels)]

    # group by
    data = dict((label, series[cats == label]) for label in unique_labels)

    return DataFrame(data, columns=unique_labels)
165+
166+
def bucketpanel(series, bins=None, by=None, cat=None):
    """
    Bucket data by two Series to create summary panel

    Parameters
    ----------
    series : Series
    bins : tuple (length-2)
        e.g. (2, 2); required when 'by' is given
    by : tuple of Series
        bucket by value
    cat : tuple of Series
        bucket by category; mutually exclusive with 'by'

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        if both or neither of 'by' and 'cat' are given, or if the one
        given does not hold exactly two bucketing inputs
    TypeError
        if 'by' is given without 'bins'
    """
    use_by = by is not None
    use_cat = cat is not None

    # ValueError is a subclass of Exception, so callers that caught the
    # generic Exception raised previously keep working.
    if use_by and use_cat:
        raise ValueError('must specify by or cat, but not both')
    if not (use_by or use_cat):
        raise ValueError('must specify either values or categories to bucket by')

    if use_cat:
        if len(cat) != 2:
            raise ValueError('must provide two bucketing categories')
        xcat, ycat = cat
        return _bucketpanel_cat(series, xcat, ycat)

    if len(by) != 2:
        raise ValueError('must provide two bucketing series')
    if bins is None:
        # Say what is missing instead of failing on the tuple unpack below.
        raise TypeError('must provide bins when bucketing by value')

    xby, yby = by
    xbins, ybins = bins

    return _bucketpanel_by(series, xby, yby, xbins, ybins)
203+
204+
def _bucketpanel_by(series, xby, yby, xbins, ybins):
    """
    Bucket ``series`` by the joint quantiles of two value Series.

    Parameters
    ----------
    series : Series
    xby, yby : Series
        bucketing values; each is aligned to ``series.index``
    xbins, ybins : int
        number of quantile buckets along each key

    Returns
    -------
    DataFrame
        output of ``bucketcat`` with columns renamed to '<x>x<y>' bucket
        labels; a bucket number that is null is rendered as 'NULL'
    """
    # A single alignment per key is enough; reindexing the already-aligned
    # Series a second time returns the same data.
    xlabels = _bucket_labels(xby.reindex(series.index), xbins)
    ylabels = _bucket_labels(yby.reindex(series.index), ybins)

    labels = _uniquify(xlabels, ylabels, xbins, ybins)

    # rows with a missing x or y bucket share the sentinel code -1
    labels[isnull(labels)] = -1

    # np.unique reports the first position of every distinct code in one
    # pass, replacing a list.index scan per distinct code.
    unique_labels, first_pos = np.unique(labels, return_index=True)
    index_map = dict(zip(unique_labels, first_pos))

    bucketed = bucketcat(series, labels)

    def relabel(key):
        pos = index_map[key]

        xlab = xlabels[pos]
        ylab = ylabels[pos]

        return '%sx%s' % (int(xlab) if notnull(xlab) else 'NULL',
                          int(ylab) if notnull(ylab) else 'NULL')

    return bucketed.rename(columns=relabel)
232+
233+
def _bucketpanel_cat(series, xcat, ycat):
    """
    Bucket ``series`` by every observed combination of two categoricals.

    Parameters
    ----------
    series : Series
    xcat, ycat : Series or same-length array
        category labels; assumed to contain no NaN (see ``_intern``)

    Returns
    -------
    DataFrame
        output of ``bucketcat`` with columns named '(<x>, <y>)' after the
        category pair they hold
    """
    xlabels, xmapping = _intern(xcat)
    ylabels, ymapping = _intern(ycat)

    # Mixed-radix encoding: y codes run 0..len(ymapping)-1, so scaling the
    # x code by len(ymapping) gives every (x, y) pair its own integer.
    # The earlier power-of-ten shift built from log10(ylabels.max()) was
    # 0 for a single y category and 1 for two, so distinct pairs collided.
    labels = xlabels * len(ymapping) + ylabels

    # first position at which each combined code occurs, in sorted code
    # order -- the same order bucketcat uses for its columns
    unique_labels, first_pos = np.unique(labels, return_index=True)
    xkeys = xlabels.take(first_pos)
    ykeys = ylabels.take(first_pos)

    stringified = ['(%s, %s)' % arg
                   for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))]

    result = bucketcat(series, labels)
    result.columns = stringified

    return result
259+
260+
def _intern(values):
    """
    Encode ``values`` as integer positions into their sorted unique values.

    Returns
    -------
    (codes, mapping) : tuple of ndarray
        ``mapping`` holds the sorted distinct values and ``codes`` the
        position in ``mapping`` of each input element
    """
    # assumed no NaN values
    arr = np.asarray(values)

    mapping = np.unique(arr)
    codes = np.searchsorted(mapping, arr)
    return codes, mapping
267+
268+
269+
def _uniquify(xlabels, ylabels, xbins, ybins):
    """
    Combine x and y bucket numbers into one code per (x, y) pair.

    Parameters
    ----------
    xlabels, ylabels : ndarray
        bucket numbers as produced by ``_bucket_labels`` (1..bins, NaN for
        missing)
    xbins, ybins : int
        number of buckets along each key; ``xbins`` is accepted for
        signature compatibility but is not needed for the encoding

    Returns
    -------
    ndarray
        ``xlabels * shifter + ylabels``; NaN wherever either input is NaN
    """
    # encode the stuff, create unique label
    # Use the smallest power of ten strictly greater than ybins. The former
    # 10 ** max(xbins, ybins) outgrows float64's exact-integer range once a
    # bin count reaches ~16, at which point adding ylabels is rounded away
    # and different pairs receive the same code.
    shifter = 10 ** len(str(int(ybins)))

    return xlabels * shifter + ylabels
276+
277+
def _bucket_labels(series, k):
    """
    Assign each element of ``series`` a quantile bucket number in 1..k.

    Finite values are ranked in ascending order and divided into ``k``
    chunks with ``np.array_split``; the chunk holding the smallest values
    is bucket 1. Non-finite elements are labelled NaN.

    Returns
    -------
    ndarray of float
        same length as ``series``
    """
    arr = np.asarray(series)
    mask = np.isfinite(arr)
    n = len(series)

    # positions of the finite entries, ordered by ascending value
    ranked = np.arange(n)[mask].take(arr[mask].argsort())

    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
    labels = np.empty(n, dtype=float)
    labels.fill(np.nan)
    for i, locs in enumerate(np.array_split(ranked, k)):
        labels[locs] = i

    # shift from 0-based chunk number to 1-based bucket number
    return labels + 1

0 commit comments

Comments
 (0)