Skip to content

Commit 71e9046

Browse files
committed
BUG: handle non-numeric aggregates in pure python Series aggregation, GH #612
1 parent fbb1102 commit 71e9046

File tree

3 files changed: +46 -7 lines changed

pandas/core/groupby.py

+14 -3
Original file line number | Diff line number | Diff line change
@@ -414,15 +414,26 @@ def _aggregate_series(self, obj, func, group_index, ngroups):
414414

415415
def _aggregate_series_pure_python(self, obj, func, ngroups):
416416
counts = np.zeros(ngroups, dtype=int)
417-
result = np.empty(ngroups, dtype=float)
418-
result.fill(np.nan)
417+
result = None
419418

420419
for label, group in self._generator_factory(obj):
421420
if group is None:
422421
continue
422+
res = func(group)
423+
if result is None:
424+
try:
425+
assert(not isinstance(res, np.ndarray))
426+
assert(not (isinstance(res, list) and
427+
len(res) == len(self.dummy)))
428+
429+
result = np.empty(ngroups, dtype='O')
430+
except Exception:
431+
raise ValueError('function does not reduce')
432+
423433
counts[label] = group.shape[0]
424-
result[label] = func(group)
434+
result[label] = res
425435

436+
result = lib.maybe_convert_objects(result)
426437
return result, counts
427438

428439
def _python_apply_general(self, func, *args, **kwargs):

pandas/tests/test_groupby.py

+22
Original file line number | Diff line number | Diff line change
@@ -343,6 +343,28 @@ def test_series_agg_multikey(self):
343343
expected = grouped.sum()
344344
assert_series_equal(result, expected)
345345

346+
def test_series_agg_multi_pure_python(self):
347+
data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
348+
'bar', 'bar', 'bar', 'bar',
349+
'foo', 'foo', 'foo'],
350+
'B' : ['one', 'one', 'one', 'two',
351+
'one', 'one', 'one', 'two',
352+
'two', 'two', 'one'],
353+
'C' : ['dull', 'dull', 'shiny', 'dull',
354+
'dull', 'shiny', 'shiny', 'dull',
355+
'shiny', 'shiny', 'shiny'],
356+
'D' : np.random.randn(11),
357+
'E' : np.random.randn(11),
358+
'F' : np.random.randn(11)})
359+
360+
def bad(x):
361+
assert(len(x.base) == len(x))
362+
return 'foo'
363+
364+
result = data.groupby(['A', 'B']).agg(bad)
365+
expected = data.groupby(['A', 'B']).agg(lambda x: 'foo')
366+
assert_frame_equal(result, expected)
367+
346368
def test_series_index_name(self):
347369
grouped = self.df.ix[:, ['C']].groupby(self.df['A'])
348370
result = grouped.agg(lambda x: x.mean())

vb_suite/stat_ops.py

+10 -4
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,16 @@
44
common_setup = """from pandas_vb_common import *
55
"""
66

7+
#----------------------------------------------------------------------
8+
# nanops
9+
10+
setup = common_setup + """
11+
s = Series(np.random.randn(100000))
12+
s[::2] = np.nan
13+
"""
14+
15+
stat_ops_series_std = Benchmark("s.std()", setup)
16+
717
#----------------------------------------------------------------------
818
# ops by level
919

@@ -19,21 +29,17 @@
1929

2030
stat_ops_level_frame_sum = \
2131
Benchmark("df.sum(level=1)", setup,
22-
name='stat_ops_level_frame_sum',
2332
start_date=datetime(2011, 11, 15))
2433

2534
stat_ops_level_frame_sum_multiple = \
2635
Benchmark("df.sum(level=[0, 1])", setup, repeat=1,
27-
name='stat_ops_level_frame_sum_multiple',
2836
start_date=datetime(2011, 11, 15))
2937

3038
stat_ops_level_series_sum = \
3139
Benchmark("df[1].sum(level=1)", setup,
32-
name='stat_ops_level_series_sum',
3340
start_date=datetime(2011, 11, 15))
3441

3542
stat_ops_level_series_sum_multiple = \
3643
Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1,
37-
name='stat_ops_level_series_sum_multiple',
3844
start_date=datetime(2011, 11, 15))
3945

0 commit comments

Comments
 (0)