Commit 108e181

ENH: api change, more consistent GroupBy.apply behavior with Series, close #938
1 parent cab9cab commit 108e181

4 files changed: 68 additions, 28 deletions

RELEASE.rst (14 additions & 0 deletions)

@@ -22,6 +22,20 @@ Where to get it
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
 
+pandas 0.7.3
+============
+
+**Release date:** NOT YET RELEASED
+
+**New features / modules**
+
+**API Changes**
+
+  - Calling apply on grouped Series, e.g. describe(), will no longer yield
+    DataFrame by default. Will have to call unstack() to get prior behavior
+
+**Bug fixes**
+
 pandas 0.7.2
 ============
 
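
To illustrate the API change noted in the release entry above, here is a minimal, hypothetical sketch (the Series, its group keys, and the variable names are illustrative only and not part of this commit; exact output formatting varies across pandas versions):

    import numpy as np
    import pandas as pd

    # Illustrative data: 'a'/'b'/'c'/'d' are made-up group keys.
    s = pd.Series(np.random.randn(8), index=list('aabbccdd'))
    grouped = s.groupby(level=0)

    # With this change, applying describe() to each group yields a Series
    # whose index pairs each group key with a statistic name ...
    stats = grouped.apply(lambda x: x.describe())

    # ... and unstack() recovers the one-row-per-group DataFrame that the
    # prior behavior produced.
    frame = stats.unstack()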

pandas/core/generic.py (5 additions & 2 deletions)

@@ -90,7 +90,8 @@ def get(self, key, default=None):
         except KeyError:
             return default
 
-    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True):
+    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
+                group_keys=True):
         """
         Group series using mapper (dict or key function, apply given function
         to group, return result as series) or by a series of columns
@@ -112,6 +113,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True):
             effectively "SQL-style" grouped output
         sort : boolean, default True
             Sort group keys. Get better performance by turning this off
+        group_keys : boolean, default True
+            When calling apply, add group keys to index to identify pieces
 
         Examples
         --------
@@ -130,7 +133,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True):
         """
         from pandas.core.groupby import groupby
         return groupby(self, by, axis=axis, level=level, as_index=as_index,
-                       sort=sort)
+                       sort=sort, group_keys=group_keys)
 
     def select(self, crit, axis=0):
         """

pandas/core/groupby.py (20 additions & 22 deletions)

@@ -87,7 +87,7 @@ class GroupBy(object):
 
     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, column=None, as_index=True,
-                 sort=True):
+                 sort=True, group_keys=True):
         self._column = column
 
         if isinstance(obj, NDFrame):
@@ -108,6 +108,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
         self.as_index = as_index
         self.keys = keys
         self.sort = sort
+        self.group_keys = group_keys
 
         if grouper is None:
             grouper, exclusions = _get_grouper(obj, keys, axis=axis,
@@ -370,23 +371,28 @@ def _python_apply_general(self, func, *args, **kwargs):
     def _wrap_applied_output(self, *args, **kwargs):
         raise NotImplementedError
 
-    def _wrap_frames(self, keys, values, not_indexed_same=False):
+    def _concat_objects(self, keys, values, not_indexed_same=False):
         from pandas.tools.merge import concat
 
-        if not_indexed_same:
+        if not not_indexed_same:
+            result = concat(values, axis=self.axis)
+            ax = self.obj._get_axis(self.axis)
+
+            if isinstance(result, Series):
+                result = result.reindex(ax)
+            else:
+                result = result.reindex_axis(ax, axis=self.axis)
+        elif self.group_keys:
             group_keys = keys
             group_levels = self.grouper.levels
             group_names = self.grouper.names
             result = concat(values, axis=self.axis, keys=group_keys,
                             levels=group_levels, names=group_names)
         else:
             result = concat(values, axis=self.axis)
-            ax = self.obj._get_axis(self.axis)
-            result = result.reindex_axis(ax, axis=self.axis)
 
         return result
 
-
 def _generate_groups(obj, group_index, ngroups, axis=0):
     if isinstance(obj, NDFrame) and not isinstance(obj, DataFrame):
         factory = obj._constructor
@@ -428,10 +434,11 @@ class Grouper(object):
     """
 
     """
-    def __init__(self, axis, groupings, sort=True):
+    def __init__(self, axis, groupings, sort=True, group_keys=True):
         self.axis = axis
         self.groupings = groupings
         self.sort = sort
+        self.group_keys = group_keys
 
     @property
     def shape(self):
@@ -964,21 +971,12 @@ def _get_index():
             return index
 
         if isinstance(values[0], Series):
-            if not_indexed_same:
-                data_dict = dict(zip(keys, values))
-                result = DataFrame(data_dict).T
-                result.index = _get_index()
-                return result
-            else:
-                cat_values = np.concatenate([x.values for x in values])
-                cat_index = values[0].index
-                if len(values) > 1:
-                    cat_index = cat_index.append([x.index for x in values[1:]])
-                return Series(cat_values, index=cat_index)
+            return self._concat_objects(keys, values,
+                                        not_indexed_same=not_indexed_same)
         elif isinstance(values[0], DataFrame):
             # possible that Series -> DataFrame by applied function
-            return self._wrap_frames(keys, values,
-                                     not_indexed_same=not_indexed_same)
+            return self._concat_objects(keys, values,
+                                        not_indexed_same=not_indexed_same)
         else:
             return Series(values, index=_get_index())
 
@@ -1318,8 +1316,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         key_names = self.grouper.names
 
         if isinstance(values[0], DataFrame):
-            return self._wrap_frames(keys, values,
-                                     not_indexed_same=not_indexed_same)
+            return self._concat_objects(keys, values,
+                                        not_indexed_same=not_indexed_same)
         else:
             if len(self.grouper.groupings) > 1:
                 key_index = MultiIndex.from_tuples(keys, names=key_names)
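
The renamed _concat_objects helper above distinguishes three cases when gluing per-group results back together. The following stand-alone simplification of that logic is for illustration only (the function name and arguments are invented here, not part of the commit):

    import pandas as pd

    def combine_pieces(values, keys, original_index, not_indexed_same, group_keys):
        # `values` are the per-group apply() results, `keys` their group labels.
        if not not_indexed_same:
            # Pieces still share the original index: concatenate them and
            # restore the caller's row order.
            return pd.concat(values).reindex(original_index)
        elif group_keys:
            # Pieces carry new indexes: prefix each with its group key,
            # yielding a hierarchical index.
            return pd.concat(values, keys=keys)
        else:
            # group_keys=False: simply stack the pieces in group order.
            return pd.concat(values)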

pandas/tests/test_groupby.py (29 additions & 4 deletions)

@@ -383,7 +383,7 @@ def test_attr_wrapper(self):
         for name, gp in grouped:
             expected[name] = gp.describe()
         expected = DataFrame(expected).T
-        assert_frame_equal(result, expected)
+        assert_frame_equal(result.unstack(), expected)
 
         # get attribute
         result = grouped.dtype
@@ -395,7 +395,7 @@ def test_attr_wrapper(self):
     def test_series_describe_multikey(self):
         ts = tm.makeTimeSeries()
         grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
-        result = grouped.describe()
+        result = grouped.describe().unstack()
         assert_series_equal(result['mean'], grouped.mean())
         assert_series_equal(result['std'], grouped.std())
         assert_series_equal(result['min'], grouped.min())
@@ -405,7 +405,7 @@ def test_series_describe_single(self):
         grouped = ts.groupby(lambda x: x.month)
         result = grouped.apply(lambda x: x.describe())
         expected = grouped.describe()
-        assert_frame_equal(result, expected)
+        assert_series_equal(result, expected)
 
     def test_series_agg_multikey(self):
         ts = tm.makeTimeSeries()
@@ -449,7 +449,7 @@ def test_frame_describe_multikey(self):
 
         for col in self.tsframe:
             expected = grouped[col].describe()
-            assert_frame_equal(result[col].unstack(), expected)
+            assert_series_equal(result[col], expected)
 
         groupedT = self.tsframe.groupby({'A' : 0, 'B' : 0,
                                          'C' : 1, 'D' : 1}, axis=1)
@@ -1581,6 +1581,31 @@ def test_dont_clobber_name_column(self):
         result = df.groupby('key').apply(lambda x: x)
         assert_frame_equal(result, df)
 
+    def test_skip_group_keys(self):
+        from pandas import concat
+
+        tsf = tm.makeTimeDataFrame()
+
+        grouped = tsf.groupby(lambda x: x.month, group_keys=False)
+        result = grouped.apply(lambda x: x.sort_index(by='A')[:3])
+
+        pieces = []
+        for key, group in grouped:
+            pieces.append(group.sort_index(by='A')[:3])
+
+        expected = concat(pieces)
+        assert_frame_equal(result, expected)
+
+        grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False)
+        result = grouped.apply(lambda x: x.order()[:3])
+
+        pieces = []
+        for key, group in grouped:
+            pieces.append(group.order()[:3])
+
+        expected = concat(pieces)
+        assert_series_equal(result, expected)
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = map(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)
