Skip to content

Commit 57063f9

Browse files
bburan-galenea authored and gouthambs committed
ENH: Keep series name in GroupBy agg/apply ops
When possible, attempt to preserve the series name when performing groupby operations. This facilitates reshaping/indexing operations on the result of the groupby/apply or groupby/agg operation. Fixes GH6265 and GH6124. Added example to groupby.rst and description to API changes for v0.14.
1 parent d2c9adc commit 57063f9

File tree

4 files changed

+87
-3
lines changed

4 files changed

+87
-3
lines changed

doc/source/groupby.rst

+25
Original file line number | Diff line number | Diff line change
@@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
734734
df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})
735735
df
736736
df.groupby(df.sum(), axis=1).sum()
737+
738+
739+
Group a DataFrame by a column, compute a set of metrics for each group, and return a named Series.
740+
The Series name is used as the name for the column index. This is especially
741+
useful in conjunction with reshaping operations such as stacking in which the
742+
column index name will be used as the name of the inserted column:
743+
744+
.. ipython:: python
745+
746+
df = pd.DataFrame({
747+
'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
748+
'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
749+
'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
750+
'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
751+
})
752+
753+
def compute_metrics(x):
754+
result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
755+
return pd.Series(result, name='metrics')
756+
757+
result = df.groupby('a').apply(compute_metrics)
758+
759+
result
760+
761+
result.stack()

doc/source/release.rst

+10
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,16 @@ API Changes
112112
- ``df.iloc[:-len(df)]`` is now empty
113113
- ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse
114114

115+
- Better propagation/preservation of Series names when performing groupby
116+
operations:
117+
- ``SeriesGroupBy.agg`` will ensure that the name attribute of the original
118+
series is propagated to the result (:issue:`6265`).
119+
- If the function provided to ``GroupBy.apply`` returns a named series, the
120+
name of the series will be kept as the name of the column index of the
121+
DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates
122+
``DataFrame.stack`` operations where the name of the column index is used as
123+
the name of the inserted column containing the pivoted data.
124+
115125
Experimental Features
116126
~~~~~~~~~~~~~~~~~~~~~
117127

pandas/core/groupby.py

+17-3
Original file line number | Diff line number | Diff line change
@@ -1786,7 +1786,8 @@ def _wrap_aggregated_output(self, output, names=None):
17861786

17871787
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
17881788
if len(keys) == 0:
1789-
return Series([])
1789+
# GH #6265
1790+
return Series([], name=self.name)
17901791

17911792
def _get_index():
17921793
if self.grouper.nkeys > 1:
@@ -1808,7 +1809,8 @@ def _get_index():
18081809
return self._concat_objects(keys, values,
18091810
not_indexed_same=not_indexed_same)
18101811
else:
1811-
return Series(values, index=_get_index())
1812+
# GH #6265
1813+
return Series(values, index=_get_index(), name=self.name)
18121814

18131815
def _aggregate_named(self, func, *args, **kwargs):
18141816
result = {}
@@ -2265,17 +2267,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
22652267

22662268
try:
22672269
if self.axis == 0:
2270+
# GH6124 if the list of Series have a consistent name,
2271+
# then propagate that name to the result.
2272+
index = v.index.copy()
2273+
if index.name is None:
2274+
# Only propagate the series name to the result
2275+
# if all series have a consistent name. If the
2276+
# series do not have a consistent name, do
2277+
# nothing.
2278+
names = set(v.name for v in values)
2279+
if len(names) == 1:
2280+
index.name = list(names)[0]
22682281

22692282
# normally use vstack as its faster than concat
22702283
# and if we have mi-columns
22712284
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
22722285
stacked_values = np.vstack([np.asarray(x) for x in values])
2273-
result = DataFrame(stacked_values,index=key_index,columns=v.index)
2286+
result = DataFrame(stacked_values,index=key_index,columns=index)
22742287
else:
22752288
# GH5788 instead of stacking; concat gets the dtypes correct
22762289
from pandas.tools.merge import concat
22772290
result = concat(values,keys=key_index,names=key_index.names,
22782291
axis=self.axis).unstack()
2292+
result.columns = index
22792293
else:
22802294
stacked_values = np.vstack([np.asarray(x) for x in values])
22812295
result = DataFrame(stacked_values.T,index=v.index,columns=key_index)

pandas/tests/test_groupby.py

+35
Original file line number | Diff line number | Diff line change
@@ -2064,6 +2064,41 @@ def test_groupby_series_with_name(self):
20642064
self.assertIn('A', result2)
20652065
self.assertIn('B', result2)
20662066

2067+
def test_seriesgroupby_name_attr(self):
2068+
# GH 6265
2069+
result = self.df.groupby('A')['C']
2070+
self.assertEquals(result.count().name, 'C')
2071+
self.assertEquals(result.mean().name, 'C')
2072+
2073+
testFunc = lambda x: np.sum(x)*2
2074+
self.assertEquals(result.agg(testFunc).name, 'C')
2075+
2076+
def test_groupby_name_propagation(self):
2077+
# GH 6124
2078+
def summarize(df, name=None):
2079+
return Series({
2080+
'count': 1,
2081+
'mean': 2,
2082+
'omissions': 3,
2083+
}, name=name)
2084+
2085+
def summarize_random_name(df):
2086+
# Provide a different name for each Series. In this case, groupby
2087+
# should not attempt to propagate the Series name since they are
2088+
# inconsistent.
2089+
return Series({
2090+
'count': 1,
2091+
'mean': 2,
2092+
'omissions': 3,
2093+
}, name=df.iloc[0]['A'])
2094+
2095+
metrics = self.df.groupby('A').apply(summarize)
2096+
self.assertEqual(metrics.columns.name, None)
2097+
metrics = self.df.groupby('A').apply(summarize, 'metrics')
2098+
self.assertEqual(metrics.columns.name, 'metrics')
2099+
metrics = self.df.groupby('A').apply(summarize_random_name)
2100+
self.assertEqual(metrics.columns.name, None)
2101+
20672102
def test_groupby_nonstring_columns(self):
20682103
df = DataFrame([np.arange(10) for x in range(10)])
20692104
grouped = df.groupby(0)

0 commit comments

Comments
 (0)