Skip to content

Commit 1fca5be

Browse files
committed
Merge pull request #6068 from bburan-galenea/bburan/dataframe_groupby_apply_series_name
ENH: Keep series name when merging GroupBy result
2 parents 549a390 + df63b3a commit 1fca5be

File tree

4 files changed

+87
-3
lines changed

4 files changed

+87
-3
lines changed

doc/source/groupby.rst

+25
Original file line numberDiff line numberDiff line change
@@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
734734
df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})
735735
df
736736
df.groupby(df.sum(), axis=1).sum()
737+
738+
739+
Group a DataFrame by a column, compute a set of metrics for each group and return a named Series.
740+
The Series name is used as the name for the column index. This is especially
741+
useful in conjunction with reshaping operations such as stacking in which the
742+
column index name will be used as the name of the inserted column:
743+
744+
.. ipython:: python
745+
746+
df = pd.DataFrame({
747+
'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
748+
'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
749+
'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
750+
'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
751+
})
752+
753+
def compute_metrics(x):
754+
result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
755+
return pd.Series(result, name='metrics')
756+
757+
result = df.groupby('a').apply(compute_metrics)
758+
759+
result
760+
761+
result.stack()

doc/source/release.rst

+10
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,16 @@ API Changes
112112
- ``df.iloc[:-len(df)]`` is now empty
113113
- ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse
114114

115+
- Better propagation/preservation of Series names when performing groupby
116+
operations:
117+
- ``SeriesGroupBy.agg`` will ensure that the name attribute of the original
118+
series is propagated to the result (:issue:`6265`).
119+
- If the function provided to ``GroupBy.apply`` returns a named series, the
120+
name of the series will be kept as the name of the column index of the
121+
DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates
122+
``DataFrame.stack`` operations where the name of the column index is used as
123+
the name of the inserted column containing the pivoted data.
124+
115125
Experimental Features
116126
~~~~~~~~~~~~~~~~~~~~~
117127

pandas/core/groupby.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,8 @@ def _wrap_aggregated_output(self, output, names=None):
17831783

17841784
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
17851785
if len(keys) == 0:
1786-
return Series([])
1786+
# GH #6265
1787+
return Series([], name=self.name)
17871788

17881789
def _get_index():
17891790
if self.grouper.nkeys > 1:
@@ -1805,7 +1806,8 @@ def _get_index():
18051806
return self._concat_objects(keys, values,
18061807
not_indexed_same=not_indexed_same)
18071808
else:
1808-
return Series(values, index=_get_index())
1809+
# GH #6265
1810+
return Series(values, index=_get_index(), name=self.name)
18091811

18101812
def _aggregate_named(self, func, *args, **kwargs):
18111813
result = {}
@@ -2262,17 +2264,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
22622264

22632265
try:
22642266
if self.axis == 0:
2267+
# GH6124 if all of the Series in the list share a consistent name,
2268+
# then propagate that name to the result.
2269+
index = v.index.copy()
2270+
if index.name is None:
2271+
# Only propagate the series name to the result
2272+
# if all series have a consistent name. If the
2273+
# series do not have a consistent name, do
2274+
# nothing.
2275+
names = set(v.name for v in values)
2276+
if len(names) == 1:
2277+
index.name = list(names)[0]
22652278

22662279
# normally use vstack as it's faster than concat
22672280
# and if we have mi-columns
22682281
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
22692282
stacked_values = np.vstack([np.asarray(x) for x in values])
2270-
result = DataFrame(stacked_values,index=key_index,columns=v.index)
2283+
result = DataFrame(stacked_values,index=key_index,columns=index)
22712284
else:
22722285
# GH5788 instead of stacking; concat gets the dtypes correct
22732286
from pandas.tools.merge import concat
22742287
result = concat(values,keys=key_index,names=key_index.names,
22752288
axis=self.axis).unstack()
2289+
result.columns = index
22762290
else:
22772291
stacked_values = np.vstack([np.asarray(x) for x in values])
22782292
result = DataFrame(stacked_values.T,index=v.index,columns=key_index)

pandas/tests/test_groupby.py

+35
Original file line numberDiff line numberDiff line change
@@ -2057,6 +2057,41 @@ def test_groupby_series_with_name(self):
20572057
self.assertIn('A', result2)
20582058
self.assertIn('B', result2)
20592059

2060+
def test_seriesgroupby_name_attr(self):
2061+
# GH 6265
2062+
result = self.df.groupby('A')['C']
2063+
self.assertEquals(result.count().name, 'C')
2064+
self.assertEquals(result.mean().name, 'C')
2065+
2066+
testFunc = lambda x: np.sum(x)*2
2067+
self.assertEquals(result.agg(testFunc).name, 'C')
2068+
2069+
def test_groupby_name_propagation(self):
2070+
# GH 6124
2071+
def summarize(df, name=None):
2072+
return Series({
2073+
'count': 1,
2074+
'mean': 2,
2075+
'omissions': 3,
2076+
}, name=name)
2077+
2078+
def summarize_random_name(df):
2079+
# Provide a different name for each Series. In this case, groupby
2080+
# should not attempt to propagate the Series name since they are
2081+
# inconsistent.
2082+
return Series({
2083+
'count': 1,
2084+
'mean': 2,
2085+
'omissions': 3,
2086+
}, name=df.iloc[0]['A'])
2087+
2088+
metrics = self.df.groupby('A').apply(summarize)
2089+
self.assertEqual(metrics.columns.name, None)
2090+
metrics = self.df.groupby('A').apply(summarize, 'metrics')
2091+
self.assertEqual(metrics.columns.name, 'metrics')
2092+
metrics = self.df.groupby('A').apply(summarize_random_name)
2093+
self.assertEqual(metrics.columns.name, None)
2094+
20602095
def test_groupby_nonstring_columns(self):
20612096
df = DataFrame([np.arange(10) for x in range(10)])
20622097
grouped = df.groupby(0)

0 commit comments

Comments
 (0)