Skip to content

ENH: Keep series name when merging GroupBy result #6068

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})
df
df.groupby(df.sum(), axis=1).sum()


Group DataFrame columns, compute a set of metrics and return a named Series.
The Series name is used as the name for the column index. This is especially
useful in conjunction with reshaping operations such as stacking in which the
column index name will be used as the name of the inserted column:

.. ipython:: python

df = pd.DataFrame({
'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
})

def compute_metrics(x):
    """Summarise one group as a Series named ``'metrics'``.

    The returned Series holds the sum of column ``b`` under ``'b_sum'``
    and the mean of column ``c`` under ``'c_mean'``.
    """
    b_total = x['b'].sum()
    c_average = x['c'].mean()
    return pd.Series({'b_sum': b_total, 'c_mean': c_average}, name='metrics')

result = df.groupby('a').apply(compute_metrics)

result

result.stack()
10 changes: 10 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,16 @@ API Changes
- ``df.iloc[:-len(df)]`` is now empty
- ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse

- Better propagation/preservation of Series names when performing groupby
operations:
- ``SeriesGroupBy.agg`` will ensure that the name attribute of the original
series is propagated to the result (:issue:`6265`).
- If the function provided to ``GroupBy.apply`` returns a named series, the
name of the series will be kept as the name of the column index of the
DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates
``DataFrame.stack`` operations where the name of the column index is used as
the name of the inserted column containing the pivoted data.

Experimental Features
~~~~~~~~~~~~~~~~~~~~~

Expand Down
20 changes: 17 additions & 3 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1783,7 +1783,8 @@ def _wrap_aggregated_output(self, output, names=None):

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return Series([])
# GH #6265
return Series([], name=self.name)

def _get_index():
if self.grouper.nkeys > 1:
Expand All @@ -1805,7 +1806,8 @@ def _get_index():
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)
else:
return Series(values, index=_get_index())
# GH #6265
return Series(values, index=_get_index(), name=self.name)

def _aggregate_named(self, func, *args, **kwargs):
result = {}
Expand Down Expand Up @@ -2262,17 +2264,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

try:
if self.axis == 0:
# GH6124 if the list of Series have a consistent name,
# then propagate that name to the result.
index = v.index.copy()
if index.name is None:
# Only propagate the series name to the result
# if all series have a consistent name. If the
# series do not have a consistent name, do
# nothing.
names = set(v.name for v in values)
if len(names) == 1:
index.name = list(names)[0]

# normally use vstack as it's faster than concat
# and if we have mi-columns
if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
stacked_values = np.vstack([np.asarray(x) for x in values])
result = DataFrame(stacked_values,index=key_index,columns=v.index)
result = DataFrame(stacked_values,index=key_index,columns=index)
else:
# GH5788 instead of stacking; concat gets the dtypes correct
from pandas.tools.merge import concat
result = concat(values,keys=key_index,names=key_index.names,
axis=self.axis).unstack()
result.columns = index
else:
stacked_values = np.vstack([np.asarray(x) for x in values])
result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2057,6 +2057,41 @@ def test_groupby_series_with_name(self):
self.assertIn('A', result2)
self.assertIn('B', result2)

def test_seriesgroupby_name_attr(self):
    """Check that SeriesGroupBy results keep the selected column's name.

    Groups ``self.df`` by ``'A'``, selects column ``'C'`` and asserts that
    the ``name`` of the results of ``count``, ``mean`` and ``agg`` with a
    custom callable is ``'C'``.
    """
    # GH 6265
    grouped = self.df.groupby('A')['C']

    # assertEqual is used instead of the deprecated assertEquals alias.
    self.assertEqual(grouped.count().name, 'C')
    self.assertEqual(grouped.mean().name, 'C')

    # A user-supplied aggregation must also propagate the name.
    self.assertEqual(grouped.agg(lambda x: np.sum(x) * 2).name, 'C')

def test_groupby_name_propagation(self):
    """Check when ``GroupBy.apply`` copies a Series name to the columns.

    Each helper returns a Series with the same three entries; only the
    ``name`` passed to the Series differs between the cases.
    """
    # GH 6124
    payload = {
        'count': 1,
        'mean': 2,
        'omissions': 3,
    }

    def summarize(df, name=None):
        return Series(payload, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series. In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series(payload, name=df.iloc[0]['A'])

    # (positional arguments for ``apply``, expected ``columns.name``)
    cases = [
        ((summarize,), None),
        ((summarize, 'metrics'), 'metrics'),
        ((summarize_random_name,), None),
    ]
    for apply_args, expected_name in cases:
        metrics = self.df.groupby('A').apply(*apply_args)
        self.assertEqual(metrics.columns.name, expected_name)

def test_groupby_nonstring_columns(self):
df = DataFrame([np.arange(10) for x in range(10)])
grouped = df.groupby(0)
Expand Down