Merge pull request #6068 from bburan-galenea/bburan/dataframe_groupby_apply_series_name

jreback · jreback · commit 1fca5be5e492 · 2014-03-05T16:58:53.000-05:00
ENH: Keep series name when merging GroupBy result
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
    df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})
    df
    df.groupby(df.sum(), axis=1).sum()
+
+
+Group rows by a column, compute a set of metrics per group and return a named
+Series.  The Series name is used as the name of the column index.  This is
+especially useful with reshaping operations such as stacking, in which the
+column index name will be used as the name of the inserted column:
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+        'a':  [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
+        'b':  [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+        'c':  [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
+        'd':  [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
+        }) 
+
+   def compute_metrics(x):
+       result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
+       return pd.Series(result, name='metrics')
+
+   result = df.groupby('a').apply(compute_metrics)
+
+   result
+
+   result.stack()
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -112,6 +112,17 @@ API Changes
   - ``df.iloc[:-len(df)]`` is now empty
   - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse
 
+- Better propagation/preservation of Series names when performing groupby
+  operations:
+
+  - ``SeriesGroupBy.agg`` will ensure that the name attribute of the original
+    series is propagated to the result (:issue:`6265`). 
+  - If the function provided to ``GroupBy.apply`` returns a named series, the
+    name of the series will be kept as the name of the column index of the
+    DataFrame returned by ``GroupBy.apply`` (:issue:`6124`).  This facilitates
+    ``DataFrame.stack`` operations where the name of the column index is used as
+    the name of the inserted column containing the pivoted data.
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1783,7 +1783,8 @@ def _wrap_aggregated_output(self, output, names=None):
 
     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
-            return Series([])
+            # GH #6265
+            return Series([], name=self.name)
 
         def _get_index():
             if self.grouper.nkeys > 1:
@@ -1805,7 +1806,8 @@ def _get_index():
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
         else:
-            return Series(values, index=_get_index())
+            # GH #6265
+            return Series(values, index=_get_index(), name=self.name)
 
     def _aggregate_named(self, func, *args, **kwargs):
         result = {}
@@ -2262,17 +2264,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
                 try:
                     if self.axis == 0:
+                        # GH6124 if every Series in the list has the same name,
+                        # then propagate that name to the result.
+                        index = v.index.copy()
+                        if index.name is None:
+                            # Only propagate the series name to the result
+                            # if all series have a consistent name.  If the
+                            # series do not have a consistent name, do
+                            # nothing.
+                            names = set(v.name for v in values)
+                            if len(names) == 1:
+                                index.name = list(names)[0]
 
                         # normally use vstack as its faster than concat
                         # and if we have mi-columns
                         if not _np_version_under1p7 or isinstance(v.index,MultiIndex):
                             stacked_values = np.vstack([np.asarray(x) for x in values])
-                            result = DataFrame(stacked_values,index=key_index,columns=v.index)
+                            result = DataFrame(stacked_values,index=key_index,columns=index)
                         else:
                             # GH5788 instead of stacking; concat gets the dtypes correct
                             from pandas.tools.merge import concat
                             result = concat(values,keys=key_index,names=key_index.names,
                                             axis=self.axis).unstack()
+                            result.columns = index
                     else:
                         stacked_values = np.vstack([np.asarray(x) for x in values])
                         result = DataFrame(stacked_values.T,index=v.index,columns=key_index)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2057,6 +2057,41 @@ def test_groupby_series_with_name(self):
         self.assertIn('A', result2)
         self.assertIn('B', result2)
 
+    def test_seriesgroupby_name_attr(self):
+        # GH 6265
+        result = self.df.groupby('A')['C']
+        self.assertEquals(result.count().name, 'C')
+        self.assertEquals(result.mean().name, 'C')
+
+        testFunc = lambda x: np.sum(x)*2
+        self.assertEquals(result.agg(testFunc).name, 'C')
+
+    def test_groupby_name_propagation(self):
+        # GH 6124
+        def summarize(df, name=None):
+            return Series({
+                'count': 1,
+                'mean': 2,
+                'omissions': 3,
+            }, name=name)
+
+        def summarize_random_name(df):
+            # Provide a different name for each Series.  In this case, groupby
+            # should not attempt to propagate the Series name since they are
+            # inconsistent.
+            return Series({
+                'count': 1,
+                'mean': 2,
+                'omissions': 3,
+            }, name=df.iloc[0]['A'])
+
+        metrics = self.df.groupby('A').apply(summarize)
+        self.assertEqual(metrics.columns.name, None)
+        metrics = self.df.groupby('A').apply(summarize, 'metrics')
+        self.assertEqual(metrics.columns.name, 'metrics')
+        metrics = self.df.groupby('A').apply(summarize_random_name)
+        self.assertEqual(metrics.columns.name, None)
+
     def test_groupby_nonstring_columns(self):
         df = DataFrame([np.arange(10) for x in range(10)])
         grouped = df.groupby(0)