ENH: can pass a list of functions to DataFrame.groupby per #166

wesm · wesm · commit 3440a9cc6b77 · 2012-01-11T21:28:31.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -128,6 +128,8 @@ pandas 0.7.0
     the number of displayed digits (GH #395)
   - Use bottleneck if available for performing NaN-friendly statistical
     operations that it implemented (GH #91)
+  - Can pass a list of functions to aggregate with groupby on a DataFrame,
+    yielding an aggregated result with hierarchical columns (GH #166)
 
 **Bug fixes**
 
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -310,8 +310,15 @@ function's name (stored in the function object) will be used.
    grouped['D'].agg({'result1' : np.sum,
                      'result2' : np.mean})
 
-We would like to enable this functionality for DataFrame, too. The result will
-likely have a MultiIndex for the columns.
+On a grouped DataFrame, you can pass a list of functions to apply to each
+column, which produces an aggregated result with hierarchical columns:
+
+.. ipython:: python
+
+   grouped.agg([np.sum, np.mean, np.std])
+
+Passing a dict of functions has different behavior by default; see the next
+section.
 
 Applying different functions to DataFrame columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -971,6 +971,8 @@ def aggregate(self, arg, *args, **kwargs):
                 result[col] = colg.agg(func)
 
             result = DataFrame(result)
+        elif isinstance(arg, list):
+            return self._aggregate_multiple_funcs(arg)
         else:
             if len(self.groupings) > 1:
                 return self._python_agg_general(arg, *args, **kwargs)
@@ -992,6 +994,29 @@ def aggregate(self, arg, *args, **kwargs):
 
         return result
 
+    def _aggregate_multiple_funcs(self, arg):  # arg: the list passed to agg
+        from pandas.tools.merge import concat  # imported at call time
+
+        if self.axis != 0:
+            raise NotImplementedError  # only axis 0 is handled here
+
+        obj = self._obj_with_exclusions
+
+        results = []  # one aggregated result per column kept
+        keys = []  # column labels, parallel to results
+        for col in obj:
+            try:
+                colg = SeriesGroupBy(obj[col], column=col,
+                                     groupings=self.groupings)
+                results.append(colg.agg(arg))  # aggregate this column by arg
+                keys.append(col)
+            except TypeError:
+                pass  # skip columns whose aggregation raises TypeError
+
+        result = concat(results, keys=keys, axis=1)  # results may be empty
+
+        return result
+
     def _aggregate_generic(self, func, *args, **kwargs):
         assert(len(self.groupings) == 1)
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -1896,7 +1896,7 @@ def _get_combined_index(indexes, intersect=False):
             index = index.intersection(other)
         return index
     union =  _union_indexes(indexes)
-    return Index(union)
+    return _ensure_index(union)
 
 def _get_distinct_indexes(indexes):
     return dict((id(x), x) for x in indexes).values()
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -13,6 +13,7 @@
 from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
                                  assert_series_equal, assert_almost_equal)
 from pandas.core.panel import Panel
+from pandas.tools.merge import concat
 from collections import defaultdict
 import pandas._tseries as lib
 import pandas.core.datetools as dt
@@ -564,6 +565,30 @@ def test_multi_key_multiple_functions(self):
                               'std' : grouped.agg(np.std)})
         assert_frame_equal(agged, expected)
 
+    def test_frame_multi_key_function_list(self):  # agg with a function list
+        data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
+                                 'bar', 'bar', 'bar', 'bar',
+                                 'foo', 'foo', 'foo'],
+                          'B' : ['one', 'one', 'one', 'two',
+                                 'one', 'one', 'one', 'two',
+                                 'two', 'two', 'one'],
+                          'C' : ['dull', 'dull', 'shiny', 'dull',  # strings
+                                 'dull', 'shiny', 'shiny', 'dull',
+                                 'shiny', 'shiny', 'shiny'],
+                          'D' : np.random.randn(11),
+                          'E' : np.random.randn(11),
+                          'F' : np.random.randn(11)})
+
+        grouped = data.groupby(['A', 'B'])  # group on two keys
+        funcs = [np.mean, np.std]
+        agged = grouped.agg(funcs)  # function list applied to whole frame
+        expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs),
+                           grouped['F'].agg(funcs)],
+                          keys=['D', 'E', 'F'], axis=1)  # 'C' is not expected
+        assert(isinstance(agged.index, MultiIndex))
+        assert(isinstance(expected.index, MultiIndex))
+        assert_frame_equal(agged, expected)
+
     def test_groupby_multiple_columns(self):
         data = self.df
         grouped = data.groupby(['A', 'B'])
diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py
@@ -21,6 +21,6 @@ def sample(values, k):
 ts2 = Series(np.random.randn(sz), idx2)
 """
 stmt = "ts1 + ts2"
-bm_align1 = Benchmark(stmt, setup,
-                      name="series_align_int64_index",
-                      start_date=datetime(2010, 6, 1), logy=True)
+series_align_int64_index = Benchmark(stmt, setup,
+                                     start_date=datetime(2010, 6, 1),
+                                     logy=True)
diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py
@@ -17,14 +17,11 @@
 dict_list = [dict(zip(columns, row)) for row in frame.values]
 """
 
-frame_ctor_nested_dict = \
-    Benchmark("DataFrame(data)", setup, name='frame_ctor_nested_dict')
+frame_ctor_nested_dict = Benchmark("DataFrame(data)", setup)
 
 # From JSON-like stuff
 
-frame_ctor_list_of_dict = \
-    Benchmark("DataFrame(dict_list)", setup, name='frame_ctor_list_of_dict',
-              start_date=datetime(2011, 12, 20))
+frame_ctor_list_of_dict = Benchmark("DataFrame(dict_list)", setup,
+                                    start_date=datetime(2011, 12, 20))
 
-series_ctor_from_dict = \
-    Benchmark("Series(some_dict)", setup, name='series_ctor_from_dict')
+series_ctor_from_dict = Benchmark("Series(some_dict)", setup)
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -33,21 +33,17 @@ def f():
 
 stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
 groupby_multi_python = Benchmark(stmt1, setup,
-                                 name="groupby_multi_python",
                                  start_date=datetime(2011, 7, 1))
 
 stmt3 = "df.groupby(['key1', 'key2']).sum()"
 groupby_multi_cython = Benchmark(stmt3, setup,
-                                 name="groupby_multi_cython",
                                  start_date=datetime(2011, 7, 1))
 
 stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
 groupby_multi_series_op = Benchmark(stmt, setup,
-                                    name="groupby_multi_series_op",
                                     start_date=datetime(2011, 8, 1))
 
 groupby_series_simple_cython = \
     Benchmark('simple_series.groupby(key1).sum()', setup,
-              name='groupby_series_simple_cython',
               start_date=datetime(2011, 3, 1))
 
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
@@ -16,6 +16,9 @@
                           if isinstance(v, Benchmark)]
     benchmarks.extend(by_module[modname])
 
+for bm in benchmarks:
+    assert(bm.name is not None)
+
 REPO_PATH = '/home/wesm/code/pandas'
 REPO_URL = 'git@github.com:wesm/pandas.git'
 DB_PATH = '/home/wesm/code/pandas/vb_suite/benchmarks.db'