BUG: implement remaining as_index=False handling in DataFrameGroupBy.aggregate. Addresses GH #181

wesm · wesm · commit aaea503ce211 · 2011-10-01T15:55:49.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -38,6 +38,11 @@ This is an incremental bug fix and performance enhancement release
   - Implemented `BlockManager.take` resulting in significantly faster `take`
     performance on mixed-type `DataFrame` objects (GH #104)
   - Improved performance of `Series.sort_index`
+  - Significant groupby performance enhancement: removed unnecessary integrity
+    checks in DataFrame internals that were slowing down slicing operations to
+    retrieve groups
+  - Added informative Exception when passing dict to DataFrame groupby
+    aggregation with axis != 0
 
 **Bug fixes**
 
@@ -46,6 +51,8 @@ This is an incremental bug fix and performance enhancement release
   - Fixed bug in unstacking code manifesting with more than 3 hierarchical
     levels
   - Throw exception when step specified in label-based slice (GH #185)
+  - Fixed `isnull` to work correctly with `np.float32`. Fixes upstream bug
+    described in GH #182
 
 pandas 0.4.1
 ============
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -88,8 +88,11 @@ def __init__(self, obj, grouper=None, axis=0, level=None,
         self.level = level
 
         if not as_index:
-            assert(isinstance(obj, DataFrame))
-            assert(axis == 0)
+            if not isinstance(obj, DataFrame):
+                raise TypeError('as_index=False only valid with DataFrame')
+            if axis != 0:
+                raise ValueError('as_index=False only valid for axis=0')
+
         self.as_index = as_index
 
         if groupings is None:
@@ -686,6 +689,9 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
             else:
                 ret = Series({})
 
+        if not self.as_index:  # pragma: no cover
+            print 'Warning, ignoring as_index=True'
+
         return ret
 
     def _wrap_aggregated_output(self, output, mask):
@@ -858,6 +864,9 @@ def aggregate(self, arg, *args, **kwargs):
 
         result = {}
         if isinstance(arg, dict):
+            if self.axis != 0:  # pragma: no cover
+                raise ValueError('Can only pass dict with axis=0')
+
             for col, func in arg.iteritems():
                 result[col] = self[col].agg(func)
 
@@ -870,6 +879,19 @@ def aggregate(self, arg, *args, **kwargs):
                     return self._aggregate_item_by_item(arg, *args, **kwargs)
             result = self._aggregate_generic(arg, *args, **kwargs)
 
+        if not self.as_index:
+            if isinstance(result.index, MultiIndex):
+                zipped = zip(result.index.levels, result.index.labels,
+                             result.index.names)
+                for i, (lev, lab, name) in enumerate(zipped):
+                    result.insert(i, name, lev.values.take(lab))
+                result = result.consolidate()
+            else:
+                values = result.index.values
+                name = self.groupings[0].name
+                result.insert(0, name, values)
+            result.index = np.arange(len(result))
+
         return result
 
     def _aggregate_generic(self, func, *args, **kwargs):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -517,7 +517,34 @@ def _check_op(op):
 
         assert_series_equal(result, expected)
 
-    def test_groupby_as_index(self):
+    def test_groupby_as_index_agg(self):
+        grouped = self.df.groupby('A', as_index=False)
+
+        # single-key
+
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        assert_frame_equal(result, expected)
+
+        result2 = grouped.agg({'C' : np.mean, 'D' : np.sum})
+        expected2 = grouped.mean()
+        expected2['D'] = grouped.sum()['D']
+        assert_frame_equal(result2, expected2)
+
+        # multi-key
+
+        grouped = self.df.groupby(['A', 'B'], as_index=False)
+
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        assert_frame_equal(result, expected)
+
+        result2 = grouped.agg({'C' : np.mean, 'D' : np.sum})
+        expected2 = grouped.mean()
+        expected2['D'] = grouped.sum()['D']
+        assert_frame_equal(result2, expected2)
+
+    def test_groupby_as_index_cython(self):
         data = self.df
 
         # single-key