ENH: intercept numpy.sum in groupby, plotting/console improvements

wesm · wesm · commit f58749e39847 · 2012-03-20T11:44:48.000-04:00
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -778,7 +778,7 @@ def reset(self):
 def _put_lines(buf, lines):
     if any(isinstance(x, unicode) for x in lines):
         lines = [unicode(x) for x in lines]
-    print >> buf, '\n'.join(lines)
+    buf.write('\n'.join(lines))
 
 
 if __name__ == '__main__':
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -454,7 +454,9 @@ def _need_info_repr_(self):
             else:
                 return True
         else:
-            if len(self.index) > max_rows:
+            # use the info repr for too many rows or columns (> half terminal width)
+            if (len(self.index) > max_rows or
+                len(self.columns) > terminal_width // 2):
                 return True
             else:
                 buf = StringIO()
@@ -1134,7 +1136,8 @@ def info(self, verbose=True, buf=None):
 
         cols = self.columns
 
-        if verbose:
+        # hack: only list per-column counts when there are fewer than 100 columns
+        if verbose and len(self.columns) < 100:
             lines.append('Data columns:')
             space = max([len(_stringify(k)) for k in self.columns]) + 4
             counts = self.count()
@@ -1145,11 +1148,7 @@ def info(self, verbose=True, buf=None):
                 lines.append(_put_str(col, space) +
                              '%d  non-null values' % count)
         else:
-            if len(cols) <= 2:
-                lines.append('Columns: %s' % repr(cols))
-            else:
-                lines.append('Columns: %s to %s' % (_stringify(cols[0]),
-                                                    _stringify(cols[-1])))
+            lines.append(self.columns.summary(name='Columns'))
 
         counts = self.get_dtype_counts()
         dtypes = ['%s(%d)' % k for k in sorted(counts.iteritems())]
@@ -3960,8 +3959,12 @@ def plot(self, subplots=False, sharex=True, sharey=False, use_index=True,
         if xlim is not None:
             ax.set_xlim(xlim)
 
-        if title and not subplots:
-            ax.set_title(title)
+        if title:
+            if subplots:
+                fig.suptitle(title)
+            else:
+                ax.set_title(title)
+
 
         plt.draw_if_interactive()
         if subplots:
@@ -4011,8 +4014,8 @@ def _bar_plot(self, axes, subplots=False, use_index=True, grid=True,
 
         if legend and not subplots:
             fig = ax.get_figure()
-            fig.legend([r[0] for r in rects], labels, loc='upper center',
-                       fancybox=True, ncol=6)
+            fig.legend([r[0] for r in rects], labels, loc='lower center',
+                       fancybox=True, ncol=6, borderaxespad=20)
                        #mode='expand')
 
         import matplotlib.pyplot as plt
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -899,6 +899,10 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
         if hasattr(func_or_funcs,'__iter__'):
             ret = self._aggregate_multiple_funcs(func_or_funcs)
         else:
+            cyfunc = _intercept_cython(func_or_funcs)
+            if cyfunc and not args and not kwargs:
+                return getattr(self, cyfunc)()
+
             if len(self.grouper.groupings) > 1:
                 return self._python_agg_general(func_or_funcs, *args, **kwargs)
 
@@ -1158,6 +1162,10 @@ def aggregate(self, arg, *args, **kwargs):
         elif isinstance(arg, list):
             return self._aggregate_multiple_funcs(arg)
         else:
+            cyfunc = _intercept_cython(arg)
+            if cyfunc and not args and not kwargs:
+                return getattr(self, cyfunc)()
+
             if len(self.grouper.groupings) > 1:
                 return self._python_agg_general(arg, *args, **kwargs)
             else:
@@ -1194,7 +1202,7 @@ def _aggregate_multiple_funcs(self, arg):
                                      grouper=self.grouper)
                 results.append(colg.agg(arg))
                 keys.append(col)
-            except TypeError:
+            except (TypeError, GroupByError):
                 pass
 
         result = concat(results, keys=keys, axis=1)
@@ -1634,9 +1642,20 @@ def _reorder_by_uniques(uniques, labels):
     __builtin__.sum : np.sum
 }
 
+_cython_table = {
+    __builtin__.sum : 'sum',
+    np.sum : 'sum',
+    np.mean : 'mean',
+    np.std : 'std',
+    np.var : 'var'
+}
+
 def _intercept_function(func):
     return _func_table.get(func, func)
 
+def _intercept_cython(func):
+    return _cython_table.get(func)
+
 def _groupby_indices(values):
     if values.dtype != np.object_:
         values = values.astype('O')
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -113,13 +113,14 @@ def _has_complex_internals(self):
         # to disable groupby tricks in MultiIndex
         return False
 
-    def summary(self):
+    def summary(self, name=None):
         if len(self) > 0:
             index_summary = ', %s to %s' % (str(self[0]), str(self[-1]))
         else:
             index_summary = ''
 
-        name = type(self).__name__
+        if name is None:
+            name = type(self).__name__
         return '%s: %s entries%s' % (name, len(self), index_summary)
 
     def __str__(self):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -202,7 +202,8 @@ def test_agg_apply_corner(self):
         grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
         assert_frame_equal(grouped.sum(),
                            DataFrame(columns=self.tsframe.columns))
-        assert_frame_equal(grouped.agg(np.sum), DataFrame({}))
+        assert_frame_equal(grouped.agg(np.sum),
+                           DataFrame(columns=self.tsframe.columns))
         assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
 
     def test_agg_grouping_is_list_tuple(self):
@@ -863,7 +864,8 @@ def test_omit_nuisance(self):
 
         # won't work with axis = 1
         grouped = df.groupby({'A' : 0, 'C' : 0, 'D' : 1, 'E' : 1}, axis=1)
-        result = self.assertRaises(TypeError, grouped.agg, np.sum)
+        result = self.assertRaises(TypeError, grouped.agg,
+                                   lambda x: x.sum(1, numeric_only=False))
 
     def test_omit_nuisance_python_multiple(self):
         grouped = self.three_group.groupby(['A', 'B'])
@@ -1552,6 +1554,27 @@ def test_column_select_via_attr(self):
         expected = self.df.groupby('A').agg(np.mean)
         assert_frame_equal(result, expected)
 
+    def test_rank_apply(self):
+        lev1 = np.array([rands(10) for _ in xrange(1000)], dtype=object)
+        lev2 = np.array([rands(10) for _ in xrange(130)], dtype=object)
+        lab1 = np.random.randint(0, 1000, size=10000)
+        lab2 = np.random.randint(0, 130, size=10000)
+
+        df = DataFrame({'value' : np.random.randn(10000),
+                        'key1' : lev1.take(lab1),
+                        'key2' : lev2.take(lab2)})
+
+        result = df.groupby(['key1', 'key2']).value.rank()
+
+        expected = []
+        for key, piece in df.groupby(['key1', 'key2']):
+            expected.append(piece.value.rank())
+        expected = concat(expected, axis=0)
+        expected = expected.reindex(result.index)
+
+        assert_series_equal(result, expected)
+
+
 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
     tups = map(tuple, df[keys].values)
     tups = com._asarray_tuplesafe(tups)