Merge pull request #5105 from cpcloud/enable-groupby-plot-and-completion

cpcloud · cpcloud · commit 7a886b5047dc · 2013-10-04T10:19:17.000-07:00
BUG: allow plot, boxplot, hist and completion on GroupBy objects
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
@@ -74,6 +74,43 @@ Having specific :ref:`dtypes <basics.dtypes>`
 
    df2.dtypes
 
+If you're using IPython, tab completion for column names (as well as public
+attributes) is automatically enabled. Here's a subset of the attributes that
+will be completed:
+
+.. ipython::
+
+   @verbatim
+   In [1]: df2.<TAB>
+
+   df2.A                  df2.boxplot
+   df2.abs                df2.C
+   df2.add                df2.clip
+   df2.add_prefix         df2.clip_lower
+   df2.add_suffix         df2.clip_upper
+   df2.align              df2.columns
+   df2.all                df2.combine
+   df2.any                df2.combineAdd
+   df2.append             df2.combine_first
+   df2.apply              df2.combineMult
+   df2.applymap           df2.compound
+   df2.as_blocks          df2.consolidate
+   df2.asfreq             df2.convert_objects
+   df2.as_matrix          df2.copy
+   df2.astype             df2.corr
+   df2.at                 df2.corrwith
+   df2.at_time            df2.count
+   df2.axes               df2.cov
+   df2.B                  df2.cummax
+   df2.between_time       df2.cummin
+   df2.bfill              df2.cumprod
+   df2.blocks             df2.cumsum
+   df2.bool               df2.D
+
+As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
+tab completed. ``E`` is there as well; the rest of the attributes have been
+truncated for brevity.
+
 Viewing Data
 ------------
 
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
@@ -188,6 +188,45 @@ however pass ``sort=False`` for potential speedups:
    df2.groupby(['X'], sort=True).sum()
    df2.groupby(['X'], sort=False).sum()
 
+.. _groupby.tabcompletion:
+
+``GroupBy`` will tab complete column names (and other attributes).
+
+.. ipython:: python
+   :suppress:
+
+   n = 10
+   weight = np.random.normal(166, 20, size=n)
+   height = np.random.normal(60, 10, size=n)
+   time = date_range('1/1/2000', periods=n)
+   gender = tm.choice(['male', 'female'], size=n)
+   df = DataFrame({'height': height, 'weight': weight,
+                           'gender': gender}, index=time)
+
+.. ipython:: python
+
+   df
+   gb = df.groupby('gender')
+
+
+.. ipython::
+
+   @verbatim
+   In [1]: gb.<TAB>
+   gb.agg        gb.boxplot    gb.cummin     gb.describe   gb.filter     gb.get_group  gb.height     gb.last       gb.median     gb.ngroups    gb.plot       gb.rank       gb.std        gb.transform
+   gb.aggregate  gb.count      gb.cumprod    gb.dtype      gb.first      gb.groups     gb.hist       gb.max        gb.min        gb.nth        gb.prod       gb.resample   gb.sum        gb.var
+   gb.apply      gb.cummax     gb.cumsum     gb.fillna     gb.gender     gb.head       gb.indices    gb.mean       gb.name       gb.ohlc       gb.quantile   gb.size       gb.tail       gb.weight
+
+
+.. ipython:: python
+   :suppress:
+
+   df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+                          'foo', 'bar', 'foo', 'foo'],
+                   'B' : ['one', 'one', 'two', 'three',
+                          'two', 'two', 'one', 'three'],
+                   'C' : randn(8), 'D' : randn(8)})
+
 .. _groupby.multiindex:
 
 GroupBy with MultiIndex
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -380,6 +380,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
    function signature.
  - :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from
    bs4/lxml (:issue:`4770`).
+ - Removed the ``keep_internal`` keyword parameter in
+   ``pandas/core/groupby.py`` because it wasn't being used (:issue:`5102`).
 
 .. _release.bug_fixes-0.13.0:
 
@@ -544,7 +546,7 @@ Bug Fixes
   - Bug in setting with ``ix/loc`` and a mixed int/string index (:issue:`4544`)
   - Make sure series-series boolean comparions are label based (:issue:`4947`)
   - Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`)
-  - Tests/fix for multi-index construction of an all-nan frame (:isue:`4078`)
+  - Tests/fix for multi-index construction of an all-nan frame (:issue:`4078`)
   - Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring
     values of tables with commas (:issue:`5029`)
   - Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable
@@ -555,6 +557,11 @@ Bug Fixes
     type of headers (:issue:`5048`).
   - Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a
     stack overflow (:issue:`3899`).
+  - Fixed a bug where ``groupby`` objects didn't allow plots (:issue:`5102`).
+  - Fixed a bug where ``groupby`` objects weren't tab-completing column names
+    (:issue:`5102`).
+  - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
+    multiple times (:issue:`5102`).
 
 
 pandas 0.12.0
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1,4 +1,5 @@
 import types
+from functools import wraps
 import numpy as np
 
 from pandas.compat import(
@@ -45,14 +46,19 @@
 """
 
 
+# special case to prevent duplicate plots when catching exceptions when
+# forwarding methods from NDFrames
+_plotting_methods = frozenset(['plot', 'boxplot', 'hist'])
+
 _apply_whitelist = frozenset(['last', 'first',
                               'mean', 'sum', 'min', 'max',
                               'head', 'tail',
                               'cumsum', 'cumprod', 'cummin', 'cummax',
                               'resample',
                               'describe',
                               'rank', 'quantile', 'count',
-                              'fillna', 'dtype'])
+                              'fillna', 'dtype']) | _plotting_methods
+
 
 
 class GroupByError(Exception):
@@ -180,7 +186,6 @@ class GroupBy(PandasObject):
     len(grouped) : int
         Number of groups
     """
-
     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, selection=None, as_index=True,
                  sort=True, group_keys=True, squeeze=False):
@@ -244,6 +249,9 @@ def _selection_list(self):
             return [self._selection]
         return self._selection
 
+    def _local_dir(self):
+        return sorted(set(self.obj._local_dir() + list(_apply_whitelist)))
+
     def __getattr__(self, attr):
         if attr in self.obj:
             return self[attr]
@@ -285,6 +293,15 @@ def curried_with_axis(x):
             def curried(x):
                 return f(x, *args, **kwargs)
 
+            # preserve the name so we can detect it when calling plot methods,
+            # to avoid duplicates
+            curried.__name__ = curried_with_axis.__name__ = name
+
+            # special case otherwise extra plots are created when catching the
+            # exception below
+            if name in _plotting_methods:
+                return self.apply(curried)
+
             try:
                 return self.apply(curried_with_axis)
             except Exception:
@@ -348,7 +365,11 @@ def apply(self, func, *args, **kwargs):
         applied : type depending on grouped object and function
         """
         func = _intercept_function(func)
-        f = lambda g: func(g, *args, **kwargs)
+
+        @wraps(func)
+        def f(g):
+            return func(g, *args, **kwargs)
+
         return self._python_apply_general(f)
 
     def _python_apply_general(self, f):
@@ -598,7 +619,7 @@ def __iter__(self):
     def nkeys(self):
         return len(self.groupings)
 
-    def get_iterator(self, data, axis=0, keep_internal=True):
+    def get_iterator(self, data, axis=0):
         """
         Groupby iterator
 
@@ -607,16 +628,14 @@ def get_iterator(self, data, axis=0, keep_internal=True):
         Generator yielding sequence of (name, subsetted object)
         for each group
         """
-        splitter = self._get_splitter(data, axis=axis,
-                                      keep_internal=keep_internal)
+        splitter = self._get_splitter(data, axis=axis)
         keys = self._get_group_keys()
         for key, (i, group) in zip(keys, splitter):
             yield key, group
 
-    def _get_splitter(self, data, axis=0, keep_internal=True):
+    def _get_splitter(self, data, axis=0):
         comp_ids, _, ngroups = self.group_info
-        return get_splitter(data, comp_ids, ngroups, axis=axis,
-                            keep_internal=keep_internal)
+        return get_splitter(data, comp_ids, ngroups, axis=axis)
 
     def _get_group_keys(self):
         if len(self.groupings) == 1:
@@ -627,19 +646,19 @@ def _get_group_keys(self):
             mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels)
             return [mapper.get_key(i) for i in range(ngroups)]
 
-    def apply(self, f, data, axis=0, keep_internal=False):
+    def apply(self, f, data, axis=0):
         mutated = False
-        splitter = self._get_splitter(data, axis=axis,
-                                      keep_internal=keep_internal)
+        splitter = self._get_splitter(data, axis=axis)
         group_keys = self._get_group_keys()
 
         # oh boy
-        if hasattr(splitter, 'fast_apply') and axis == 0:
+        if (f.__name__ not in _plotting_methods and
+            hasattr(splitter, 'fast_apply') and axis == 0):
             try:
                 values, mutated = splitter.fast_apply(f, group_keys)
                 return group_keys, values, mutated
-            except (Exception) as detail:
-                # we detect a mutatation of some kind
+            except Exception:
+                # we detect a mutation of some kind
                 # so take slow path
                 pass
 
@@ -1043,7 +1062,7 @@ def get_iterator(self, data, axis=0):
                 inds = lrange(start, n)
                 yield self.binlabels[-1], data.take(inds, axis=axis)
 
-    def apply(self, f, data, axis=0, keep_internal=False):
+    def apply(self, f, data, axis=0):
         result_keys = []
         result_values = []
         mutated = False
@@ -1617,6 +1636,7 @@ def filter(self, func, dropna=True, *args, **kwargs):
         else:
             return filtered.reindex(self.obj.index) # Fill with NaNs.
 
+
 class NDFrameGroupBy(GroupBy):
 
     def _iterate_slices(self):
@@ -1939,14 +1959,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                         index = key_index
                     else:
                         stacked_values = np.vstack([np.asarray(x)
-                                                for x in values]).T
+                                                    for x in values]).T
 
                         index = values[0].index
                         columns = key_index
 
-                except ValueError:
-                    #GH1738,, values is list of arrays of unequal lengths
-                    # fall through to the outer else caluse
+                except (ValueError, AttributeError):
+                    # GH1738: values is list of arrays of unequal lengths;
+                    # fall through to the outer else clause
                     return Series(values, index=key_index)
 
                 return DataFrame(stacked_values, index=index,
@@ -2268,6 +2288,7 @@ def ohlc(self):
         """
         return self._apply_to_column_groupbys(lambda x: x._cython_agg_general('ohlc'))
 
+
 from pandas.tools.plotting import boxplot_frame_groupby
 DataFrameGroupBy.boxplot = boxplot_frame_groupby
 
@@ -2364,7 +2385,7 @@ class NDArrayGroupBy(GroupBy):
 
 class DataSplitter(object):
 
-    def __init__(self, data, labels, ngroups, axis=0, keep_internal=False):
+    def __init__(self, data, labels, ngroups, axis=0):
         self.data = data
         self.labels = com._ensure_int64(labels)
         self.ngroups = ngroups
@@ -2419,10 +2440,8 @@ def _chop(self, sdata, slice_obj):
 
 
 class FrameSplitter(DataSplitter):
-
-    def __init__(self, data, labels, ngroups, axis=0, keep_internal=False):
-        DataSplitter.__init__(self, data, labels, ngroups, axis=axis,
-                              keep_internal=keep_internal)
+    def __init__(self, data, labels, ngroups, axis=0):
+        super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
 
     def fast_apply(self, f, names):
         # must return keys::list, values::list, mutated::bool
@@ -2445,10 +2464,8 @@ def _chop(self, sdata, slice_obj):
 
 
 class NDFrameSplitter(DataSplitter):
-
-    def __init__(self, data, labels, ngroups, axis=0, keep_internal=False):
-        DataSplitter.__init__(self, data, labels, ngroups, axis=axis,
-                              keep_internal=keep_internal)
+    def __init__(self, data, labels, ngroups, axis=0):
+        super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis)
 
         self.factory = data._constructor
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py