diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 705514ac0c364..85aafd6787f16 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -74,6 +74,43 @@ Having specific :ref:`dtypes ` df2.dtypes +If you're using IPython, tab completion for column names (as well as public +attributes) is automatically enabled. Here's a subset of the attributes that +will be completed: + +.. ipython:: + + @verbatim + In [1]: df2. + + df2.A df2.boxplot + df2.abs df2.C + df2.add df2.clip + df2.add_prefix df2.clip_lower + df2.add_suffix df2.clip_upper + df2.align df2.columns + df2.all df2.combine + df2.any df2.combineAdd + df2.append df2.combine_first + df2.apply df2.combineMult + df2.applymap df2.compound + df2.as_blocks df2.consolidate + df2.asfreq df2.convert_objects + df2.as_matrix df2.copy + df2.astype df2.corr + df2.at df2.corrwith + df2.at_time df2.count + df2.axes df2.cov + df2.B df2.cummax + df2.between_time df2.cummin + df2.bfill df2.cumprod + df2.blocks df2.cumsum + df2.bool df2.D + +As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically +tab completed. ``E`` is there as well; the rest of the attributes have been +truncated for brevity. + Viewing Data ------------ diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index a8900bd83309f..723aee64fd0d9 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -188,6 +188,45 @@ however pass ``sort=False`` for potential speedups: df2.groupby(['X'], sort=True).sum() df2.groupby(['X'], sort=False).sum() +.. _groupby.tabcompletion: + +``GroupBy`` will tab complete column names (and other attributes) + +.. ipython:: python + :suppress: + + n = 10 + weight = np.random.normal(166, 20, size=n) + height = np.random.normal(60, 10, size=n) + time = date_range('1/1/2000', periods=n) + gender = tm.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, + 'gender': gender}, index=time) + +.. ipython:: python + + df + gb = df.groupby('gender') + + +.. 
ipython:: + + @verbatim + In [1]: gb. + gb.agg gb.boxplot gb.cummin gb.describe gb.filter gb.get_group gb.height gb.last gb.median gb.ngroups gb.plot gb.rank gb.std gb.transform + gb.aggregate gb.count gb.cumprod gb.dtype gb.first gb.groups gb.hist gb.max gb.min gb.nth gb.prod gb.resample gb.sum gb.var + gb.apply gb.cummax gb.cumsum gb.fillna gb.gender gb.head gb.indices gb.mean gb.name gb.ohlc gb.quantile gb.size gb.tail gb.weight + + +.. ipython:: python + :suppress: + + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + .. _groupby.multiindex: GroupBy with MultiIndex diff --git a/doc/source/release.rst b/doc/source/release.rst index ebba7444e82d8..9fa111d32e4bb 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -380,6 +380,8 @@ See :ref:`Internal Refactoring` function signature. - :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from bs4/lxml (:issue:`4770`). + - Removed the ``keep_internal`` keyword parameter in + ``pandas/core/groupby.py`` because it wasn't being used (:issue:`5102`). .. _release.bug_fixes-0.13.0: @@ -544,7 +546,7 @@ Bug Fixes - Bug in setting with ``ix/loc`` and a mixed int/string index (:issue:`4544`) - Make sure series-series boolean comparions are label based (:issue:`4947`) - Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`) - - Tests/fix for multi-index construction of an all-nan frame (:isue:`4078`) + - Tests/fix for multi-index construction of an all-nan frame (:issue:`4078`) - Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring values of tables with commas (:issue:`5029`) - Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable @@ -555,6 +557,11 @@ Bug Fixes type of headers (:issue:`5048`). - Fixed a bug where ``DatetimeIndex`` joins with ``PeriodIndex`` caused a stack overflow (:issue:`3899`). 
+ - Fixed a bug where ``groupby`` objects didn't allow plots (:issue:`5102`). + - Fixed a bug where ``groupby`` objects weren't tab-completing column names + (:issue:`5102`). + - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures + multiple times (:issue:`5102`). pandas 0.12.0 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e70c01ffcb12f..8938e48eb493b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1,4 +1,5 @@ import types +from functools import wraps import numpy as np from pandas.compat import( @@ -45,6 +46,10 @@ """ +# special case to prevent duplicate plots when catching exceptions when +# forwarding methods from NDFrames +_plotting_methods = frozenset(['plot', 'boxplot', 'hist']) + _apply_whitelist = frozenset(['last', 'first', 'mean', 'sum', 'min', 'max', 'head', 'tail', @@ -52,7 +57,8 @@ 'resample', 'describe', 'rank', 'quantile', 'count', - 'fillna', 'dtype']) + 'fillna', 'dtype']) | _plotting_methods + class GroupByError(Exception): @@ -180,7 +186,6 @@ class GroupBy(PandasObject): len(grouped) : int Number of groups """ - def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False): @@ -244,6 +249,9 @@ def _selection_list(self): return [self._selection] return self._selection + def _local_dir(self): + return sorted(set(self.obj._local_dir() + list(_apply_whitelist))) + def __getattr__(self, attr): if attr in self.obj: return self[attr] @@ -285,6 +293,15 @@ def curried_with_axis(x): def curried(x): return f(x, *args, **kwargs) + # preserve the name so we can detect it when calling plot methods, + # to avoid duplicates + curried.__name__ = curried_with_axis.__name__ = name + + # special case otherwise extra plots are created when catching the + # exception below + if name in _plotting_methods: + return self.apply(curried) + try: return self.apply(curried_with_axis) except Exception: @@ -348,7 
+365,11 @@ def apply(self, func, *args, **kwargs): applied : type depending on grouped object and function """ func = _intercept_function(func) - f = lambda g: func(g, *args, **kwargs) + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + return self._python_apply_general(f) def _python_apply_general(self, f): @@ -598,7 +619,7 @@ def __iter__(self): def nkeys(self): return len(self.groupings) - def get_iterator(self, data, axis=0, keep_internal=True): + def get_iterator(self, data, axis=0): """ Groupby iterator @@ -607,16 +628,14 @@ def get_iterator(self, data, axis=0, keep_internal=True): Generator yielding sequence of (name, subsetted object) for each group """ - splitter = self._get_splitter(data, axis=axis, - keep_internal=keep_internal) + splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data, axis=0, keep_internal=True): + def _get_splitter(self, data, axis=0): comp_ids, _, ngroups = self.group_info - return get_splitter(data, comp_ids, ngroups, axis=axis, - keep_internal=keep_internal) + return get_splitter(data, comp_ids, ngroups, axis=axis) def _get_group_keys(self): if len(self.groupings) == 1: @@ -627,19 +646,19 @@ def _get_group_keys(self): mapper = _KeyMapper(comp_ids, ngroups, self.labels, self.levels) return [mapper.get_key(i) for i in range(ngroups)] - def apply(self, f, data, axis=0, keep_internal=False): + def apply(self, f, data, axis=0): mutated = False - splitter = self._get_splitter(data, axis=axis, - keep_internal=keep_internal) + splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() # oh boy - if hasattr(splitter, 'fast_apply') and axis == 0: + if (f.__name__ not in _plotting_methods and + hasattr(splitter, 'fast_apply') and axis == 0): try: values, mutated = splitter.fast_apply(f, group_keys) return group_keys, values, mutated - except (Exception) as detail: - # we detect a mutatation of 
some kind + except Exception: + # we detect a mutation of some kind # so take slow path pass @@ -1043,7 +1062,7 @@ def get_iterator(self, data, axis=0): inds = lrange(start, n) yield self.binlabels[-1], data.take(inds, axis=axis) - def apply(self, f, data, axis=0, keep_internal=False): + def apply(self, f, data, axis=0): result_keys = [] result_values = [] mutated = False @@ -1617,6 +1636,7 @@ def filter(self, func, dropna=True, *args, **kwargs): else: return filtered.reindex(self.obj.index) # Fill with NaNs. + class NDFrameGroupBy(GroupBy): def _iterate_slices(self): @@ -1939,14 +1959,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): index = key_index else: stacked_values = np.vstack([np.asarray(x) - for x in values]).T + for x in values]).T index = values[0].index columns = key_index - except ValueError: - #GH1738,, values is list of arrays of unequal lengths - # fall through to the outer else caluse + except (ValueError, AttributeError): + # GH1738: values is list of arrays of unequal lengths; fall + # through to the outer else clause return Series(values, index=key_index) return DataFrame(stacked_values, index=index, @@ -2268,6 +2288,7 @@ def ohlc(self): """ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general('ohlc')) + from pandas.tools.plotting import boxplot_frame_groupby DataFrameGroupBy.boxplot = boxplot_frame_groupby @@ -2364,7 +2385,7 @@ class NDArrayGroupBy(GroupBy): class DataSplitter(object): - def __init__(self, data, labels, ngroups, axis=0, keep_internal=False): + def __init__(self, data, labels, ngroups, axis=0): self.data = data self.labels = com._ensure_int64(labels) self.ngroups = ngroups @@ -2419,10 +2440,8 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): - - def __init__(self, data, labels, ngroups, axis=0, keep_internal=False): - DataSplitter.__init__(self, data, labels, ngroups, axis=axis, - keep_internal=keep_internal) + def __init__(self, data, labels, ngroups, axis=0): +
super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis) def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool @@ -2445,10 +2464,8 @@ def _chop(self, sdata, slice_obj): class NDFrameSplitter(DataSplitter): - - def __init__(self, data, labels, ngroups, axis=0, keep_internal=False): - DataSplitter.__init__(self, data, labels, ngroups, axis=axis, - keep_internal=keep_internal) + def __init__(self, data, labels, ngroups, axis=0): + super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis) self.factory = data._constructor diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index fec6460ea31f3..01cea90fa1e5a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2,6 +2,8 @@ import nose import unittest +from numpy.testing.decorators import slow + from datetime import datetime from numpy import nan @@ -9,8 +11,7 @@ from pandas.core.index import Index, MultiIndex from pandas.core.common import rands from pandas.core.api import Categorical, DataFrame -from pandas.core.groupby import (GroupByError, SpecificationError, DataError, - _apply_whitelist) +from pandas.core.groupby import SpecificationError, DataError from pandas.core.series import Series from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -18,14 +19,12 @@ from pandas.compat import( range, long, lrange, StringIO, lmap, lzip, map, zip, builtins, OrderedDict ) -from pandas import compat, _np_version_under1p7 +from pandas import compat from pandas.core.panel import Panel from pandas.tools.merge import concat from collections import defaultdict import pandas.core.common as com -import pandas.core.datetools as dt import numpy as np -from numpy.testing import assert_equal import pandas.core.nanops as nanops @@ -2728,6 +2727,85 @@ def test_groupby_whitelist(self): with tm.assertRaisesRegexp(AttributeError, msg): getattr(gb, bl) + def 
test_series_groupby_plotting_nominally_works(self): + try: + import matplotlib as mpl + mpl.use('Agg') + except ImportError: + raise nose.SkipTest("matplotlib not installed") + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + gender = tm.choice(['male', 'female'], size=n) + + weight.groupby(gender).plot() + tm.close() + height.groupby(gender).hist() + tm.close() + + @slow + def test_frame_groupby_plot_boxplot(self): + try: + import matplotlib.pyplot as plt + import matplotlib as mpl + mpl.use('Agg') + except ImportError: + raise nose.SkipTest("matplotlib not installed") + tm.close() + + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + gender = tm.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) + gb = df.groupby('gender') + + res = gb.plot() + self.assertEqual(len(plt.get_fignums()), 2) + self.assertEqual(len(res), 2) + tm.close() + + res = gb.boxplot() + self.assertEqual(len(plt.get_fignums()), 1) + self.assertEqual(len(res), 2) + tm.close() + + with tm.assertRaisesRegexp(TypeError, '.*str.+float'): + gb.hist() + + @slow + def test_frame_groupby_hist(self): + try: + import matplotlib.pyplot as plt + import matplotlib as mpl + mpl.use('Agg') + except ImportError: + raise nose.SkipTest("matplotlib not installed") + tm.close() + + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + gender_int = tm.choice([0, 1], size=n) + df_int = DataFrame({'height': height, 'weight': weight, + 'gender': gender_int}) + gb = df_int.groupby('gender') + axes = gb.hist() + self.assertEqual(len(axes), 2) + self.assertEqual(len(plt.get_fignums()), 2) + tm.close() + + def test_tab_completion(self): + grp = self.mframe.groupby(level='second') + results = set([v for v in dir(grp) if not v.startswith('_')]) + expected = set(['A','B','C', + 
'agg','aggregate','apply','boxplot','filter','first','get_group', + 'groups','hist','indices','last','max','mean','median', + 'min','name','ngroups','nth','ohlc','plot', 'prod', + 'size','std','sum','transform','var', 'count', 'head', 'describe', + 'cummax', 'dtype', 'quantile', 'rank', 'cumprod', 'tail', + 'resample', 'cummin', 'fillna', 'cumsum']) + self.assertEqual(results, expected) def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() @@ -2764,7 +2842,5 @@ def testit(label_list, shape): if __name__ == '__main__': - import nose - nose.runmodule( - argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], - exit=False) + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', + '-s'], exit=False)