From a492b5afcdaa211545d8eec62892821338d39f27 Mon Sep 17 00:00:00 2001 From: Matt Hagy Date: Tue, 8 Nov 2016 17:14:46 -0800 Subject: [PATCH 1/9] Fix indent level bug preventing wrapper function rename Original code intends to rename the wrapper function f using the provided name, but this isn't happening because code is incorrectly indented an extra level. Example: >>> from pandas.core.groupby import GroupBy >>> GroupBy.sum.__name__ 'f' Should be 'sum'. --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2cc68bcabdd22..227e14d309306 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -194,7 +194,7 @@ def f(self, **kwargs): result = result._convert(datetime=True) return result - f.__name__ = name + f.__name__ = name return f From 2a54b77ac0150bf31d5e77bf8caa67fb7c1f9090 Mon Sep 17 00:00:00 2001 From: Matt Hagy Date: Tue, 8 Nov 2016 18:39:10 -0800 Subject: [PATCH 2/9] Test renaming of _groupby_function wrapper function --- pandas/tests/groupby/test_groupby.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9f5a7f404e2be..481522b38a34f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3929,6 +3929,12 @@ def test_tab_completion(self): 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) self.assertEqual(results, expected) + def test_groupby_function_rename(self): + grp = self.mframe.groupby(level='second') + for name in ['sum', 'prod', 'min', 'max', 'first', 'last']: + f = getattr(grp, name) + self.assertEqual(f.__name__, name) + def test_lower_int_prec_count(self): df = DataFrame({'a': np.array( [0, 1, 2, 100], np.int8), From 033e42d3051785d14fe39040ecee8db787126630 Mon Sep 17 00:00:00 2001 From: Matt Hagy Date: Tue, 8 Nov 2016 19:06:05 -0800 Subject: [PATCH 3/9] Test for consistency of attribute and method names Commented out and marked with a TODO since some are currently inconsistent and not immediately obvious how to fix all of them. --- pandas/tests/groupby/test_groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 481522b38a34f..f60be63c2882e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3840,7 +3840,9 @@ def test_groupby_whitelist(self): gb = obj.groupby(df.letters) self.assertEqual(whitelist, gb._apply_whitelist) for m in whitelist: - getattr(type(gb), m) + f = getattr(type(gb), m) + # TODO: Fix inconsistencies between attribute and method names + # self.assertEqual(f.__name__, m) AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] From 3bf899366ef0e0f582bb06c8fd0d1dd7f288374d Mon Sep 17 00:00:00 2001 From: Matt Hagy Date: Tue, 8 Nov 2016 19:47:24 -0800 Subject: [PATCH 4/9] Revise attribute/method consistency check to skip known inconsistencies --- pandas/tests/groupby/test_groupby.py | 35 ++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f60be63c2882e..8655eca289895 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3836,13 +3836,44 @@ def test_groupby_whitelist(self): 'nsmallest', ]) + # TODO: Fix these inconsistencies between attribute and method names + inconsistently_named = frozenset([ + 'tshift', + 'any', + 'dtypes', + 'idxmax', + 'all', + 'fillna', + 'rank', + 'quantile', + 'cummax', + 'take', + 'corr', + 'cummin', + 'diff', + 'plot', + 'pct_change', + 'skew', + 'hist', + 'bfill', + 'cov', + 'boxplot', + 'describe', + 'corrwith', + 'idxmin', + 'ffill', + 'mad', + 'dtype', + 'unique' + ]) + for obj, whitelist in zip((df, s), (df_whitelist, s_whitelist)): gb = obj.groupby(df.letters) self.assertEqual(whitelist, gb._apply_whitelist) for m in whitelist: f = getattr(type(gb), m) - # TODO: Fix inconsistencies between attribute and method names - # self.assertEqual(f.__name__, m) + if m not in inconsistently_named: + self.assertEqual(f.__name__, m) AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] From 68013bf832070f8b09b2a70fddfa4be8c7e81f97 Mon Sep 17 00:00:00 2001 From: Matt Hagy Date: Mon, 14 Nov 2016 16:56:08 -0800 Subject: [PATCH 5/9] Added a test for known inconsistent attribute/method names --- pandas/tests/groupby/test_groupby.py | 74 +++++++++++++++------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8655eca289895..de24e9bb56c32 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3753,6 +3753,19 @@ def test_groupby_selection_with_methods(self): assert_frame_equal(g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)) + # The methods returned by these attributes don't have a __name__ attribute + # that matches that attribute. + # TODO: Fix these inconsistencies + DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([ + 'boxplot', + 'bfill', + 'ffill' + ]) + S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([ + 'bfill', + 'ffill' + ]) + def test_groupby_whitelist(self): from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) @@ -3836,44 +3849,37 @@ def test_groupby_whitelist(self): 'nsmallest', ]) - # TODO: Fix these inconsistencies between attribute and method names - inconsistently_named = frozenset([ - 'tshift', - 'any', - 'dtypes', - 'idxmax', - 'all', - 'fillna', - 'rank', - 'quantile', - 'cummax', - 'take', - 'corr', - 'cummin', - 'diff', - 'plot', - 'pct_change', - 'skew', - 'hist', - 'bfill', - 'cov', - 'boxplot', - 'describe', - 'corrwith', - 'idxmin', - 'ffill', - 'mad', - 'dtype', - 'unique' - ]) - - for obj, whitelist in zip((df, s), (df_whitelist, s_whitelist)): + names_dont_match_pair = (self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, + self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) + for obj, whitelist, names_dont_match in zip((df, s), (df_whitelist, s_whitelist), names_dont_match_pair): gb = obj.groupby(df.letters) self.assertEqual(whitelist, gb._apply_whitelist) for m in whitelist: f = getattr(type(gb), m) - if m not in inconsistently_named: - self.assertEqual(f.__name__, m) + try: + n = f.__name__ + except AttributeError: + continue + if m not in names_dont_match: + self.assertEqual(n, m) + + def test_groupby_method_names_that_dont_match_attribute(self): + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 10 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + gb = df.groupby(df.letters) + s = df.floats + + names_dont_match_pair = (self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, + self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) + for obj, names_dont_match in zip((df, s), names_dont_match_pair): + gb = obj.groupby(df.letters) + for m in names_dont_match: + f = getattr(gb, m) + self.assertNotEqual(f.__name__, m) AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] From 781b9b38bab79ab3ec9539c657e74862814931e3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 16:32:29 -0400 Subject: [PATCH 6/9] Move _groupby_function inside GroupBy Add support for __qualname__ --- pandas/core/groupby.py | 140 ++++++++++++++------------- pandas/tests/groupby/test_groupby.py | 34 +++++-- 2 files changed, 98 insertions(+), 76 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 227e14d309306..ba60cda94b247 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -12,8 +12,8 @@ ) from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.compat.numpy import _np_version_under1p8 +from pandas.compat.numpy import function as nv, _np_version_under1p8 +from pandas.compat import set_function_name from pandas.types.common import (is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, @@ -172,64 +172,6 @@ 'cummin', 'cummax']) -def _groupby_function(name, alias, npfunc, numeric_only=True, - _convert=False): - - _local_template = "Compute %(f)s of group values" - - @Substitution(name='groupby', f=name) - @Appender(_doc_template) - @Appender(_local_template) - def f(self, **kwargs): - if 'numeric_only' not in kwargs: - kwargs['numeric_only'] = numeric_only - self._set_group_selection() - try: - return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError as e: - raise SpecificationError(str(e)) - except Exception: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - if _convert: - result = result._convert(datetime=True) - return result - - f.__name__ = name - - return f - - -def _first_compat(x, axis=0): - - def _first(x): - - x = np.asarray(x) - x = x[notnull(x)] - if len(x) == 0: - return np.nan - return x[0] - - if isinstance(x, DataFrame): - return x.apply(_first, axis=axis) - else: - return _first(x) - - -def _last_compat(x, axis=0): - def _last(x): - - x = np.asarray(x) - x = x[notnull(x)] - if len(x) == 0: - return np.nan - return x[-1] - - if isinstance(x, DataFrame): - return x.apply(_last, axis=axis) - else: - return _last(x) - - class Grouper(object): """ A Grouper allows the user to specify a groupby instruction for a target @@ -1184,14 +1126,74 @@ def size(self): result.name = getattr(self, 'name', None) return result - sum = _groupby_function('sum', 'add', np.sum) - prod = _groupby_function('prod', 'prod', np.prod) - min = _groupby_function('min', 'min', np.min, numeric_only=False) - max = _groupby_function('max', 'max', np.max, numeric_only=False) - first = _groupby_function('first', 'first', _first_compat, - numeric_only=False, _convert=True) - last = _groupby_function('last', 'last', _last_compat, numeric_only=False, - _convert=True) + @classmethod + def _add_numeric_operations(cls): + """ add numeric operations to the GroupBy generically """ + + def _groupby_function(name, alias, npfunc, + numeric_only=True, _convert=False): + + _local_template = "Compute %(f)s of group values" + + @Substitution(name='groupby', f=name) + @Appender(_doc_template) + @Appender(_local_template) + def f(self, **kwargs): + if 'numeric_only' not in kwargs: + kwargs['numeric_only'] = numeric_only + self._set_group_selection() + try: + return self._cython_agg_general(alias, alt=npfunc, **kwargs) + except AssertionError as e: + raise SpecificationError(str(e)) + except Exception: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + if _convert: + result = result._convert(datetime=True) + return result + + set_function_name(f, name, cls) + + return f + + def _first_compat(x, axis=0): + + def _first(x): + + x = np.asarray(x) + x = x[notnull(x)] + if len(x) == 0: + return np.nan + return x[0] + + if isinstance(x, DataFrame): + return x.apply(_first, axis=axis) + else: + return _first(x) + + + def _last_compat(x, axis=0): + def _last(x): + + x = np.asarray(x) + x = x[notnull(x)] + if len(x) == 0: + return np.nan + return x[-1] + + if isinstance(x, DataFrame): + return x.apply(_last, axis=axis) + else: + return _last(x) + + cls.sum = _groupby_function('sum', 'add', np.sum) + cls.prod = _groupby_function('prod', 'prod', np.prod) + cls.min = _groupby_function('min', 'min', np.min, numeric_only=False) + cls.max = _groupby_function('max', 'max', np.max, numeric_only=False) + cls.first = _groupby_function('first', 'first', _first_compat, + numeric_only=False, _convert=True) + cls.last = _groupby_function('last', 'last', _last_compat, numeric_only=False, + _convert=True) @Substitution(name='groupby') @Appender(_doc_template) @@ -1603,6 +1605,8 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] +GroupBy._add_numeric_operations() + @Appender(GroupBy.__doc__) def groupby(obj, by, **kwds): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index de24e9bb56c32..c67c0538c39db 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3849,19 +3849,36 @@ def test_groupby_whitelist(self): 'nsmallest', ]) - names_dont_match_pair = (self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, - self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) - for obj, whitelist, names_dont_match in zip((df, s), (df_whitelist, s_whitelist), names_dont_match_pair): + names_dont_match_pair = ( + self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, + self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) + for obj, whitelist, names_dont_match in ( + zip((df, s), + (df_whitelist, s_whitelist), + names_dont_match_pair)): + gb = obj.groupby(df.letters) - self.assertEqual(whitelist, gb._apply_whitelist) + + assert whitelist == gb._apply_whitelist for m in whitelist: f = getattr(type(gb), m) + + # name try: n = f.__name__ except AttributeError: continue if m not in names_dont_match: - self.assertEqual(n, m) + assert n == m + + # qualname + if compat.PY3: + try: + n = f.__qualname__ + except AttributeError: + continue + if m not in names_dont_match: + assert n.endswith(m) def test_groupby_method_names_that_dont_match_attribute(self): from string import ascii_lowercase @@ -3873,9 +3890,10 @@ def test_groupby_method_names_that_dont_match_attribute(self): gb = df.groupby(df.letters) s = df.floats - names_dont_match_pair = (self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, - self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) - for obj, names_dont_match in zip((df, s), names_dont_match_pair): + names_dont_match_pair = ( + self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, + self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) + for obj, names_dont_match in zip((df, s), names_dont_match_pair): gb = obj.groupby(df.letters) for m in names_dont_match: f = getattr(gb, m) From 8b185b4f57dcd53821bf06114bed0f4192a410ad Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 18:17:35 -0400 Subject: [PATCH 7/9] PEP --- pandas/core/groupby.py | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba60cda94b247..fe764a099bb63 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1130,8 +1130,8 @@ def size(self): def _add_numeric_operations(cls): """ add numeric operations to the GroupBy generically """ - def _groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False): + def groupby_function(name, alias, npfunc, + numeric_only=True, _convert=False): _local_template = "Compute %(f)s of group values" @@ -1143,11 +1143,13 @@ def f(self, **kwargs): kwargs['numeric_only'] = numeric_only self._set_group_selection() try: - return self._cython_agg_general(alias, alt=npfunc, **kwargs) + return self._cython_agg_general( + alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self.aggregate( + lambda x: npfunc(x, axis=self.axis)) if _convert: result = result._convert(datetime=True) return result @@ -1156,9 +1158,9 @@ def f(self, **kwargs): return f - def _first_compat(x, axis=0): + def first_compat(x, axis=0): - def _first(x): + def first(x): x = np.asarray(x) x = x[notnull(x)] @@ -1167,13 +1169,13 @@ def _first(x): return x[0] if isinstance(x, DataFrame): - return x.apply(_first, axis=axis) + return x.apply(first, axis=axis) else: - return _first(x) + return first(x) + def last_compat(x, axis=0): - def _last_compat(x, axis=0): - def _last(x): + def last(x): x = np.asarray(x) x = x[notnull(x)] @@ -1182,18 +1184,18 @@ def _last(x): return x[-1] if isinstance(x, DataFrame): - return x.apply(_last, axis=axis) + return x.apply(last, axis=axis) else: - return _last(x) + return last(x) - cls.sum = _groupby_function('sum', 'add', np.sum) - cls.prod = _groupby_function('prod', 'prod', np.prod) - cls.min = _groupby_function('min', 'min', np.min, numeric_only=False) - cls.max = _groupby_function('max', 'max', np.max, numeric_only=False) - cls.first = _groupby_function('first', 'first', _first_compat, - numeric_only=False, _convert=True) - cls.last = _groupby_function('last', 'last', _last_compat, numeric_only=False, - _convert=True) + cls.sum = groupby_function('sum', 'add', np.sum) + cls.prod = groupby_function('prod', 'prod', np.prod) + cls.min = groupby_function('min', 'min', np.min, numeric_only=False) + cls.max = groupby_function('max', 'max', np.max, numeric_only=False) + cls.first = groupby_function('first', 'first', first_compat, + numeric_only=False, _convert=True) + cls.last = groupby_function('last', 'last', last_compat, + numeric_only=False, _convert=True) @Substitution(name='groupby') @Appender(_doc_template) @@ -1605,6 +1607,7 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] + GroupBy._add_numeric_operations() From 205489b511528d23752506700c684b91b2a232c0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Mar 2017 17:04:07 -0400 Subject: [PATCH 8/9] doc --- doc/source/whatsnew/v0.20.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 15566d207e31f..519d1e9f58cc8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -883,6 +883,7 @@ Bug Fixes - Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`) +- Properly set ``__name__`` and ``__qualname__`` for ``Groupby.*`` functions (:issue:`14620`) - Bug in ``.at`` when selecting from a tz-aware column (:issue:`15822`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`) From db3c6e4645f2d60f076c857df55f5b82743ff280 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Mar 2017 17:14:06 -0400 Subject: [PATCH 9/9] clean/reorg tests --- pandas/tests/groupby/common.py | 36 +-- pandas/tests/groupby/test_groupby.py | 286 ----------------------- pandas/tests/groupby/test_whitelist.py | 301 +++++++++++++++++++++++++ 3 files changed, 324 insertions(+), 299 deletions(-) create mode 100644 pandas/tests/groupby/test_whitelist.py diff --git a/pandas/tests/groupby/common.py b/pandas/tests/groupby/common.py index 8a70777d08682..f3dccf473f53a 100644 --- a/pandas/tests/groupby/common.py +++ b/pandas/tests/groupby/common.py @@ -1,10 +1,31 @@ """ Base setup """ +import pytest import numpy as np from pandas.util import testing as tm from pandas import DataFrame, MultiIndex +@pytest.fixture +def mframe(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + class MixIn(object): def setUp(self): @@ -15,12 +36,7 @@ def setUp(self): self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - + self.df = df() self.df_mixed_floats = DataFrame( {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], @@ -28,13 +44,7 @@ def setUp(self): 'D': np.array( np.random.randn(8), dtype='float32')}) - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + self.mframe = mframe() self.three_group = DataFrame( {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c67c0538c39db..83502434e6053 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3706,292 +3706,6 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) - def test_groupby_selection_with_methods(self): - # some methods which require DatetimeIndex - rng = pd.date_range('2014', periods=len(self.df)) - self.df.index = rng - - g = self.df.groupby(['A'])[['C']] - g_exp = self.df[['C']].groupby(self.df['A']) - # TODO check groupby with > 1 col ? - - # methods which are called as .foo() - methods = ['count', - 'corr', - 'cummax', - 'cummin', - 'cumprod', - 'describe', - 'rank', - 'quantile', - 'diff', - 'shift', - 'all', - 'any', - 'idxmin', - 'idxmax', - 'ffill', - 'bfill', - 'pct_change', - 'tshift'] - - for m in methods: - res = getattr(g, m)() - exp = getattr(g_exp, m)() - assert_frame_equal(res, exp) # should always be frames! - - # methods which aren't just .foo() - assert_frame_equal(g.fillna(0), g_exp.fillna(0)) - assert_frame_equal(g.dtypes, g_exp.dtypes) - assert_frame_equal(g.apply(lambda x: x.sum()), - g_exp.apply(lambda x: x.sum())) - - assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean()) - assert_frame_equal(g.resample('D').ohlc(), - g_exp.resample('D').ohlc()) - - assert_frame_equal(g.filter(lambda x: len(x) == 3), - g_exp.filter(lambda x: len(x) == 3)) - - # The methods returned by these attributes don't have a __name__ attribute - # that matches that attribute. - # TODO: Fix these inconsistencies - DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([ - 'boxplot', - 'bfill', - 'ffill' - ]) - S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE = frozenset([ - 'bfill', - 'ffill' - ]) - - def test_groupby_whitelist(self): - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 10 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - s = df.floats - - df_whitelist = frozenset([ - 'last', - 'first', - 'mean', - 'sum', - 'min', - 'max', - 'head', - 'tail', - 'cumcount', - 'resample', - 'rank', - 'quantile', - 'fillna', - 'mad', - 'any', - 'all', - 'take', - 'idxmax', - 'idxmin', - 'shift', - 'tshift', - 'ffill', - 'bfill', - 'pct_change', - 'skew', - 'plot', - 'boxplot', - 'hist', - 'median', - 'dtypes', - 'corrwith', - 'corr', - 'cov', - 'diff', - ]) - s_whitelist = frozenset([ - 'last', - 'first', - 'mean', - 'sum', - 'min', - 'max', - 'head', - 'tail', - 'cumcount', - 'resample', - 'rank', - 'quantile', - 'fillna', - 'mad', - 'any', - 'all', - 'take', - 'idxmax', - 'idxmin', - 'shift', - 'tshift', - 'ffill', - 'bfill', - 'pct_change', - 'skew', - 'plot', - 'hist', - 'median', - 'dtype', - 'corr', - 'cov', - 'diff', - 'unique', - 'nlargest', - 'nsmallest', - ]) - - names_dont_match_pair = ( - self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, - self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) - for obj, whitelist, names_dont_match in ( - zip((df, s), - (df_whitelist, s_whitelist), - names_dont_match_pair)): - - gb = obj.groupby(df.letters) - - assert whitelist == gb._apply_whitelist - for m in whitelist: - f = getattr(type(gb), m) - - # name - try: - n = f.__name__ - except AttributeError: - continue - if m not in names_dont_match: - assert n == m - - # qualname - if compat.PY3: - try: - n = f.__qualname__ - except AttributeError: - continue - if m not in names_dont_match: - assert n.endswith(m) - - def test_groupby_method_names_that_dont_match_attribute(self): - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 10 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - gb = df.groupby(df.letters) - s = df.floats - - names_dont_match_pair = ( - self.DF_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE, - self.S_METHOD_NAMES_THAT_DONT_MATCH_ATTRIBUTE) - for obj, names_dont_match in zip((df, s), names_dont_match_pair): - gb = obj.groupby(df.letters) - for m in names_dont_match: - f = getattr(gb, m) - self.assertNotEqual(f.__name__, m) - - AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', - 'mad', 'std', 'var', 'sem'] - AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] - - def test_regression_whitelist_methods(self): - - # GH6944 - # explicity test the whitelest methods - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - raw_frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - raw_frame.iloc[1, [1, 2]] = np.nan - raw_frame.iloc[7, [0, 1]] = np.nan - - for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, - lrange(2), lrange(2), - [True, False]): - - if axis == 0: - frame = raw_frame - else: - frame = raw_frame.T - - if op in self.AGG_FUNCTIONS_WITH_SKIPNA: - grouped = frame.groupby(level=level, axis=axis) - result = getattr(grouped, op)(skipna=skipna) - expected = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) - assert_frame_equal(result, expected) - else: - grouped = frame.groupby(level=level, axis=axis) - result = getattr(grouped, op)() - expected = getattr(frame, op)(level=level, axis=axis) - assert_frame_equal(result, expected) - - def test_groupby_blacklist(self): - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 10 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - s = df.floats - - blacklist = [ - 'eval', 'query', 'abs', 'where', - 'mask', 'align', 'groupby', 'clip', 'astype', - 'at', 'combine', 'consolidate', 'convert_objects', - ] - to_methods = [method for method in dir(df) if method.startswith('to_')] - - blacklist.extend(to_methods) - - # e.g., to_csv - defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the " - "'apply' method$)") - - # e.g., query, eval - not_defined = "(?:^{1!r} object has no attribute {0!r}$)" - fmt = defined_but_not_allowed + '|' + not_defined - for bl in blacklist: - for obj in (df, s): - gb = obj.groupby(df.letters) - msg = fmt.format(bl, type(gb).__name__) - with tm.assertRaisesRegexp(AttributeError, msg): - getattr(gb, bl) - - def test_tab_completion(self): - grp = self.mframe.groupby(level='second') - results = set([v for v in dir(grp) if not v.startswith('_')]) - expected = set( - ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) - self.assertEqual(results, expected) - - def test_groupby_function_rename(self): - grp = self.mframe.groupby(level='second') - for name in ['sum', 'prod', 'min', 'max', 'first', 'last']: - f = getattr(grp, name) - self.assertEqual(f.__name__, name) - def test_lower_int_prec_count(self): df = DataFrame({'a': np.array( [0, 1, 2, 100], np.int8), diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py new file mode 100644 index 0000000000000..d566f34b7eae8 --- /dev/null +++ b/pandas/tests/groupby/test_whitelist.py @@ -0,0 +1,301 @@ +""" +test methods relating to generic function evaluation +the so-called white/black lists +""" + +import pytest +from string import ascii_lowercase +import numpy as np +from pandas import DataFrame, Series, compat, date_range, Index, MultiIndex +from pandas.util import testing as tm +from pandas.compat import lrange, product + +AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'mad', 'std', 'var', 'sem'] +AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] + +df_whitelist = frozenset([ + 'last', + 'first', + 'mean', + 'sum', + 'min', + 'max', + 'head', + 'tail', + 'cumcount', + 'resample', + 'rank', + 'quantile', + 'fillna', + 'mad', + 'any', + 'all', + 'take', + 'idxmax', + 'idxmin', + 'shift', + 'tshift', + 'ffill', + 'bfill', + 'pct_change', + 'skew', + 'plot', + 'boxplot', + 'hist', + 'median', + 'dtypes', + 'corrwith', + 'corr', + 'cov', + 'diff', +]) + +s_whitelist = frozenset([ + 'last', + 'first', + 'mean', + 'sum', + 'min', + 'max', + 'head', + 'tail', + 'cumcount', + 'resample', + 'rank', + 'quantile', + 'fillna', + 'mad', + 'any', + 'all', + 'take', + 'idxmax', + 'idxmin', + 'shift', + 'tshift', + 'ffill', + 'bfill', + 'pct_change', + 'skew', + 'plot', + 'hist', + 'median', + 'dtype', + 'corr', + 'cov', + 'diff', + 'unique', + 'nlargest', + 'nsmallest', +]) + + +@pytest.fixture +def mframe(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + +@pytest.fixture +def df_letters(): + letters = np.array(list(ascii_lowercase)) + N = 10 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + return df + + +@pytest.mark.parametrize( + "obj, whitelist", zip((df_letters(), df_letters().floats), + (df_whitelist, s_whitelist))) +def test_groupby_whitelist(df_letters, obj, whitelist): + df = df_letters + + # these are aliases so ok to have the alias __name__ + alias = {'bfill': 'backfill', + 'ffill': 'pad', + 'boxplot': None} + + gb = obj.groupby(df.letters) + + assert whitelist == gb._apply_whitelist + for m in whitelist: + + m = alias.get(m, m) + if m is None: + continue + + f = getattr(type(gb), m) + + # name + try: + n = f.__name__ + except AttributeError: + continue + assert n == m + + # qualname + if compat.PY3: + try: + n = f.__qualname__ + except AttributeError: + continue + assert n.endswith(m) + + +@pytest.fixture +def raw_frame(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + raw_frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + raw_frame.iloc[1, [1, 2]] = np.nan + raw_frame.iloc[7, [0, 1]] = np.nan + return raw_frame + + +@pytest.mark.parametrize( + "op, level, axis, skipna", + product(AGG_FUNCTIONS, + lrange(2), lrange(2), + [True, False])) +def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna): + # GH6944 + # explicity test the whitelest methods + + if axis == 0: + frame = raw_frame + else: + frame = raw_frame.T + + if op in AGG_FUNCTIONS_WITH_SKIPNA: + grouped = frame.groupby(level=level, axis=axis) + result = getattr(grouped, op)(skipna=skipna) + expected = getattr(frame, op)(level=level, axis=axis, + skipna=skipna) + tm.assert_frame_equal(result, expected) + else: + grouped = frame.groupby(level=level, axis=axis) + result = getattr(grouped, op)() + expected = getattr(frame, op)(level=level, axis=axis) + tm.assert_frame_equal(result, expected) + + +def test_groupby_blacklist(df_letters): + df = df_letters + s = df_letters.floats + + blacklist = [ + 'eval', 'query', 'abs', 'where', + 'mask', 'align', 'groupby', 'clip', 'astype', + 'at', 'combine', 'consolidate', 'convert_objects', + ] + to_methods = [method for method in dir(df) if method.startswith('to_')] + + blacklist.extend(to_methods) + + # e.g., to_csv + defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the " + "'apply' method$)") + + # e.g., query, eval + not_defined = "(?:^{1!r} object has no attribute {0!r}$)" + fmt = defined_but_not_allowed + '|' + not_defined + for bl in blacklist: + for obj in (df, s): + gb = obj.groupby(df.letters) + msg = fmt.format(bl, type(gb).__name__) + with tm.assertRaisesRegexp(AttributeError, msg): + getattr(gb, bl) + + +def test_tab_completion(mframe): + grp = mframe.groupby(level='second') + results = set([v for v in dir(grp) if not v.startswith('_')]) + expected = set( + ['A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', + 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', + 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', + 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', + 'nunique', 'head', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'all', 'shift', 'skew', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) + assert results == expected + + +def test_groupby_function_rename(mframe): + grp = mframe.groupby(level='second') + for name in ['sum', 'prod', 'min', 'max', 'first', 'last']: + f = getattr(grp, name) + assert f.__name__ == name + + +def test_groupby_selection_with_methods(df): + # some methods which require DatetimeIndex + rng = date_range('2014', periods=len(df)) + df.index = rng + + g = df.groupby(['A'])[['C']] + g_exp = df[['C']].groupby(df['A']) + # TODO check groupby with > 1 col ? + + # methods which are called as .foo() + methods = ['count', + 'corr', + 'cummax', + 'cummin', + 'cumprod', + 'describe', + 'rank', + 'quantile', + 'diff', + 'shift', + 'all', + 'any', + 'idxmin', + 'idxmax', + 'ffill', + 'bfill', + 'pct_change', + 'tshift'] + + for m in methods: + res = getattr(g, m)() + exp = getattr(g_exp, m)() + + # should always be frames! + tm.assert_frame_equal(res, exp) + + # methods which aren't just .foo() + tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + tm.assert_frame_equal(g.dtypes, g_exp.dtypes) + tm.assert_frame_equal(g.apply(lambda x: x.sum()), + g_exp.apply(lambda x: x.sum())) + + tm.assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean()) + tm.assert_frame_equal(g.resample('D').ohlc(), + g_exp.resample('D').ohlc()) + + tm.assert_frame_equal(g.filter(lambda x: len(x) == 3), + g_exp.filter(lambda x: len(x) == 3))