From d196bb58c5302de61615a8572acb57d817d5a266 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Tue, 10 Oct 2017 16:34:11 -0400 Subject: [PATCH 1/5] Refactor groupby tests for using strings to reference index levels - Extract to separate file (test_index_as_string.py) - Parameterize over test DataFrames - Add series test case - Update test_grouper_column_index_level_precedence to reproduce false warning problem as described in GH17383 - Update test_grouper_column_index_level_precedence to verify when warning shouldn't be raised (Results in test failure due to GH17383) --- pandas/tests/groupby/test_groupby.py | 152 ------------------- pandas/tests/groupby/test_index_as_string.py | 142 +++++++++++++++++ 2 files changed, 142 insertions(+), 152 deletions(-) create mode 100644 pandas/tests/groupby/test_index_as_string.py diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 657de9b589dc9..740526e262d16 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -253,158 +253,6 @@ def test_grouper_column_and_index(self): expected = df_single.reset_index().groupby(['inner', 'B']).mean() assert_frame_equal(result, expected) - def test_grouper_index_level_as_string(self): - # GH 5677, allow strings passed as the `by` parameter to reference - # columns or index levels - - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - - df_single = df_multi.reset_index('outer') - - # Column and Index on MultiIndex - result = df_multi.groupby(['B', 'inner']).mean() - expected = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() - assert_frame_equal(result, expected) - - # Index and Column on MultiIndex - result = df_multi.groupby(['inner', 'B']).mean() - expected = df_multi.groupby([pd.Grouper(level='inner'), 
'B']).mean() - assert_frame_equal(result, expected) - - # Column and Index on single Index - result = df_single.groupby(['B', 'inner']).mean() - expected = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() - assert_frame_equal(result, expected) - - # Index and Column on single Index - result = df_single.groupby(['inner', 'B']).mean() - expected = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() - assert_frame_equal(result, expected) - - # Single element list of Index on MultiIndex - result = df_multi.groupby(['inner']).mean() - expected = df_multi.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - # Single element list of Index on single Index - result = df_single.groupby(['inner']).mean() - expected = df_single.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - # Index on MultiIndex - result = df_multi.groupby('inner').mean() - expected = df_multi.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - # Index on single Index - result = df_single.groupby('inner').mean() - expected = df_single.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - def test_grouper_column_index_level_precedence(self): - # GH 5677, when a string passed as the `by` parameter - # matches a column and an index level the column takes - # precedence - - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi_both = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one'], - 'inner': [1, 1, 1, 1, 1, 1]}, - index=idx) - - df_single_both = df_multi_both.reset_index('outer') - - # Group MultiIndex by single key - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_multi_both.groupby('inner').mean() - - expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) 
- not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean() - assert not result.index.equals(not_expected.index) - - # Group single Index by single key - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_single_both.groupby('inner').mean() - - expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) - not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean() - assert not result.index.equals(not_expected.index) - - # Group MultiIndex by single key list - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_multi_both.groupby(['inner']).mean() - - expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) - not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean() - assert not result.index.equals(not_expected.index) - - # Group single Index by single key list - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_single_both.groupby(['inner']).mean() - - expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) - not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean() - assert not result.index.equals(not_expected.index) - - # Group MultiIndex by two keys (1) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_multi_both.groupby(['B', 'inner']).mean() - - expected = df_multi_both.groupby(['B', - pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) - not_expected = df_multi_both.groupby(['B', - pd.Grouper(level='inner') - ]).mean() - assert not result.index.equals(not_expected.index) - - # Group MultiIndex by two keys (2) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_multi_both.groupby(['inner', 'B']).mean() - - expected = df_multi_both.groupby([pd.Grouper(key='inner'), - 
'B']).mean() - assert_frame_equal(result, expected) - not_expected = df_multi_both.groupby([pd.Grouper(level='inner'), - 'B']).mean() - assert not result.index.equals(not_expected.index) - - # Group single Index by two keys (1) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_single_both.groupby(['B', 'inner']).mean() - - expected = df_single_both.groupby(['B', - pd.Grouper(key='inner')]).mean() - assert_frame_equal(result, expected) - not_expected = df_single_both.groupby(['B', - pd.Grouper(level='inner') - ]).mean() - assert not result.index.equals(not_expected.index) - - # Group single Index by two keys (2) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_single_both.groupby(['inner', 'B']).mean() - - expected = df_single_both.groupby([pd.Grouper(key='inner'), - 'B']).mean() - assert_frame_equal(result, expected) - not_expected = df_single_both.groupby([pd.Grouper(level='inner'), - 'B']).mean() - assert not result.index.equals(not_expected.index) - def test_grouper_getting_correct_binner(self): # GH 10063 diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py new file mode 100644 index 0000000000000..4f886dd5cbefa --- /dev/null +++ b/pandas/tests/groupby/test_index_as_string.py @@ -0,0 +1,142 @@ +import pytest +import pandas as pd +import numpy as np + +from pandas.util.testing import assert_frame_equal, assert_series_equal +import pandas.util.testing as tm + + +def build_df_multi(): + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), + ('b', 1), ('b', 2), ('b', 3)]) + idx.names = ['outer', 'inner'] + df_multi = pd.DataFrame({"A": np.arange(6), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + return df_multi + + +def build_df_single(): + df_single = build_df_multi().reset_index('outer') + return df_single + + +def build_test_series(): + series_multi = build_df_multi().set_index('B', append=True)['A'] + 
return series_multi + + +class TestGroupByIndexAsString(object): + + @pytest.mark.parametrize('frame', [build_df_multi(), build_df_single()]) + def test_grouper_index_level_as_string(self, frame): + # Column and Index + result = frame.groupby(['B', 'inner']).mean() + expected = frame.groupby(['B', pd.Grouper(level='inner')]).mean() + assert_frame_equal(result, expected) + + # Index and Column + result = frame.groupby(['inner', 'B']).mean() + expected = frame.groupby([pd.Grouper(level='inner'), 'B']).mean() + assert_frame_equal(result, expected) + + # Single element list of Index + result = frame.groupby(['inner']).mean() + expected = frame.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + # Index name + result = frame.groupby('inner').mean() + expected = frame.groupby(pd.Grouper(level='inner')).mean() + assert_frame_equal(result, expected) + + @pytest.mark.parametrize('levels', [ + 'inner', 'outer', 'B', + ['inner'], ['outer'], ['B'], + ['inner', 'outer'], ['inner', 'outer', 'B'] + ]) + def test_grouper_index_level_as_string_series(self, levels): + s = build_test_series() + + # Compute expected result + if isinstance(levels, list): + groupers = [pd.Grouper(level=lv) for lv in levels] + else: + groupers = pd.Grouper(level=levels) + + expected = s.groupby(groupers).mean() + + # Compute and check result + result = s.groupby(levels).mean() + assert_series_equal(result, expected) + + @pytest.mark.parametrize('frame', [build_df_multi(), build_df_single()]) + def test_grouper_column_index_level_precedence(self, frame): + # GH 5677, when a string passed as the `by` parameter + # matches a column and an index level the column takes + # precedence + + # Add 'inner' column to frame + # (frame already has an 'inner' index) + frame['inner'] = [1, 1, 1, 1, 1, 1] + + # Group by single key + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame.groupby('inner').mean() + + with tm.assert_produces_warning(False): 
+ expected = frame.groupby(pd.Grouper(key='inner')).mean() + + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(False): + not_expected = frame.groupby(pd.Grouper(level='inner')).mean() + + assert not result.index.equals(not_expected.index) + + # Group by single key list + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame.groupby(['inner']).mean() + + with tm.assert_produces_warning(False): + expected = frame.groupby([pd.Grouper(key='inner')]).mean() + + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(False): + not_expected = frame.groupby(pd.Grouper(level='inner')).mean() + + assert not result.index.equals(not_expected.index) + + # Group by two keys ('B', 'inner') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame.groupby(['B', 'inner']).mean() + + with tm.assert_produces_warning(False): + expected = frame.groupby(['B', + pd.Grouper(key='inner')]).mean() + + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(False): + not_expected = frame.groupby(['B', + pd.Grouper(level='inner') + ]).mean() + + assert not result.index.equals(not_expected.index) + + # Group MultiIndex by two keys ('inner', 'B') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame.groupby(['inner', 'B']).mean() + + with tm.assert_produces_warning(False): + expected = frame.groupby([pd.Grouper(key='inner'), + 'B']).mean() + + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(False): + not_expected = frame.groupby([pd.Grouper(level='inner'), + 'B']).mean() + assert not result.index.equals(not_expected.index) From 571a46213ed11a185c976da96a2684469de93166 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Tue, 10 Oct 2017 16:34:48 -0400 Subject: [PATCH 2/5] Fix for GH17383 --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 9518f17e5f4f1..54ced824b7353 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2704,7 +2704,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # a passed-in Grouper, directly convert if isinstance(key, Grouper): - binner, grouper, obj = key._get_grouper(obj) + binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, [], obj else: From f9ae19a91c23cc158790cc752dc7eed4991d20b8 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Tue, 10 Oct 2017 16:56:23 -0400 Subject: [PATCH 3/5] Added whatsnew entry --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index f04410ef63531..5727888044229 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -974,6 +974,7 @@ Groupby/Resample/Rolling - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) - Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`) +- Bug in ``DataFrame.groupby`` where a spurious warning is raised when a ``Grouper`` object is used to override an ambiguous column name (:issue:`17383`) - Bug in ``TimeGrouper`` differs when passes as a list and as a scalar (:issue:`17530`) Sparse From edfbc3f0275fbfc96d71ebd515e9f68ed66ab9af Mon Sep 17 00:00:00 2001 From: "Jon M.
Mease" Date: Tue, 10 Oct 2017 17:07:51 -0400 Subject: [PATCH 4/5] Missed comment update during refactor --- pandas/tests/groupby/test_index_as_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 4f886dd5cbefa..86deb4eaf6757 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -126,7 +126,7 @@ def test_grouper_column_index_level_precedence(self, frame): assert not result.index.equals(not_expected.index) - # Group MultiIndex by two keys ('inner', 'B') + # Group by two keys ('inner', 'B') with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = frame.groupby(['inner', 'B']).mean() From 39451075320b18d7d06ff1da3b687cfc57d59dde Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Fri, 13 Oct 2017 10:04:05 -0400 Subject: [PATCH 5/5] Parameterize and fixturize tests --- pandas/tests/groupby/test_index_as_string.py | 242 +++++++++---------- 1 file changed, 108 insertions(+), 134 deletions(-) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 86deb4eaf6757..3b6e15036cfe2 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -6,137 +6,111 @@ import pandas.util.testing as tm -def build_df_multi(): - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - return df_multi - - -def build_df_single(): - df_single = build_df_multi().reset_index('outer') - return df_single - - -def build_test_series(): - series_multi = build_df_multi().set_index('B', append=True)['A'] - return series_multi - - -class TestGroupByIndexAsString(object): - - @pytest.mark.parametrize('frame', 
[build_df_multi(), build_df_single()]) - def test_grouper_index_level_as_string(self, frame): - # Column and Index - result = frame.groupby(['B', 'inner']).mean() - expected = frame.groupby(['B', pd.Grouper(level='inner')]).mean() - assert_frame_equal(result, expected) - - # Index and Column - result = frame.groupby(['inner', 'B']).mean() - expected = frame.groupby([pd.Grouper(level='inner'), 'B']).mean() - assert_frame_equal(result, expected) - - # Single element list of Index - result = frame.groupby(['inner']).mean() - expected = frame.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - # Index name - result = frame.groupby('inner').mean() - expected = frame.groupby(pd.Grouper(level='inner')).mean() - assert_frame_equal(result, expected) - - @pytest.mark.parametrize('levels', [ - 'inner', 'outer', 'B', - ['inner'], ['outer'], ['B'], - ['inner', 'outer'], ['inner', 'outer', 'B'] - ]) - def test_grouper_index_level_as_string_series(self, levels): - s = build_test_series() - - # Compute expected result - if isinstance(levels, list): - groupers = [pd.Grouper(level=lv) for lv in levels] - else: - groupers = pd.Grouper(level=levels) - - expected = s.groupby(groupers).mean() - - # Compute and check result - result = s.groupby(levels).mean() - assert_series_equal(result, expected) - - @pytest.mark.parametrize('frame', [build_df_multi(), build_df_single()]) - def test_grouper_column_index_level_precedence(self, frame): - # GH 5677, when a string passed as the `by` parameter - # matches a column and an index level the column takes - # precedence - - # Add 'inner' column to frame - # (frame already has an 'inner' index) - frame['inner'] = [1, 1, 1, 1, 1, 1] - - # Group by single key - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = frame.groupby('inner').mean() - - with tm.assert_produces_warning(False): - expected = frame.groupby(pd.Grouper(key='inner')).mean() - - assert_frame_equal(result, expected) - - 
with tm.assert_produces_warning(False): - not_expected = frame.groupby(pd.Grouper(level='inner')).mean() - - assert not result.index.equals(not_expected.index) - - # Group by single key list - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = frame.groupby(['inner']).mean() - - with tm.assert_produces_warning(False): - expected = frame.groupby([pd.Grouper(key='inner')]).mean() - - assert_frame_equal(result, expected) - - with tm.assert_produces_warning(False): - not_expected = frame.groupby(pd.Grouper(level='inner')).mean() - - assert not result.index.equals(not_expected.index) - - # Group by two keys ('B', 'inner') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = frame.groupby(['B', 'inner']).mean() - - with tm.assert_produces_warning(False): - expected = frame.groupby(['B', - pd.Grouper(key='inner')]).mean() - - assert_frame_equal(result, expected) - - with tm.assert_produces_warning(False): - not_expected = frame.groupby(['B', - pd.Grouper(level='inner') - ]).mean() - - assert not result.index.equals(not_expected.index) - - # Group by two keys ('inner', 'B') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = frame.groupby(['inner', 'B']).mean() - - with tm.assert_produces_warning(False): - expected = frame.groupby([pd.Grouper(key='inner'), - 'B']).mean() - - assert_frame_equal(result, expected) - - with tm.assert_produces_warning(False): - not_expected = frame.groupby([pd.Grouper(level='inner'), - 'B']).mean() - assert not result.index.equals(not_expected.index) +@pytest.fixture(params=[['inner'], ['inner', 'outer']]) +def frame(request): + levels = request.param + df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'], + 'inner': [1, 2, 3, 1, 2, 3], + 'A': np.arange(6), + 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture() +def series(): + df = pd.DataFrame({'outer': ['a', 'a', 
'a', 'b', 'b', 'b'], + 'inner': [1, 2, 3, 1, 2, 3], + 'A': np.arange(6), + 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) + s = df.set_index(['outer', 'inner', 'B'])['A'] + + return s + + +@pytest.mark.parametrize('key_strs,groupers', [ + ('inner', # Index name + pd.Grouper(level='inner') + ), + (['inner'], # List of index name + [pd.Grouper(level='inner')] + ), + (['B', 'inner'], # Column and index + ['B', pd.Grouper(level='inner')] + ), + (['inner', 'B'], # Index and column + [pd.Grouper(level='inner'), 'B'])]) +def test_grouper_index_level_as_string(frame, key_strs, groupers): + result = frame.groupby(key_strs).mean() + expected = frame.groupby(groupers).mean() + assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('levels', [ + 'inner', 'outer', 'B', + ['inner'], ['outer'], ['B'], + ['inner', 'outer'], ['outer', 'inner'], + ['inner', 'outer', 'B'], ['B', 'outer', 'inner'] +]) +def test_grouper_index_level_as_string_series(series, levels): + + # Compute expected result + if isinstance(levels, list): + groupers = [pd.Grouper(level=lv) for lv in levels] + else: + groupers = pd.Grouper(level=levels) + + expected = series.groupby(groupers).mean() + + # Compute and check result + result = series.groupby(levels).mean() + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('key_strs,key_groupers,level_groupers', [ + ('inner', # Index name + pd.Grouper(key='inner'), + pd.Grouper(level='inner'), + ), + (['inner'], # List of index name + [pd.Grouper(key='inner')], + [pd.Grouper(level='inner')] + ), + (['B', 'inner'], # Column and index + ['B', pd.Grouper(key='inner')], + ['B', pd.Grouper(level='inner')] + ), + (['inner', 'B'], # Index and column + [pd.Grouper(key='inner'), 'B'], + [pd.Grouper(level='inner'), 'B'])]) +def test_grouper_column_index_level_precedence(frame, + key_strs, + key_groupers, + level_groupers): + + # GH 5677, when a string passed as the `by` parameter + # matches a column and an index level the column takes + # 
precedence and a FutureWarning is raised + + # Add 'inner' column to frame + # (frame already has an 'inner' index) + frame['inner'] = [1, 1, 1, 1, 1, 1] + + # Performing a groupby with strings should produce a warning + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame.groupby(key_strs).mean() + + # Grouping with key Grouper should produce the same result and no warning + with tm.assert_produces_warning(False): + expected = frame.groupby(key_groupers).mean() + + assert_frame_equal(result, expected) + + # Grouping with level Grouper should produce a different result but + # still no warning + with tm.assert_produces_warning(False): + not_expected = frame.groupby(level_groupers).mean() + + assert not result.index.equals(not_expected.index)