Skip to content

Refactor index-as-string groupby tests and fix spurious warning (Bug 17383) #17843

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,7 @@ Groupby/Resample/Rolling
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)
- Bug in ``DataFrame.groupby`` where spurious warning is raised when ``Grouper`` object is used to override ambiguous column name (:issue:`17383`)
- Bug in ``TimeGrouper`` differs when passes as a list and as a scalar (:issue:`17530`)

Sparse
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2704,7 +2704,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,

# a passed-in Grouper, directly convert
if isinstance(key, Grouper):
binner, grouper, obj = key._get_grouper(obj)
binner, grouper, obj = key._get_grouper(obj, validate=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note that I had to add this flag to 'fix' this warning issue elsewhere, I don't really like it, but would require more refactoring to make this cleaner.

if key.key is None:
return grouper, [], obj
else:
Expand Down
152 changes: 0 additions & 152 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,158 +253,6 @@ def test_grouper_column_and_index(self):
expected = df_single.reset_index().groupby(['inner', 'B']).mean()
assert_frame_equal(result, expected)

def test_grouper_index_level_as_string(self):
# GH 5677, allow strings passed as the `by` parameter to reference
# columns or index levels

idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one']},
index=idx)

df_single = df_multi.reset_index('outer')

# Column and Index on MultiIndex
result = df_multi.groupby(['B', 'inner']).mean()
expected = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

# Index and Column on MultiIndex
result = df_multi.groupby(['inner', 'B']).mean()
expected = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
assert_frame_equal(result, expected)

# Column and Index on single Index
result = df_single.groupby(['B', 'inner']).mean()
expected = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
assert_frame_equal(result, expected)

# Index and Column on single Index
result = df_single.groupby(['inner', 'B']).mean()
expected = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
assert_frame_equal(result, expected)

# Single element list of Index on MultiIndex
result = df_multi.groupby(['inner']).mean()
expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Single element list of Index on single Index
result = df_single.groupby(['inner']).mean()
expected = df_single.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Index on MultiIndex
result = df_multi.groupby('inner').mean()
expected = df_multi.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

# Index on single Index
result = df_single.groupby('inner').mean()
expected = df_single.groupby(pd.Grouper(level='inner')).mean()
assert_frame_equal(result, expected)

def test_grouper_column_index_level_precedence(self):
# GH 5677, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence

idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
('b', 1), ('b', 2), ('b', 3)])
idx.names = ['outer', 'inner']
df_multi_both = pd.DataFrame({"A": np.arange(6),
'B': ['one', 'one', 'two',
'two', 'one', 'one'],
'inner': [1, 1, 1, 1, 1, 1]},
index=idx)

df_single_both = df_multi_both.reset_index('outer')

# Group MultiIndex by single key
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby('inner').mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pd.Grouper object in this expected expression shouldn't have been wrapped in a list. If it had not been, the spurious warning would have been raised in this test. This is corrected in the new test below.

assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by single key
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby('inner').mean()

expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by single key list
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['inner']).mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by single key list
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['inner']).mean()

expected = df_single_both.groupby([pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(pd.Grouper(level='inner')).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by two keys (1)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['B', 'inner']).mean()

expected = df_multi_both.groupby(['B',
pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby(['B',
pd.Grouper(level='inner')
]).mean()
assert not result.index.equals(not_expected.index)

# Group MultiIndex by two keys (2)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_multi_both.groupby(['inner', 'B']).mean()

expected = df_multi_both.groupby([pd.Grouper(key='inner'),
'B']).mean()
assert_frame_equal(result, expected)
not_expected = df_multi_both.groupby([pd.Grouper(level='inner'),
'B']).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by two keys (1)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['B', 'inner']).mean()

expected = df_single_both.groupby(['B',
pd.Grouper(key='inner')]).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby(['B',
pd.Grouper(level='inner')
]).mean()
assert not result.index.equals(not_expected.index)

# Group single Index by two keys (2)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = df_single_both.groupby(['inner', 'B']).mean()

expected = df_single_both.groupby([pd.Grouper(key='inner'),
'B']).mean()
assert_frame_equal(result, expected)
not_expected = df_single_both.groupby([pd.Grouper(level='inner'),
'B']).mean()
assert not result.index.equals(not_expected.index)

def test_grouper_getting_correct_binner(self):

# GH 10063
Expand Down
116 changes: 116 additions & 0 deletions pandas/tests/groupby/test_index_as_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import pytest
import pandas as pd
import numpy as np

from pandas.util.testing import assert_frame_equal, assert_series_equal
import pandas.util.testing as tm


@pytest.fixture(params=[['inner'], ['inner', 'outer']])
def frame(request):
levels = request.param
df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
'inner': [1, 2, 3, 1, 2, 3],
'A': np.arange(6),
'B': ['one', 'one', 'two', 'two', 'one', 'one']})
if levels:
df = df.set_index(levels)

return df


@pytest.fixture()
def series():
df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'],
'inner': [1, 2, 3, 1, 2, 3],
'A': np.arange(6),
'B': ['one', 'one', 'two', 'two', 'one', 'one']})
s = df.set_index(['outer', 'inner', 'B'])['A']

return s


@pytest.mark.parametrize('key_strs,groupers', [
('inner', # Index name
pd.Grouper(level='inner')
),
(['inner'], # List of index name
[pd.Grouper(level='inner')]
),
(['B', 'inner'], # Column and index
['B', pd.Grouper(level='inner')]
),
(['inner', 'B'], # Index and column
[pd.Grouper(level='inner'), 'B'])])
def test_grouper_index_level_as_string(frame, key_strs, groupers):
result = frame.groupby(key_strs).mean()
expected = frame.groupby(groupers).mean()
assert_frame_equal(result, expected)


@pytest.mark.parametrize('levels', [
'inner', 'outer', 'B',
['inner'], ['outer'], ['B'],
['inner', 'outer'], ['outer', 'inner'],
['inner', 'outer', 'B'], ['B', 'outer', 'inner']
])
def test_grouper_index_level_as_string_series(series, levels):

# Compute expected result
if isinstance(levels, list):
groupers = [pd.Grouper(level=lv) for lv in levels]
else:
groupers = pd.Grouper(level=levels)

expected = series.groupby(groupers).mean()

# Compute and check result
result = series.groupby(levels).mean()
assert_series_equal(result, expected)


@pytest.mark.parametrize('key_strs,key_groupers,level_groupers', [
('inner', # Index name
pd.Grouper(key='inner'),
pd.Grouper(level='inner'),
),
(['inner'], # List of index name
[pd.Grouper(key='inner')],
[pd.Grouper(level='inner')]
),
(['B', 'inner'], # Column and index
['B', pd.Grouper(key='inner')],
['B', pd.Grouper(level='inner')]
),
(['inner', 'B'], # Index and column
[pd.Grouper(key='inner'), 'B'],
[pd.Grouper(level='inner'), 'B'])])
def test_grouper_column_index_level_precedence(frame,
key_strs,
key_groupers,
level_groupers):

# GH 5677, when a string passed as the `by` parameter
# matches a column and an index level the column takes
# precedence and a FutureWarning is raised

# Add 'inner' column to frame
# (frame already has an 'inner' index)
frame['inner'] = [1, 1, 1, 1, 1, 1]

# Performing a groupby with strings should produce warning
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = frame.groupby(key_strs).mean()

# Grouping with key Grouper should produce the same result and no warning
with tm.assert_produces_warning(False):
expected = frame.groupby(key_groupers).mean()

assert_frame_equal(result, expected)

# Grouping with level Grouper should produce a difference result but
# still no warning
with tm.assert_produces_warning(False):
not_expected = frame.groupby(level_groupers).mean()

assert not result.index.equals(not_expected.index)