-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: GH17464 MultiIndex now raises an error when levels aren't unique, tests changed #17971
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 41 commits
a9a5177
91c8388
392ce8a
3c0812e
0983453
b39421f
5771426
48f9429
9ffc8ad
e70a2be
c28d3df
03f2da8
1d45ab6
015af48
9dc7eb5
9aa2bcd
e52460e
48d509d
d75e1de
0943d19
6f2efc6
840fe56
137bc16
0129ee7
8b400dc
0a8e9f2
cc7ebc7
b02114f
b56eca0
9f179e6
2af9aba
3e56aba
85d6379
49b731d
ec4f971
2684855
c36c236
2b3f4d4
386daaf
c169645
073e629
869157d
44e4552
297216b
fead79f
703ff1e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -177,8 +177,8 @@ def _verify_integrity(self, labels=None, levels=None): | |
Raises | ||
------ | ||
ValueError | ||
* if length of levels and labels don't match or any label would | ||
exceed level bounds | ||
If length of levels and labels don't match, if any label would | ||
exceed level bounds, or there are any duplicate levels. | ||
""" | ||
# NOTE: Currently does not check, among other things, that cached | ||
# nlevels matches nor that sortorder matches actually sortorder. | ||
|
@@ -198,6 +198,10 @@ def _verify_integrity(self, labels=None, levels=None): | |
" level (%d). NOTE: this index is in an" | ||
" inconsistent state" % (i, label.max(), | ||
len(level))) | ||
if not level.is_unique: | ||
raise ValueError("Level values must be unique: {0}" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you use keyword arguments rather than positional ones in the format string here? |
||
" on level {1}".format( | ||
[value for value in level], i)) | ||
|
||
@property | ||
def levels(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,10 +52,10 @@ def test_frame_describe_multikey(self): | |
desc_groups = [] | ||
for col in self.tsframe: | ||
group = grouped[col].describe() | ||
group_col = pd.MultiIndex([[col] * len(group.columns), | ||
group.columns], | ||
[[0] * len(group.columns), | ||
range(len(group.columns))]) | ||
# GH 17464 - Remove duplicate MultiIndex levels | ||
group_col = pd.MultiIndex( | ||
levels=[[col], group.columns], | ||
labels=[[0] * len(group.columns), range(len(group.columns))]) | ||
group = pd.DataFrame(group.values, | ||
columns=group_col, | ||
index=group.index) | ||
|
@@ -67,8 +67,10 @@ def test_frame_describe_multikey(self): | |
'C': 1, 'D': 1}, axis=1) | ||
result = groupedT.describe() | ||
expected = self.tsframe.describe().T | ||
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], | ||
[range(4), range(len(expected.index))]) | ||
# GH 17464 - Remove duplicate MultiIndex levels | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You don't need this comment here. |
||
expected.index = pd.MultiIndex( | ||
levels=[[0, 1], expected.index], | ||
labels=[[0, 0, 1, 1], range(len(expected.index))]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_frame_describe_tupleindex(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -385,6 +385,83 @@ def test_attr_wrapper(self): | |
# make sure raises error | ||
pytest.raises(AttributeError, getattr, grouped, 'foo') | ||
|
||
def test_series_describe_multikey(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These tests were moved; revert this file. |
||
ts = tm.makeTimeSeries() | ||
grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) | ||
result = grouped.describe() | ||
assert_series_equal(result['mean'], grouped.mean(), check_names=False) | ||
assert_series_equal(result['std'], grouped.std(), check_names=False) | ||
assert_series_equal(result['min'], grouped.min(), check_names=False) | ||
|
||
def test_series_describe_single(self): | ||
ts = tm.makeTimeSeries() | ||
grouped = ts.groupby(lambda x: x.month) | ||
result = grouped.apply(lambda x: x.describe()) | ||
expected = grouped.describe().stack() | ||
assert_series_equal(result, expected) | ||
|
||
def test_series_index_name(self): | ||
grouped = self.df.loc[:, ['C']].groupby(self.df['A']) | ||
result = grouped.agg(lambda x: x.mean()) | ||
assert result.index.name == 'A' | ||
|
||
def test_frame_describe_multikey(self): | ||
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) | ||
result = grouped.describe() | ||
desc_groups = [] | ||
for col in self.tsframe: | ||
group = grouped[col].describe() | ||
# GH 17464 - Remove duplicate MultiIndex levels | ||
group_col = pd.MultiIndex( | ||
levels=[[col], group.columns], | ||
labels=[[0] * len(group.columns), range(len(group.columns))]) | ||
group = pd.DataFrame(group.values, | ||
columns=group_col, | ||
index=group.index) | ||
desc_groups.append(group) | ||
expected = pd.concat(desc_groups, axis=1) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
groupedT = self.tsframe.groupby({'A': 0, 'B': 0, | ||
'C': 1, 'D': 1}, axis=1) | ||
result = groupedT.describe() | ||
expected = self.tsframe.describe().T | ||
# GH 17464 - Remove duplicate MultiIndex levels | ||
expected.index = pd.MultiIndex( | ||
levels=[[0, 1], expected.index], | ||
labels=[[0, 0, 1, 1], range(len(expected.index))]) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_frame_describe_tupleindex(self): | ||
|
||
# GH 14848 - regression from 0.19.0 to 0.19.1 | ||
df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, | ||
'y': [10, 20, 30, 40, 50] * 3, | ||
'z': [100, 200, 300, 400, 500] * 3}) | ||
df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 | ||
df2 = df1.rename(columns={'k': 'key'}) | ||
pytest.raises(ValueError, lambda: df1.groupby('k').describe()) | ||
pytest.raises(ValueError, lambda: df2.groupby('key').describe()) | ||
|
||
def test_frame_describe_unstacked_format(self): | ||
# GH 4792 | ||
prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, | ||
pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, | ||
pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} | ||
volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, | ||
pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, | ||
pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} | ||
df = pd.DataFrame({'PRICE': prices, | ||
'VOLUME': volumes}) | ||
result = df.groupby('PRICE').VOLUME.describe() | ||
data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), | ||
df[df.PRICE == 25499].VOLUME.describe().values.tolist()] | ||
expected = pd.DataFrame(data, | ||
index=pd.Index([24990, 25499], name='PRICE'), | ||
columns=['count', 'mean', 'std', 'min', | ||
'25%', '50%', '75%', 'max']) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_frame_groupby(self): | ||
grouped = self.tsframe.groupby(lambda x: x.weekday()) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1618,7 +1618,9 @@ def test_is_(self): | |
# shouldn't change | ||
assert mi2.is_(mi) | ||
mi4 = mi3.view() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add an explicit test for the issue, i.e. one that checks that you are raising (use the example from the original issue)? |
||
mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True) | ||
|
||
# GH 17464 - Remove duplicate MultiIndex levels | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You don't need these comments here; rather, they go on a new test (see above). |
||
mi4.set_levels([lrange(10), lrange(10)], inplace=True) | ||
assert not mi4.is_(mi3) | ||
mi5 = mi.view() | ||
mi5.set_levels(mi5.levels, inplace=True) | ||
|
@@ -2450,13 +2452,11 @@ def test_isna_behavior(self): | |
pd.isna(self.index) | ||
|
||
def test_level_setting_resets_attributes(self): | ||
ind = MultiIndex.from_arrays([ | ||
ind = pd.MultiIndex.from_arrays([ | ||
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] | ||
]) | ||
assert ind.is_monotonic | ||
ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], | ||
inplace=True) | ||
|
||
ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) | ||
# if this fails, probably didn't reset the cache correctly. | ||
assert not ind.is_monotonic | ||
|
||
|
@@ -3083,3 +3083,15 @@ def test_million_record_attribute_error(self): | |
with tm.assert_raises_regex(AttributeError, | ||
"'Series' object has no attribute 'foo'"): | ||
df['a'].foo() | ||
|
||
def test_duplicate_multiindex_labels(self): | ||
# GH 17464 | ||
# Make sure that a MultiIndex with duplicate levels throws a ValueError | ||
with pytest.raises(ValueError): | ||
ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) | ||
# And that using set_levels with duplicate levels fails | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Blank line needed here. |
||
ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], | ||
[1, 2, 1, 2, 3]]) | ||
with pytest.raises(ValueError): | ||
ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], | ||
inplace=True) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like there is some duplication here.