-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: MultiIndex not dropping nan level and invalid code value #26408
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
e0d3bb4
af2ad91
248177b
b6d29bd
041b4cd
933615b
d4227c6
b66d3a4
81ebfda
7dfd698
ec265fa
af35fa3
6ec67b3
28c953f
055999d
d9db853
a141e40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -243,10 +243,18 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, | |
|
||
if verify_integrity: | ||
result._verify_integrity() | ||
|
||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if _set_identity: | ||
result._reset_identity() | ||
|
||
return result | ||
|
||
def _validate_codes(cls, level, code): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you make this accept self, and add doc-string There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. changed and doc-string added |
||
null_mask = isna(level) | ||
if np.any(null_mask): | ||
code = np.where(null_mask[code], -1, code) | ||
return code | ||
|
||
def _verify_integrity(self, codes=None, levels=None): | ||
""" | ||
|
||
|
@@ -277,16 +285,26 @@ def _verify_integrity(self, codes=None, levels=None): | |
raise ValueError("Unequal code lengths: %s" % | ||
([len(code_) for code_ in codes])) | ||
if len(level_codes) and level_codes.max() >= len(level): | ||
raise ValueError("On level %d, code max (%d) >= length of" | ||
" level (%d). NOTE: this index is in an" | ||
" inconsistent state" % (i, level_codes.max(), | ||
len(level))) | ||
raise ValueError("On level {level}, code max ({max_code})" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can do like this
(and if needed make the message nicely formatted over multiple lines, with parens e.g.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. formatting updated |
||
" >= length of level ({level_len}). " | ||
"NOTE: this index is in an inconsistent" | ||
" state".format( | ||
level=i, max_code=level_codes.max(), | ||
level_len=len(level))) | ||
if len(level_codes) and level_codes.min() < -1: | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
raise ValueError("On level {level}, code value ({code})" | ||
" < -1".format( | ||
level=i, code=level_codes.min())) | ||
if not level.is_unique: | ||
raise ValueError("Level values must be unique: {values} on " | ||
"level {level}".format( | ||
values=[value for value in level], | ||
level=i)) | ||
|
||
codes = [self._validate_codes(level, code) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for level, code in zip(levels, codes)] | ||
self._set_codes(codes, validate=False) | ||
|
||
@classmethod | ||
def from_arrays(cls, arrays, sortorder=None, names=None): | ||
""" | ||
|
@@ -675,7 +693,6 @@ def labels(self): | |
|
||
def _set_codes(self, codes, level=None, copy=False, validate=True, | ||
verify_integrity=False): | ||
|
||
if validate and level is None and len(codes) != self.nlevels: | ||
raise ValueError("Length of codes must match number of levels") | ||
if validate and level is not None and len(codes) != len(level): | ||
|
@@ -696,8 +713,9 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, | |
|
||
if verify_integrity: | ||
self._verify_integrity(codes=new_codes) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you change _verfiy_integrity to return codes, this then becomes
|
||
else: | ||
self._codes = new_codes | ||
|
||
self._codes = new_codes | ||
self._tuples = None | ||
self._reset_cache() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,6 +67,7 @@ def test_constructor_mismatched_codes_levels(idx): | |
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." | ||
" NOTE: this index is in an inconsistent state") | ||
label_error = r"Unequal code lengths: \[4, 2\]" | ||
code_value_error = (r"On level 0, code value \(-2\) < -1") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need the outer parentheses here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed |
||
|
||
# important to check that it's looking at the right thing. | ||
with pytest.raises(ValueError, match=length_error): | ||
|
@@ -83,6 +84,37 @@ def test_constructor_mismatched_codes_levels(idx): | |
with pytest.raises(ValueError, match=label_error): | ||
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) | ||
|
||
# code value smaller than -1 | ||
with pytest.raises(ValueError, match=code_value_error): | ||
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) | ||
|
||
|
||
def test_na_levels(): | ||
# GH26408 | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], | ||
codes=[[-1, -1, -1, -1, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[0, -1, 1, 2, 3, 4]]) | ||
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[-1, -1, 1, -1, 3, -1]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
# verify set_levels and set_codes | ||
result = MultiIndex( | ||
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( | ||
[[np.nan, 's', pd.NaT, 128, None]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
result = MultiIndex( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you replace the actual OP tests as well (e.g. the .dropna()); I don't think they are very different that what you have, but they also attempt to .dropna() There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dropna tests added, and green now. |
||
levels=[[np.nan, 's', pd.NaT, 128, None]], | ||
codes=[[1, 2, 2, 2, 2, 2]]).set_codes( | ||
[[0, -1, 1, 2, 3, 4]]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_labels_deprecated(idx): | ||
# GH23752 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
May need to move this under the "API breaking changes" section, as previously we did allow codes < 0?.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah sure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved the whatsnew entry to an API breaking change