Skip to content

BUG: MultiIndex not dropping nan level and invalid code value #26408

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 1, 2019
Merged
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,7 @@ MultiIndex
^^^^^^^^^^

- Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`)
-
-
- Bug in :class:`Multindex` construction from levels and codes that would incorrectly allows code values < -1 or NaN levels (:issue:`19387`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May need to move this under the "API breaking changes" section, as previously we did allow codes < 0?.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah sure

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the whatsnew entry to an API breaking change


I/O
^^^
Expand Down
30 changes: 24 additions & 6 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,18 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,

if verify_integrity:
result._verify_integrity()

if _set_identity:
result._reset_identity()

return result

def _validate_codes(cls, level, code):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this accept self, and add doc-string

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed and doc-string added

null_mask = isna(level)
if np.any(null_mask):
code = np.where(null_mask[code], -1, code)
return code

def _verify_integrity(self, codes=None, levels=None):
"""

Expand Down Expand Up @@ -277,16 +285,26 @@ def _verify_integrity(self, codes=None, levels=None):
raise ValueError("Unequal code lengths: %s" %
([len(code_) for code_ in codes]))
if len(level_codes) and level_codes.max() >= len(level):
raise ValueError("On level %d, code max (%d) >= length of"
" level (%d). NOTE: this index is in an"
" inconsistent state" % (i, level_codes.max(),
len(level)))
raise ValueError("On level {level}, code max ({max_code})"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can do like this

msg = "On level...........".format(....)
raise ValueError(msg)

(and if needed make the message nicely formatted over multiple lines, with parens e.g.

msg = ("........"
             ".......".format)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

formatting updated

" >= length of level ({level_len}). "
"NOTE: this index is in an inconsistent"
" state".format(
level=i, max_code=level_codes.max(),
level_len=len(level)))
if len(level_codes) and level_codes.min() < -1:
raise ValueError("On level {level}, code value ({code})"
" < -1".format(
level=i, code=level_codes.min()))
if not level.is_unique:
raise ValueError("Level values must be unique: {values} on "
"level {level}".format(
values=[value for value in level],
level=i))

codes = [self._validate_codes(level, code)
for level, code in zip(levels, codes)]
self._set_codes(codes, validate=False)

@classmethod
def from_arrays(cls, arrays, sortorder=None, names=None):
"""
Expand Down Expand Up @@ -675,7 +693,6 @@ def labels(self):

def _set_codes(self, codes, level=None, copy=False, validate=True,
verify_integrity=False):

if validate and level is None and len(codes) != self.nlevels:
raise ValueError("Length of codes must match number of levels")
if validate and level is not None and len(codes) != len(level):
Expand All @@ -696,8 +713,9 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,

if verify_integrity:
self._verify_integrity(codes=new_codes)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you change _verfiy_integrity to return codes, this then becomes

if verify_integrity:
      new_codes = self._verify_integrity(codes=new_codes)
self._codes = new_codes

else:
self._codes = new_codes

self._codes = new_codes
self._tuples = None
self._reset_cache()

Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/indexes/multi/test_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def test_constructor_mismatched_codes_levels(idx):
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\."
" NOTE: this index is in an inconsistent state")
label_error = r"Unequal code lengths: \[4, 2\]"
code_value_error = (r"On level 0, code value \(-2\) < -1")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need the outer parentheses here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed


# important to check that it's looking at the right thing.
with pytest.raises(ValueError, match=length_error):
Expand All @@ -83,6 +84,37 @@ def test_constructor_mismatched_codes_levels(idx):
with pytest.raises(ValueError, match=label_error):
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])

# code value smaller than -1
with pytest.raises(ValueError, match=code_value_error):
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]])


def test_na_levels():
# GH26408
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
codes=[[0, -1, 1, 2, 3, 4]])
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
codes=[[-1, -1, -1, -1, 3, 4]])
tm.assert_index_equal(result, expected)

result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[0, -1, 1, 2, 3, 4]])
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[-1, -1, 1, -1, 3, -1]])
tm.assert_index_equal(result, expected)

# verify set_levels and set_codes
result = MultiIndex(
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels(
[[np.nan, 's', pd.NaT, 128, None]])
tm.assert_index_equal(result, expected)

result = MultiIndex(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you replace the actual OP tests as well (e.g. the .dropna()); I don't think they are very different that what you have, but they also attempt to .dropna()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropna tests added, and green now.

levels=[[np.nan, 's', pd.NaT, 128, None]],
codes=[[1, 2, 2, 2, 2, 2]]).set_codes(
[[0, -1, 1, 2, 3, 4]])
tm.assert_index_equal(result, expected)


def test_labels_deprecated(idx):
# GH23752
Expand Down