Skip to content

Commit 73d8f96

Browse files
jiangyue12392 and jreback
authored and committed
BUG: MultiIndex not dropping nan level and invalid code value (#26408)
1 parent 2c6d005 commit 73d8f96

File tree

4 files changed

+143
-12
lines changed

4 files changed

+143
-12
lines changed

doc/source/whatsnew/v0.25.0.rst

+36-1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
119119
120120
df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
121121
122+
123+
.. _whatsnew_0250.api_breaking.multi_indexing:
124+
125+
126+
MultiIndex constructed from levels and codes
127+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
128+
129+
Constructing a :class:`MultiIndex` with NaN levels or with code values < -1 was allowed previously.
130+
Now, constructing with code values < -1 is not allowed, and codes that correspond to NaN levels
131+
are reassigned to -1. (:issue:`19387`)
132+
133+
.. ipython:: python
134+
135+
mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
136+
codes=[[0, -1, 1, 2, 3, 4]])
137+
mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]])
138+
139+
*Previous Behavior*:
140+
141+
.. code-block:: ipython
142+
143+
In [1]: mi1
144+
Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]],
145+
codes=[[0, -1, 1, 2, 3, 4]])
146+
In [2]: mi2
147+
Out[2]: MultiIndex(levels=[[1, 2]],
148+
codes=[[0, -2]])
149+
150+
*New Behavior*:
151+
152+
.. ipython:: python
153+
154+
mi1
155+
mi2
156+
157+
122158
.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:
123159

124160
GroupBy.apply on ``DataFrame`` evaluates first group only once
@@ -536,7 +572,6 @@ MultiIndex
536572

537573
- Bug in which an incorrect exception was raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`)
538574
-
539-
-
540575

541576
I/O
542577
^^^

pandas/core/indexes/multi.py

+52-10
Original file line numberDiff line numberDiff line change
@@ -243,11 +243,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
243243
result.sortorder = sortorder
244244

245245
if verify_integrity:
246-
result._verify_integrity()
246+
new_codes = result._verify_integrity()
247+
result._codes = new_codes
248+
247249
if _set_identity:
248250
result._reset_identity()
251+
249252
return result
250253

254+
def _validate_codes(self, level: list, code: list):
255+
"""
256+
Reassign code values as -1 if their corresponding levels are NaN.
257+
258+
Parameters
259+
----------
260+
code : list
261+
Code to reassign.
262+
level : list
263+
Level to check for missing values (NaN, NaT, None).
264+
265+
Returns
266+
-------
267+
code : new code where code value = -1 if it corresponds
268+
to a level with missing values (NaN, NaT, None).
269+
"""
270+
null_mask = isna(level)
271+
if np.any(null_mask):
272+
code = np.where(null_mask[code], -1, code)
273+
return code
274+
251275
def _verify_integrity(self, codes=None, levels=None):
252276
"""
253277
@@ -263,6 +287,11 @@ def _verify_integrity(self, codes=None, levels=None):
263287
ValueError
264288
If length of levels and codes don't match, if the codes for any
265289
level would exceed level bounds, or there are any duplicate levels.
290+
291+
Returns
292+
-------
293+
codes : new codes where code value = -1 if it corresponds to a
294+
NaN level.
266295
"""
267296
# NOTE: Currently does not check, among other things, that cached
268297
# nlevels matches nor that sortorder matches actually sortorder.
@@ -272,22 +301,33 @@ def _verify_integrity(self, codes=None, levels=None):
272301
if len(levels) != len(codes):
273302
raise ValueError("Length of levels and codes must match. NOTE:"
274303
" this index is in an inconsistent state.")
275-
codes_length = len(self.codes[0])
304+
codes_length = len(codes[0])
276305
for i, (level, level_codes) in enumerate(zip(levels, codes)):
277306
if len(level_codes) != codes_length:
278307
raise ValueError("Unequal code lengths: %s" %
279308
([len(code_) for code_ in codes]))
280309
if len(level_codes) and level_codes.max() >= len(level):
281-
raise ValueError("On level %d, code max (%d) >= length of"
282-
" level (%d). NOTE: this index is in an"
283-
" inconsistent state" % (i, level_codes.max(),
284-
len(level)))
310+
msg = ("On level {level}, code max ({max_code}) >= length of "
311+
"level ({level_len}). NOTE: this index is in an "
312+
"inconsistent state".format(
313+
level=i, max_code=level_codes.max(),
314+
level_len=len(level)))
315+
raise ValueError(msg)
316+
if len(level_codes) and level_codes.min() < -1:
317+
raise ValueError("On level {level}, code value ({code})"
318+
" < -1".format(
319+
level=i, code=level_codes.min()))
285320
if not level.is_unique:
286321
raise ValueError("Level values must be unique: {values} on "
287322
"level {level}".format(
288323
values=[value for value in level],
289324
level=i))
290325

326+
codes = [self._validate_codes(level, code)
327+
for level, code in zip(levels, codes)]
328+
new_codes = FrozenList(codes)
329+
return new_codes
330+
291331
@classmethod
292332
def from_arrays(cls, arrays, sortorder=None, names=None):
293333
"""
@@ -586,7 +626,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True,
586626
new_levels = FrozenList(new_levels)
587627

588628
if verify_integrity:
589-
self._verify_integrity(levels=new_levels)
629+
new_codes = self._verify_integrity(levels=new_levels)
630+
self._codes = new_codes
590631

591632
names = self.names
592633
self._levels = new_levels
@@ -676,7 +717,6 @@ def labels(self):
676717

677718
def _set_codes(self, codes, level=None, copy=False, validate=True,
678719
verify_integrity=False):
679-
680720
if validate and level is None and len(codes) != self.nlevels:
681721
raise ValueError("Length of codes must match number of levels")
682722
if validate and level is not None and len(codes) != len(level):
@@ -696,9 +736,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
696736
new_codes = FrozenList(new_codes)
697737

698738
if verify_integrity:
699-
self._verify_integrity(codes=new_codes)
739+
new_codes = self._verify_integrity(codes=new_codes)
700740

701741
self._codes = new_codes
742+
702743
self._tuples = None
703744
self._reset_cache()
704745

@@ -1763,9 +1804,10 @@ def __setstate__(self, state):
17631804

17641805
self._set_levels([Index(x) for x in levels], validate=False)
17651806
self._set_codes(codes)
1807+
new_codes = self._verify_integrity()
1808+
self._set_codes(new_codes)
17661809
self._set_names(names)
17671810
self.sortorder = sortorder
1768-
self._verify_integrity()
17691811
self._reset_identity()
17701812

17711813
def __getitem__(self, key):

pandas/tests/indexes/multi/test_constructor.py

+40-1
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ def test_constructor_mismatched_codes_levels(idx):
6363
with pytest.raises(ValueError, match=msg):
6464
MultiIndex(levels=levels, codes=codes)
6565

66-
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\."
66+
length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\."
6767
" NOTE: this index is in an inconsistent state")
6868
label_error = r"Unequal code lengths: \[4, 2\]"
69+
code_value_error = r"On level 0, code value \(-2\) < -1"
6970

7071
# important to check that it's looking at the right thing.
7172
with pytest.raises(ValueError, match=length_error):
@@ -82,6 +83,44 @@ def test_constructor_mismatched_codes_levels(idx):
8283
with pytest.raises(ValueError, match=label_error):
8384
idx.copy().set_codes([[0, 0, 0, 0], [0, 0]])
8485

86+
# test set_codes with verify_integrity=False
87+
# the setting should not raise any value error
88+
idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]],
89+
verify_integrity=False)
90+
91+
# code value smaller than -1
92+
with pytest.raises(ValueError, match=code_value_error):
93+
MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]])
94+
95+
96+
def test_na_levels():
97+
# GH26408
98+
# test if codes are re-assigned value -1 for levels
99+
# with missing values (NaN, NaT, None)
100+
result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
101+
codes=[[0, -1, 1, 2, 3, 4]])
102+
expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]],
103+
codes=[[-1, -1, -1, -1, 3, 4]])
104+
tm.assert_index_equal(result, expected)
105+
106+
result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
107+
codes=[[0, -1, 1, 2, 3, 4]])
108+
expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]],
109+
codes=[[-1, -1, 1, -1, 3, -1]])
110+
tm.assert_index_equal(result, expected)
111+
112+
# verify set_levels and set_codes
113+
result = MultiIndex(
114+
levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels(
115+
[[np.nan, 's', pd.NaT, 128, None]])
116+
tm.assert_index_equal(result, expected)
117+
118+
result = MultiIndex(
119+
levels=[[np.nan, 's', pd.NaT, 128, None]],
120+
codes=[[1, 2, 2, 2, 2, 2]]).set_codes(
121+
[[0, -1, 1, 2, 3, 4]])
122+
tm.assert_index_equal(result, expected)
123+
85124

86125
def test_labels_deprecated(idx):
87126
# GH23752

pandas/tests/indexes/multi/test_missing.py

+15
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,21 @@ def test_dropna():
7373
with pytest.raises(ValueError, match=msg):
7474
idx.dropna(how='xxx')
7575

76+
# GH26408
77+
# test if missing values are dropped for MultiIndex constructed
78+
# from codes and levels
79+
idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2],
80+
[np.nan, None, pd.NaT, "128", 2]],
81+
codes=[[0, -1, 1, 2, 3, 4],
82+
[0, -1, 3, 3, 3, 4]])
83+
expected = MultiIndex.from_arrays([["128", 2], ["128", 2]])
84+
tm.assert_index_equal(idx.dropna(), expected)
85+
tm.assert_index_equal(idx.dropna(how='any'), expected)
86+
87+
expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2],
88+
["128", "128", "128", 2]])
89+
tm.assert_index_equal(idx.dropna(how='all'), expected)
90+
7691

7792
def test_nulls(idx):
7893
# this is really a smoke test for the methods

0 commit comments

Comments
 (0)