diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 49518c57fc846..71f427c3268e7 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -77,6 +77,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + +.. _whatsnew_0250.api_breaking.multi_indexing: + + +MultiIndex constructed from levels and codes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Constructing a :class:`MultiIndex` with NaN levels or codes value < -1 was allowed previously. +Now, construction with codes value < -1 is not allowed and NaN levels' corresponding codes +would be reassigned as -1. (:issue:`19387`) + +.. ipython:: python + + mi1 = pd.MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + mi2 = pd.MultiIndex(levels=[[1, 2]], codes=[[0, -2]]) + +*Previous Behavior*: + +.. code-block:: ipython + + In [1]: mi1 + Out[1]: MultiIndex(levels=[[nan, None, NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + In [2]: mi2 + Out[2]: MultiIndex(levels=[[1, 2]], + codes=[[0, -2]]) + +*New Behavior*: + +.. ipython:: python + + mi1 + mi2 + + .. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: GroupBy.apply on ``DataFrame`` evaluates first group only once @@ -388,7 +424,6 @@ MultiIndex - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - -- I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7a933bdcb0953..6dc0aedfdec06 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -242,11 +242,35 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, result.sortorder = sortorder if verify_integrity: - result._verify_integrity() + new_codes = result._verify_integrity() + result._codes = new_codes + if _set_identity: result._reset_identity() + return result + def _validate_codes(self, level: list, code: list): + """ + Reassign code values as -1 if their corresponding levels are NaN. + + Parameters + ---------- + code : list + Code to reassign. + level : list + Level to check for missing values (NaN, NaT, None). + + Returns + ------- + code : new code where code value = -1 if it corresponds + to a level with missing values (NaN, NaT, None). + """ + null_mask = isna(level) + if np.any(null_mask): + code = np.where(null_mask[code], -1, code) + return code + def _verify_integrity(self, codes=None, levels=None): """ @@ -262,6 +286,11 @@ def _verify_integrity(self, codes=None, levels=None): ValueError If length of levels and codes don't match, if the codes for any level would exceed level bounds, or there are any duplicate levels. + + Returns + ------- + codes : new codes where code value = -1 if it corresponds to a + NaN level. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -271,22 +300,33 @@ def _verify_integrity(self, codes=None, levels=None): if len(levels) != len(codes): raise ValueError("Length of levels and codes must match. NOTE:" " this index is in an inconsistent state.") - codes_length = len(self.codes[0]) + codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError("Unequal code lengths: %s" % ([len(code_) for code_ in codes])) if len(level_codes) and level_codes.max() >= len(level): - raise ValueError("On level %d, code max (%d) >= length of" - " level (%d). NOTE: this index is in an" - " inconsistent state" % (i, level_codes.max(), - len(level))) + msg = ("On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), + level_len=len(level))) + raise ValueError(msg) + if len(level_codes) and level_codes.min() < -1: + raise ValueError("On level {level}, code value ({code})" + " < -1".format( + level=i, code=level_codes.min())) if not level.is_unique: raise ValueError("Level values must be unique: {values} on " "level {level}".format( values=[value for value in level], level=i)) + codes = [self._validate_codes(level, code) + for level, code in zip(levels, codes)] + new_codes = FrozenList(codes) + return new_codes + @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ @@ -585,7 +625,8 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, new_levels = FrozenList(new_levels) if verify_integrity: - self._verify_integrity(levels=new_levels) + new_codes = self._verify_integrity(levels=new_levels) + self._codes = new_codes names = self.names self._levels = new_levels @@ -675,7 +716,6 @@ def labels(self): def _set_codes(self, codes, level=None, copy=False, validate=True, verify_integrity=False): - if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): @@ -695,9 +735,10 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, new_codes = FrozenList(new_codes) if verify_integrity: - self._verify_integrity(codes=new_codes) + new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes + self._tuples = None self._reset_cache() @@ -1760,9 +1801,10 @@ def __setstate__(self, state): self._set_levels([Index(x) for x in levels], validate=False) self._set_codes(codes) + new_codes = self._verify_integrity() + self._set_codes(new_codes) self._set_names(names) self.sortorder = sortorder - self._verify_integrity() self._reset_identity() def __getitem__(self, key): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 639e6aee87c6f..197c06aacf90e 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -64,9 +64,10 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." + length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." " NOTE: this index is in an inconsistent state") label_error = r"Unequal code lengths: \[4, 2\]" + code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. with pytest.raises(ValueError, match=length_error): @@ -83,6 +84,44 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) + # test set_codes with verify_integrity=False + # the setting should not raise any value error + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], + verify_integrity=False) + + # code value smaller than -1 + with pytest.raises(ValueError, match=code_value_error): + MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) + + +def test_na_levels(): + # GH26408 + # test if codes are re-assigned value -1 for levels + # with mising values (NaN, NaT, None) + result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], + codes=[[-1, -1, -1, -1, 3, 4]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[0, -1, 1, 2, 3, 4]]) + expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[-1, -1, 1, -1, 3, -1]]) + tm.assert_index_equal(result, expected) + + # verify set_levels and set_codes + result = MultiIndex( + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( + [[np.nan, 's', pd.NaT, 128, None]]) + tm.assert_index_equal(result, expected) + + result = MultiIndex( + levels=[[np.nan, 's', pd.NaT, 128, None]], + codes=[[1, 2, 2, 2, 2, 2]]).set_codes( + [[0, -1, 1, 2, 3, 4]]) + tm.assert_index_equal(result, expected) + def test_labels_deprecated(idx): # GH23752 diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index ed90f74d80989..518c12bb20e13 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -73,6 +73,21 @@ def test_dropna(): with pytest.raises(ValueError, match=msg): idx.dropna(how='xxx') + # GH26408 + # test if missing values are dropped for mutiindex constructed + # from codes and values + idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], + [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], + [0, -1, 3, 3, 3, 4]]) + expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) + tm.assert_index_equal(idx.dropna(), expected) + tm.assert_index_equal(idx.dropna(how='any'), expected) + + expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2], + ["128", "128", "128", 2]]) + tm.assert_index_equal(idx.dropna(how='all'), expected) + def test_nulls(idx): # this is really a smoke test for the methods