diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2b4b10c39602a..bd3112403b31b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -392,6 +392,7 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) +- Bug in :meth:`DataFrame.reset_index` where a ``ValueError`` was incorrectly raised for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`) Missing ^^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1cea817abbaa3..a7379376c2f78 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -493,7 +493,10 @@ def maybe_casted_values(index, codes=None): values = values._data # TODO: can we de-kludge yet? 
if mask.any(): - values, _ = maybe_upcast_putmask(values, mask, np.nan) + if isinstance(values, np.ndarray): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + else: + values[mask] = np.nan if issubclass(values_type, DatetimeLikeArrayMixin): values = values_type(values, dtype=values_dtype) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 486855f5c37cd..deac1792737a1 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -12,10 +12,12 @@ from pandas import ( Categorical, + CategoricalIndex, DataFrame, DatetimeIndex, Index, IntervalIndex, + MultiIndex, Series, Timestamp, cut, @@ -171,21 +173,6 @@ def test_assign_columns(self, float_frame): tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) - def test_set_index_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame( - { - "A": [1, 2, 1, 1, 2], - "B": [10, 16, 22, 28, 34], - "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), - "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), - } - ) - for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: - result = df.set_index(cols).reset_index() - result = result.reindex(columns=df.columns) - tm.assert_frame_equal(result, df) - def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) @@ -266,3 +253,47 @@ def test_set_reset_index(self): df = df.set_index("B") df = df.reset_index() + + +class TestCategoricalIndex: + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", 
"C2"]]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) + ) + def test_reindexing_with_missing_values(self, codes): + # GH 24206 + + index = MultiIndex( + [CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes + ) + data = {"col": range(len(index))} + df = DataFrame(data=data, index=index) + + expected = DataFrame( + { + "level_0": Categorical.from_codes(codes[0], categories=["A", "B"]), + "level_1": Categorical.from_codes(codes[1], categories=["a", "b"]), + "col": range(4), + } + ) + + res = df.reset_index() + tm.assert_frame_equal(res, expected) + + # roundtrip + res = expected.set_index(["level_0", "level_1"]).reset_index() + tm.assert_frame_equal(res, expected)