From fee7b1a45e0e8b275f07e3ca23c40170da10726c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 5 Oct 2020 03:35:22 +0000 Subject: [PATCH 1/8] BUG: add check so maybe_upcast_putmask is only called with ndarray --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ebe5185ce4488..3f2d4859aee8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4879,7 +4879,7 @@ def _maybe_casted_values(index, labels=None): if issubclass(values_type, DatetimeLikeArray): values = values._data # TODO: can we de-kludge yet? - if mask.any(): + if mask.any() and isinstance(values, np.ndarray): values, _ = maybe_upcast_putmask(values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): From 00e1c28ecd3b7526e45ffd8e4f7bd07957cb1abe Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 5 Oct 2020 04:03:16 +0000 Subject: [PATCH 2/8] TST: add tests --- pandas/tests/indexing/test_categorical.py | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index fae229aecc3d4..4442f0fa64275 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -650,6 +650,32 @@ def test_loc_slice(self): expected = self.df.iloc[[2, 3, 4]] tm.assert_frame_equal(result, expected) + def test_reindexing_with_missing_values(self): + # GH 24206 + + index = pd.MultiIndex( + [pd.CategoricalIndex(["A", "B"]), pd.CategoricalIndex(["a", "b"])], + [[0, 0, -1, 1], [0, 1, 0, 1]], + ) + data = {"col": range(len(index))} + df = DataFrame(data=data, index=index) + + res = df.reset_index() + + expected = pd.DataFrame( + { + "level_0": pd.Categorical.from_codes( + [0, 0, 1, 1], categories=["A", "B"] + ), + "level_1": pd.Categorical.from_codes( + [0, 1, 0, 1], categories=["a", "b"] + ), + "col": range(4), + } + ) + + tm.assert_frame_equal(res, expected) + def test_loc_and_at_with_categorical_index(self): # GH 20629 s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"])) From 05c4c1e0e2aabbba99ce31142730a21c78b78237 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 5 Oct 2020 05:08:15 +0000 Subject: [PATCH 3/8] DOC: add whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ae4d5ea692066..7d1ea292591b0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -387,6 +387,7 @@ Indexing - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) - Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. (:issue:`32334`) +- Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`) Missing ^^^^^^^ From f70368b0b631f11f442104188e57225223b38036 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 9 Oct 2020 04:34:47 +0000 Subject: [PATCH 4/8] feedback: test roundtrip --- pandas/tests/indexing/test_categorical.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 4442f0fa64275..e26661e9ca791 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -660,8 +660,6 @@ def test_reindexing_with_missing_values(self): data = {"col": range(len(index))} df = DataFrame(data=data, index=index) - res = df.reset_index() - expected = pd.DataFrame( { "level_0": pd.Categorical.from_codes( @@ -674,6 +672,11 @@ def test_reindexing_with_missing_values(self): } ) + res = df.reset_index() + tm.assert_frame_equal(res, expected) + + # roundtrip + res = expected.set_index(["level_0", "level_1"]).reset_index() tm.assert_frame_equal(res, expected) def test_loc_and_at_with_categorical_index(self): From 50dbb93b86f87825f5b365eb8474f2cbaa6b616d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 9 Oct 2020 04:45:30 +0000 Subject: [PATCH 5/8] feedback: parametrize on both examples from OP --- pandas/tests/indexing/test_categorical.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index e26661e9ca791..c71a998b4d8cd 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -650,24 +650,22 @@ def test_loc_slice(self): expected = self.df.iloc[[2, 3, 4]] tm.assert_frame_equal(result, expected) - def test_reindexing_with_missing_values(self): + @pytest.mark.parametrize( + "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) + ) + def test_reindexing_with_missing_values(self, codes): # GH 24206 index = pd.MultiIndex( - [pd.CategoricalIndex(["A", "B"]), pd.CategoricalIndex(["a", "b"])], - [[0, 0, -1, 1], [0, 1, 0, 1]], + [pd.CategoricalIndex(["A", "B"]), pd.CategoricalIndex(["a", "b"])], codes ) data = {"col": range(len(index))} df = DataFrame(data=data, index=index) expected = pd.DataFrame( { - "level_0": pd.Categorical.from_codes( - [0, 0, 1, 1], categories=["A", "B"] - ), - "level_1": pd.Categorical.from_codes( - [0, 1, 0, 1], categories=["a", "b"] - ), + "level_0": pd.Categorical.from_codes(codes[0], categories=["A", "B"]), + "level_1": pd.Categorical.from_codes(codes[1], categories=["a", "b"]), "col": range(4), } ) From 17e73a52c5bf5b4478872492dd63673d1e4422c0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 9 Oct 2020 05:00:46 +0000 Subject: [PATCH 6/8] move test to frame/test_alter_axes.py --- pandas/tests/frame/test_alter_axes.py | 61 +++++++++++++++++------ pandas/tests/indexing/test_categorical.py | 27 ---------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 486855f5c37cd..deac1792737a1 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -12,10 +12,12 @@ from pandas import ( Categorical, + CategoricalIndex, DataFrame, DatetimeIndex, Index, IntervalIndex, + MultiIndex, Series, Timestamp, cut, @@ -171,21 +173,6 @@ def test_assign_columns(self, float_frame): tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) - def test_set_index_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame( - { - "A": [1, 2, 1, 1, 2], - "B": [10, 16, 22, 28, 34], - "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), - "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), - } - ) - for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: - result = df.set_index(cols).reset_index() - result = result.reindex(columns=df.columns) - tm.assert_frame_equal(result, df) - def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) @@ -266,3 +253,47 @@ def test_set_reset_index(self): df = df.set_index("B") df = df.reset_index() + + +class TestCategoricalIndex: + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize( + "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) + ) + def test_reindexing_with_missing_values(self, codes): + # GH 24206 + + index = MultiIndex( + [CategoricalIndex(["A", "B"]), CategoricalIndex(["a", "b"])], codes + ) + data = {"col": range(len(index))} + df = DataFrame(data=data, index=index) + + expected = DataFrame( + { + "level_0": Categorical.from_codes(codes[0], categories=["A", "B"]), + "level_1": Categorical.from_codes(codes[1], categories=["a", "b"]), + "col": range(4), + } + ) + + res = df.reset_index() + tm.assert_frame_equal(res, expected) + + # roundtrip + res = expected.set_index(["level_0", "level_1"]).reset_index() + tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c71a998b4d8cd..fae229aecc3d4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -650,33 +650,6 @@ def test_loc_slice(self): expected = self.df.iloc[[2, 3, 4]] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "codes", ([[0, 0, 1, 1], [0, 1, 0, 1]], [[0, 0, -1, 1], [0, 1, 0, 1]]) - ) - def test_reindexing_with_missing_values(self, codes): - # GH 24206 - - index = pd.MultiIndex( - [pd.CategoricalIndex(["A", "B"]), pd.CategoricalIndex(["a", "b"])], codes - ) - data = {"col": range(len(index))} - df = DataFrame(data=data, index=index) - - expected = pd.DataFrame( - { - "level_0": pd.Categorical.from_codes(codes[0], categories=["A", "B"]), - "level_1": pd.Categorical.from_codes(codes[1], categories=["a", "b"]), - "col": range(4), - } - ) - - res = df.reset_index() - tm.assert_frame_equal(res, expected) - - # roundtrip - res = expected.set_index(["level_0", "level_1"]).reset_index() - tm.assert_frame_equal(res, expected) - def test_loc_and_at_with_categorical_index(self): # GH 20629 s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"])) From f55846f4c75a1c1c321d71f35b22b396fe050656 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 9 Oct 2020 05:12:29 +0000 Subject: [PATCH 7/8] fill mask in index with nan when not calling maybe_upcast_putmask --- pandas/core/frame.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f2d4859aee8b..425998db3f481 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4879,8 +4879,12 @@ def _maybe_casted_values(index, labels=None): if issubclass(values_type, DatetimeLikeArray): values = values._data # TODO: can we de-kludge yet? - if mask.any() and isinstance(values, np.ndarray): - values, _ = maybe_upcast_putmask(values, mask, np.nan) + if mask.any(): + if isinstance(values, np.ndarray): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + else: + # GH24206 + values[mask] = np.nan if issubclass(values_type, DatetimeLikeArray): values = values_type(values, dtype=values_dtype) From 8d1df1f55b22d9485ced08e2d0209cfbc6e57f97 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 11 Oct 2020 05:51:43 +0000 Subject: [PATCH 8/8] restore the fix --- pandas/core/dtypes/cast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1cea817abbaa3..a7379376c2f78 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -493,7 +493,10 @@ def maybe_casted_values(index, codes=None): values = values._data # TODO: can we de-kludge yet? if mask.any(): - values, _ = maybe_upcast_putmask(values, mask, np.nan) + if isinstance(values, np.ndarray): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + else: + values[mask] = np.nan if issubclass(values_type, DatetimeLikeArrayMixin): values = values_type(values, dtype=values_dtype)