From 0bc28fac03b92eae84b670a4522bc2f404111c89 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Sat, 7 Sep 2019 12:00:50 -0700 Subject: [PATCH 1/3] GH28301 check for non-unique index in stack_multi_columns --- pandas/core/reshape/reshape.py | 5 +++-- pandas/tests/frame/test_reshape.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 374de6156c807..c32ca47c19160 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -725,8 +725,9 @@ def _convert_level_number(level_num, columns): new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: - new_levels = [this.index] - new_codes = [np.arange(N).repeat(levsize)] + old_codes, old_levels = _factorize_from_iterable(this.index) + new_levels = [old_levels] + new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 84e343f07f990..8926644f59d1d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1002,6 +1002,26 @@ def test_stack_preserve_categorical_dtype_values(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "index, columns", + [ + ([0, 0, 1, 1], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ([0, 0, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ([0, 1, 2, 3], pd.MultiIndex.from_product([[1, 2], ["a", "b"]])), + ], + ) + def test_stack_multi_columns_non_unique_index(self, index, columns): + # GH-28301 + df = pd.DataFrame(index=index, columns=columns).fillna(1) + stacked = df.stack(-1) + new_index = pd.MultiIndex.from_tuples(stacked.index.values) + + tm.assert_index_equal(stacked.index, new_index) + + a = np.asarray(stacked.index.codes) + b = np.asarray(new_index.codes) + tm.assert_numpy_array_equal(a, 
b) + @pytest.mark.parametrize("level", [0, 1]) def test_unstack_mixed_extension_types(self, level): index = pd.MultiIndex.from_tuples( From 92c6b871b93521c835becdf3d70099d6c3fb28a8 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Sat, 7 Sep 2019 15:28:12 -0700 Subject: [PATCH 2/3] Added whatsnew and changed test to compare entire frame --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/tests/frame/test_reshape.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 58892b316c940..e3db8edee8521 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -184,7 +184,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue:`28301`) - Sparse diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 8926644f59d1d..a5fd419d4fc39 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1013,14 +1013,13 @@ def test_stack_preserve_categorical_dtype_values(self): def test_stack_multi_columns_non_unique_index(self, index, columns): # GH-28301 df = pd.DataFrame(index=index, columns=columns).fillna(1) - stacked = df.stack(-1) - new_index = pd.MultiIndex.from_tuples(stacked.index.values) - - tm.assert_index_equal(stacked.index, new_index) - - a = np.asarray(stacked.index.codes) - b = np.asarray(new_index.codes) - tm.assert_numpy_array_equal(a, b) + stacked = df.stack() + new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy()) + expected = pd.DataFrame(stacked.to_numpy(), index=new_index, columns=stacked.columns) + tm.assert_frame_equal(stacked, expected) + stacked_codes = np.asarray(stacked.index.codes) + expected_codes = np.asarray(new_index.codes) + tm.assert_numpy_array_equal(stacked_codes, expected_codes) @pytest.mark.parametrize("level", [0, 1]) def 
test_unstack_mixed_extension_types(self, level): From 5cc68250e9f95104047b4bbe6be6d02558ba2c90 Mon Sep 17 00:00:00 2001 From: Chris Zimmerman Date: Sat, 7 Sep 2019 15:28:38 -0700 Subject: [PATCH 3/3] code formatting --- pandas/tests/frame/test_reshape.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index a5fd419d4fc39..eb654be3f12e6 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1015,7 +1015,9 @@ def test_stack_multi_columns_non_unique_index(self, index, columns): df = pd.DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack() new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy()) - expected = pd.DataFrame(stacked.to_numpy(), index=new_index, columns=stacked.columns) + expected = pd.DataFrame( + stacked.to_numpy(), index=new_index, columns=stacked.columns + ) tm.assert_frame_equal(stacked, expected) stacked_codes = np.asarray(stacked.index.codes) expected_codes = np.asarray(new_index.codes)