diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index dd67d1f158c47..d656867965764 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -470,6 +470,7 @@ Reshaping - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`) - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`) +- Bug in :meth:`DataFrame.stack` where results would be incorrectly ordered when ``sort=True`` and the input had :class:`MultiIndex` levels that were not sorted (:issue:`53636`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3866d30e9c757..f6ce9955bc2bc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -756,7 +756,16 @@ def _convert_level_number(level_num: int, columns: Index): level_vals = mi_cols.levels[-1] level_codes = unique(mi_cols.codes[-1]) if sort: + _, index, inverse = np.unique( + level_vals, return_index=True, return_inverse=True + ) + sorted_level_vals = np.take(level_vals, index) level_codes = np.sort(level_codes) + # Take level_codes according to where level_vals get sorted to, while + # also allowing for NA (-1) values + level_codes = np.where(level_codes == -1, -1, np.take(inverse, level_codes)) + else: + sorted_level_vals = level_vals level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) @@ -818,7 +827,7 @@ def _convert_level_number(level_num: int, columns: Index): new_codes = 
[old_codes.repeat(levsize)] new_names = [this.index.name] # something better? - new_levels.append(level_vals) + new_levels.append(sorted_level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2818df721db34..9900db7910c46 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1998,18 +1998,20 @@ def __init__(self, *args, **kwargs) -> None: ), ) @pytest.mark.parametrize("stack_lev", range(2)) - def test_stack_order_with_unsorted_levels(self, levels, stack_lev): + @pytest.mark.parametrize("sort", [True, False]) + def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort): # GH#16323 # deep check for 1-row case columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) - df_stacked = df.stack(stack_lev) - assert all( - df.loc[row, col] - == df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]] - for row in df.index - for col in df.columns - ) + df_stacked = df.stack(stack_lev, sort=sort) + for row in df.index: + for col in df.columns: + expected = df.loc[row, col] + result_row = row, col[stack_lev] + result_col = col[1 - stack_lev] + result = df_stacked.loc[result_row, result_col] + assert result == expected def test_stack_order_with_unsorted_levels_multi_row(self): # GH#16323 @@ -2028,6 +2030,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self): for col in df.columns ) + def test_stack_order_with_unsorted_levels_multi_row_2(self): + # GH#53636 + levels = ((0, 1), (1, 0)) + stack_lev = 1 + columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) + df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3]) + result = df.stack(stack_lev, sort=True) + expected_index = MultiIndex( + levels=[[0, 1, 2, 3], [0, 1]], + codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 
1, 0, 1, 0]], + ) + expected = DataFrame( + { + 0: [0, 1, 0, 1, 0, 1, 0, 1], + 1: [2, 3, 2, 3, 2, 3, 2, 3], + }, + index=expected_index, + ) + tm.assert_frame_equal(result, expected) + def test_stack_unstack_unordered_multiindex(self): # GH# 18265 values = np.arange(5)