BUG: DataFrame.stack sorts columns (pandas-dev#53787)

rhshadrach · im-vinicius · commit 04ea554eef98 · 2023-07-08T12:28:10.000+02:00
* BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels

* revert ignoring warnings

* BUG: DataFrame.stack sorting columns

* Add whatsnew entry

* Fix up DataFrame.stack docstring examples

* Merge cleanup
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -497,9 +497,9 @@ Reshaping
 - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
 - Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
 - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
+- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
 - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
 - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
--
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9108,11 +9108,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
         cat    1.0    2.0
         dog    3.0    4.0
         >>> df_multi_level_cols2.stack()
-                height  weight
-        cat kg     NaN     1.0
-            m      2.0     NaN
-        dog kg     NaN     3.0
-            m      4.0     NaN
+                weight  height
+        cat kg     1.0     NaN
+            m      NaN     2.0
+        dog kg     3.0     NaN
+            m      NaN     4.0
 
         **Prescribing the level(s) to be stacked**
 
@@ -9147,16 +9147,16 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
         cat    NaN    1.0
         dog    2.0    3.0
         >>> df_multi_level_cols3.stack(dropna=False)
-                height  weight
+                weight  height
         cat kg     NaN     NaN
-            m      1.0     NaN
-        dog kg     NaN     2.0
-            m      3.0     NaN
+            m      NaN     1.0
+        dog kg     2.0     NaN
+            m      NaN     3.0
         >>> df_multi_level_cols3.stack(dropna=True)
-                height  weight
-        cat m      1.0     NaN
-        dog kg     NaN     2.0
-            m      3.0     NaN
+                weight  height
+        cat m      NaN     1.0
+        dog kg     2.0     NaN
+            m      NaN     3.0
         """
         from pandas.core.reshape.reshape import (
             stack,
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -828,6 +828,11 @@ def _convert_level_number(level_num: int, columns: Index):
 
     result = frame._constructor(new_data, index=new_index, columns=new_columns)
 
+    if frame.columns.nlevels > 1:
+        desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
+        if not result.columns.equals(desired_columns):
+            result = result[desired_columns]
+
     # more efficient way to go about this? can do the whole masking biz but
     # will only save a small amount of time...
     if dropna:
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -516,10 +516,10 @@ def test_unstack_level_binding(self):
 
         expected = DataFrame(
             np.array(
-                [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
+                [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64
             ),
             index=expected_mi,
-            columns=Index(["a", "b"], name="third"),
+            columns=Index(["b", "a"], name="third"),
         )
 
         tm.assert_frame_equal(result, expected)
@@ -1536,7 +1536,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data):
 
         # columns unsorted
         unstacked = ymd.unstack()
-        unstacked = unstacked.sort_index(axis=1, ascending=False)
         restacked = unstacked.stack()
         tm.assert_frame_equal(restacked, ymd)
 
@@ -2000,18 +1999,20 @@ def __init__(self, *args, **kwargs) -> None:
         ),
     )
     @pytest.mark.parametrize("stack_lev", range(2))
-    def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
+    @pytest.mark.parametrize("sort", [True, False])
+    def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
         # GH#16323
         # deep check for 1-row case
         columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
         df = DataFrame(columns=columns, data=[range(4)])
-        df_stacked = df.stack(stack_lev)
-        assert all(
-            df.loc[row, col]
-            == df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]]
-            for row in df.index
-            for col in df.columns
-        )
+        df_stacked = df.stack(stack_lev, sort=sort)
+        for row in df.index:
+            for col in df.columns:
+                expected = df.loc[row, col]
+                result_row = row, col[stack_lev]
+                result_col = col[1 - stack_lev]
+                result = df_stacked.loc[result_row, result_col]
+                assert result == expected
 
     def test_stack_order_with_unsorted_levels_multi_row(self):
         # GH#16323
@@ -2030,6 +2031,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self):
             for col in df.columns
         )
 
+    def test_stack_order_with_unsorted_levels_multi_row_2(self):
+        # GH#53636: sort=True with an unsorted stacked level and non-monotonic index
+        levels = ((0, 1), (1, 0))
+        stack_lev = 1
+        columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
+        result = df.stack(stack_lev, sort=True)
+        expected_index = MultiIndex(
+            levels=[[0, 1, 2, 3], [0, 1]],
+            codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
+        )
+        expected = DataFrame(
+            {
+                0: [0, 1, 0, 1, 0, 1, 0, 1],
+                1: [2, 3, 2, 3, 2, 3, 2, 3],
+            },
+            index=expected_index,
+        )
+        tm.assert_frame_equal(result, expected)
+
     def test_stack_unstack_unordered_multiindex(self):
         # GH# 18265
         values = np.arange(5)