Skip to content

Commit 04ea554

Browse files
rhshadrach authored and im-vinicius committed
BUG: DataFrame.stack sorts columns (pandas-dev#53787)
* BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels
* revert ignoring warnings
* BUG: DataFrame.stack sorting columns
* whatsnew
* Docstring fixup
* Merge cleanup
1 parent 2449dcd commit 04ea554

File tree

4 files changed

+51
-25
lines changed

4 files changed

+51
-25
lines changed

doc/source/whatsnew/v2.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -497,9 +497,9 @@ Reshaping
497497
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
498498
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
499499
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
500+
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
500501
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
501502
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
502-
-
503503

504504
Sparse
505505
^^^^^^

pandas/core/frame.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -9108,11 +9108,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
91089108
cat 1.0 2.0
91099109
dog 3.0 4.0
91109110
>>> df_multi_level_cols2.stack()
9111-
height weight
9112-
cat kg NaN 1.0
9113-
m 2.0 NaN
9114-
dog kg NaN 3.0
9115-
m 4.0 NaN
9111+
weight height
9112+
cat kg 1.0 NaN
9113+
m NaN 2.0
9114+
dog kg 3.0 NaN
9115+
m NaN 4.0
91169116
91179117
**Prescribing the level(s) to be stacked**
91189118
@@ -9147,16 +9147,16 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
91479147
cat NaN 1.0
91489148
dog 2.0 3.0
91499149
>>> df_multi_level_cols3.stack(dropna=False)
9150-
height weight
9150+
weight height
91519151
cat kg NaN NaN
9152-
m 1.0 NaN
9153-
dog kg NaN 2.0
9154-
m 3.0 NaN
9152+
m NaN 1.0
9153+
dog kg 2.0 NaN
9154+
m NaN 3.0
91559155
>>> df_multi_level_cols3.stack(dropna=True)
9156-
height weight
9157-
cat m 1.0 NaN
9158-
dog kg NaN 2.0
9159-
m 3.0 NaN
9156+
weight height
9157+
cat m NaN 1.0
9158+
dog kg 2.0 NaN
9159+
m NaN 3.0
91609160
"""
91619161
from pandas.core.reshape.reshape import (
91629162
stack,

pandas/core/reshape/reshape.py

+5
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,11 @@ def _convert_level_number(level_num: int, columns: Index):
828828

829829
result = frame._constructor(new_data, index=new_index, columns=new_columns)
830830

831+
if frame.columns.nlevels > 1:
832+
desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
833+
if not result.columns.equals(desired_columns):
834+
result = result[desired_columns]
835+
831836
# more efficient way to go about this? can do the whole masking biz but
832837
# will only save a small amount of time...
833838
if dropna:

pandas/tests/frame/test_stack_unstack.py

+32-11
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,10 @@ def test_unstack_level_binding(self):
516516

517517
expected = DataFrame(
518518
np.array(
519-
[[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
519+
[[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64
520520
),
521521
index=expected_mi,
522-
columns=Index(["a", "b"], name="third"),
522+
columns=Index(["b", "a"], name="third"),
523523
)
524524

525525
tm.assert_frame_equal(result, expected)
@@ -1536,7 +1536,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data):
15361536

15371537
# columns unsorted
15381538
unstacked = ymd.unstack()
1539-
unstacked = unstacked.sort_index(axis=1, ascending=False)
15401539
restacked = unstacked.stack()
15411540
tm.assert_frame_equal(restacked, ymd)
15421541

@@ -2000,18 +1999,20 @@ def __init__(self, *args, **kwargs) -> None:
20001999
),
20012000
)
20022001
@pytest.mark.parametrize("stack_lev", range(2))
2003-
def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
2002+
@pytest.mark.parametrize("sort", [True, False])
2003+
def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
20042004
# GH#16323
20052005
# deep check for 1-row case
20062006
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20072007
df = DataFrame(columns=columns, data=[range(4)])
2008-
df_stacked = df.stack(stack_lev)
2009-
assert all(
2010-
df.loc[row, col]
2011-
== df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]]
2012-
for row in df.index
2013-
for col in df.columns
2014-
)
2008+
df_stacked = df.stack(stack_lev, sort=sort)
2009+
for row in df.index:
2010+
for col in df.columns:
2011+
expected = df.loc[row, col]
2012+
result_row = row, col[stack_lev]
2013+
result_col = col[1 - stack_lev]
2014+
result = df_stacked.loc[result_row, result_col]
2015+
assert result == expected
20152016

20162017
def test_stack_order_with_unsorted_levels_multi_row(self):
20172018
# GH#16323
@@ -2030,6 +2031,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self):
20302031
for col in df.columns
20312032
)
20322033

2034+
def test_stack_order_with_unsorted_levels_multi_row_2(self):
2035+
# GH#53636
2036+
levels = ((0, 1), (1, 0))
2037+
stack_lev = 1
2038+
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
2039+
df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
2040+
result = df.stack(stack_lev, sort=True)
2041+
expected_index = MultiIndex(
2042+
levels=[[0, 1, 2, 3], [0, 1]],
2043+
codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
2044+
)
2045+
expected = DataFrame(
2046+
{
2047+
0: [0, 1, 0, 1, 0, 1, 0, 1],
2048+
1: [2, 3, 2, 3, 2, 3, 2, 3],
2049+
},
2050+
index=expected_index,
2051+
)
2052+
tm.assert_frame_equal(result, expected)
2053+
20332054
def test_stack_unstack_unordered_multiindex(self):
20342055
# GH# 18265
20352056
values = np.arange(5)

0 commit comments

Comments
 (0)