Skip to content

Commit e92fe43

Browse files
rhshadrach authored and im-vinicius committed
BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels (pandas-dev#53637)
* BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels
* revert ignoring warnings
1 parent 7b7923b commit e92fe43

File tree

3 files changed

+41
-9
lines changed

3 files changed

+41
-9
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ Reshaping
470470
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
471471
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
472472
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
473+
- Bug in :meth:`DataFrame.stack` where results were incorrectly ordered when ``sort=True`` and the input had :class:`MultiIndex` levels that were not sorted (:issue:`53636`)
473474
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
474475
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
475476
-

pandas/core/reshape/reshape.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,16 @@ def _convert_level_number(level_num: int, columns: Index):
756756
level_vals = mi_cols.levels[-1]
757757
level_codes = unique(mi_cols.codes[-1])
758758
if sort:
759+
_, index, inverse = np.unique(
760+
level_vals, return_index=True, return_inverse=True
761+
)
762+
sorted_level_vals = np.take(level_vals, index)
759763
level_codes = np.sort(level_codes)
764+
# Take level_codes according to where level_vals get sorted to, while
765+
# also allowing for NA (-1) values
766+
level_codes = np.where(level_codes == -1, -1, np.take(inverse, level_codes))
767+
else:
768+
sorted_level_vals = level_vals
760769
level_vals_nan = level_vals.insert(len(level_vals), None)
761770

762771
level_vals_used = np.take(level_vals_nan, level_codes)
@@ -818,7 +827,7 @@ def _convert_level_number(level_num: int, columns: Index):
818827
new_codes = [old_codes.repeat(levsize)]
819828
new_names = [this.index.name] # something better?
820829

821-
new_levels.append(level_vals)
830+
new_levels.append(sorted_level_vals)
822831
new_codes.append(np.tile(level_codes, N))
823832
new_names.append(frame.columns.names[level_num])
824833

pandas/tests/frame/test_stack_unstack.py

+30-8
Original file line numberDiff line numberDiff line change
@@ -1998,18 +1998,20 @@ def __init__(self, *args, **kwargs) -> None:
19981998
),
19991999
)
20002000
@pytest.mark.parametrize("stack_lev", range(2))
2001-
def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
2001+
@pytest.mark.parametrize("sort", [True, False])
2002+
def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
20022003
# GH#16323
20032004
# deep check for 1-row case
20042005
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20052006
df = DataFrame(columns=columns, data=[range(4)])
2006-
df_stacked = df.stack(stack_lev)
2007-
assert all(
2008-
df.loc[row, col]
2009-
== df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]]
2010-
for row in df.index
2011-
for col in df.columns
2012-
)
2007+
df_stacked = df.stack(stack_lev, sort=sort)
2008+
for row in df.index:
2009+
for col in df.columns:
2010+
expected = df.loc[row, col]
2011+
result_row = row, col[stack_lev]
2012+
result_col = col[1 - stack_lev]
2013+
result = df_stacked.loc[result_row, result_col]
2014+
assert result == expected
20132015

20142016
def test_stack_order_with_unsorted_levels_multi_row(self):
20152017
# GH#16323
@@ -2028,6 +2030,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self):
20282030
for col in df.columns
20292031
)
20302032

2033+
def test_stack_order_with_unsorted_levels_multi_row_2(self):
2034+
# GH#53636
2035+
levels = ((0, 1), (1, 0))
2036+
stack_lev = 1
2037+
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
2038+
df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
2039+
result = df.stack(stack_lev, sort=True)
2040+
expected_index = MultiIndex(
2041+
levels=[[0, 1, 2, 3], [0, 1]],
2042+
codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
2043+
)
2044+
expected = DataFrame(
2045+
{
2046+
0: [0, 1, 0, 1, 0, 1, 0, 1],
2047+
1: [2, 3, 2, 3, 2, 3, 2, 3],
2048+
},
2049+
index=expected_index,
2050+
)
2051+
tm.assert_frame_equal(result, expected)
2052+
20312053
def test_stack_unstack_unordered_multiindex(self):
20322054
# GH# 18265
20332055
values = np.arange(5)

0 commit comments

Comments
 (0)