Skip to content

Commit 9372d21

Browse files
authored
Revert "BUG: DataFrame.stack sometimes sorting the resulting index" (#54068)
Revert "BUG: DataFrame.stack sometimes sorting the resulting index (#53825)". This reverts commit 5307062.
1 parent 7c876ed commit 9372d21

File tree

4 files changed: +53 / -48 lines changed

doc/source/whatsnew/v2.1.0.rst

+2-2
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,7 @@ Other enhancements
154154
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
155155
- Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
156156
- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
157+
- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
157158
- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
158159
- :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`)
159160
- :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`)
@@ -570,8 +571,7 @@ Reshaping
570571
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
571572
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
572573
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
573-
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically in rare cases (:issue:`53786`)
574-
- Bug in :meth:`DataFrame.stack` sorting index lexicographically in rare cases (:issue:`53824`)
574+
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
575575
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
576576
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
577577
- Bug when joining empty :class:`DataFrame` objects, where the joined index would be a :class:`RangeIndex` instead of the joined index type (:issue:`52777`)

pandas/core/frame.py

+13-11
Original file line number | Diff line number | Diff line change
@@ -9031,7 +9031,7 @@ def pivot_table(
90319031
sort=sort,
90329032
)
90339033

9034-
def stack(self, level: IndexLabel = -1, dropna: bool = True):
9034+
def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
90359035
"""
90369036
Stack the prescribed level(s) from columns to index.
90379037
@@ -9057,6 +9057,8 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True):
90579057
axis can create combinations of index and column values
90589058
that are missing from the original dataframe. See Examples
90599059
section.
9060+
sort : bool, default True
9061+
Whether to sort the levels of the resulting MultiIndex.
90609062
90619063
Returns
90629064
-------
@@ -9156,15 +9158,15 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True):
91569158
91579159
>>> df_multi_level_cols2.stack(0)
91589160
kg m
9159-
cat weight 1.0 NaN
9160-
height NaN 2.0
9161-
dog weight 3.0 NaN
9162-
height NaN 4.0
9161+
cat height NaN 2.0
9162+
weight 1.0 NaN
9163+
dog height NaN 4.0
9164+
weight 3.0 NaN
91639165
>>> df_multi_level_cols2.stack([0, 1])
9164-
cat weight kg 1.0
9165-
height m 2.0
9166-
dog weight kg 3.0
9167-
height m 4.0
9166+
cat height m 2.0
9167+
weight kg 1.0
9168+
dog height m 4.0
9169+
weight kg 3.0
91689170
dtype: float64
91699171
91709172
**Dropping missing values**
@@ -9200,9 +9202,9 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True):
92009202
)
92019203

92029204
if isinstance(level, (tuple, list)):
9203-
result = stack_multiple(self, level, dropna=dropna)
9205+
result = stack_multiple(self, level, dropna=dropna, sort=sort)
92049206
else:
9205-
result = stack(self, level, dropna=dropna)
9207+
result = stack(self, level, dropna=dropna, sort=sort)
92069208

92079209
return result.__finalize__(self, method="stack")
92089210

pandas/core/reshape/reshape.py

+24-22
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import itertools
34
from typing import (
45
TYPE_CHECKING,
56
cast,
@@ -498,7 +499,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
498499
if isinstance(obj.index, MultiIndex):
499500
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
500501
else:
501-
return obj.T.stack(dropna=False)
502+
return obj.T.stack(dropna=False, sort=sort)
502503
elif not isinstance(obj.index, MultiIndex):
503504
# GH 36113
504505
# Give nicer error messages when unstack a Series whose
@@ -571,7 +572,7 @@ def _unstack_extension_series(
571572
return result
572573

573574

574-
def stack(frame: DataFrame, level=-1, dropna: bool = True):
575+
def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
575576
"""
576577
Convert DataFrame to Series with multi-level Index. Columns become the
577578
second level of the resulting hierarchical index
@@ -593,7 +594,9 @@ def factorize(index):
593594
level_num = frame.columns._get_level_number(level)
594595

595596
if isinstance(frame.columns, MultiIndex):
596-
return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
597+
return _stack_multi_columns(
598+
frame, level_num=level_num, dropna=dropna, sort=sort
599+
)
597600
elif isinstance(frame.index, MultiIndex):
598601
new_levels = list(frame.index.levels)
599602
new_codes = [lab.repeat(K) for lab in frame.index.codes]
@@ -646,13 +649,13 @@ def factorize(index):
646649
return frame._constructor_sliced(new_values, index=new_index)
647650

648651

649-
def stack_multiple(frame: DataFrame, level, dropna: bool = True):
652+
def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
650653
# If all passed levels match up to column names, no
651654
# ambiguity about what to do
652655
if all(lev in frame.columns.names for lev in level):
653656
result = frame
654657
for lev in level:
655-
result = stack(result, lev, dropna=dropna)
658+
result = stack(result, lev, dropna=dropna, sort=sort)
656659

657660
# Otherwise, level numbers may change as each successive level is stacked
658661
elif all(isinstance(lev, int) for lev in level):
@@ -665,7 +668,7 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True):
665668

666669
while level:
667670
lev = level.pop(0)
668-
result = stack(result, lev, dropna=dropna)
671+
result = stack(result, lev, dropna=dropna, sort=sort)
669672
# Decrement all level numbers greater than current, as these
670673
# have now shifted down by one
671674
level = [v if v <= lev else v - 1 for v in level]
@@ -691,14 +694,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
691694

692695
# Remove duplicate tuples in the MultiIndex.
693696
tuples = zip(*levs)
694-
seen = set()
695-
# mypy doesn't like our trickery to get `set.add` to work in a comprehension
696-
# error: "add" of "set" does not return a value
697-
unique_tuples = (
698-
key
699-
for key in tuples
700-
if not (key in seen or seen.add(key)) # type: ignore[func-returns-value]
701-
)
697+
unique_tuples = (key for key, _ in itertools.groupby(tuples))
702698
new_levs = zip(*unique_tuples)
703699

704700
# The dtype of each level must be explicitly set to avoid inferring the wrong type.
@@ -714,7 +710,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
714710

715711

716712
def _stack_multi_columns(
717-
frame: DataFrame, level_num: int = -1, dropna: bool = True
713+
frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
718714
) -> DataFrame:
719715
def _convert_level_number(level_num: int, columns: Index):
720716
"""
@@ -744,22 +740,31 @@ def _convert_level_number(level_num: int, columns: Index):
744740
roll_columns = roll_columns.swaplevel(lev1, lev2)
745741
this.columns = mi_cols = roll_columns
746742

743+
if not mi_cols._is_lexsorted() and sort:
744+
# Workaround the edge case where 0 is one of the column names,
745+
# which interferes with trying to sort based on the first
746+
# level
747+
level_to_sort = _convert_level_number(0, mi_cols)
748+
this = this.sort_index(level=level_to_sort, axis=1)
749+
mi_cols = this.columns
750+
751+
mi_cols = cast(MultiIndex, mi_cols)
747752
new_columns = _stack_multi_column_index(mi_cols)
748753

749754
# time to ravel the values
750755
new_data = {}
751756
level_vals = mi_cols.levels[-1]
752757
level_codes = unique(mi_cols.codes[-1])
758+
if sort:
759+
level_codes = np.sort(level_codes)
753760
level_vals_nan = level_vals.insert(len(level_vals), None)
754761

755762
level_vals_used = np.take(level_vals_nan, level_codes)
756763
levsize = len(level_codes)
757764
drop_cols = []
758765
for key in new_columns:
759766
try:
760-
with warnings.catch_warnings():
761-
warnings.simplefilter("ignore", PerformanceWarning)
762-
loc = this.columns.get_loc(key)
767+
loc = this.columns.get_loc(key)
763768
except KeyError:
764769
drop_cols.append(key)
765770
continue
@@ -769,12 +774,9 @@ def _convert_level_number(level_num: int, columns: Index):
769774
# but if unsorted can get a boolean
770775
# indexer
771776
if not isinstance(loc, slice):
772-
slice_len = loc.sum()
777+
slice_len = len(loc)
773778
else:
774779
slice_len = loc.stop - loc.start
775-
if loc.step is not None:
776-
# Integer division using ceiling instead of floor
777-
slice_len = -(slice_len // -loc.step)
778780

779781
if slice_len != levsize:
780782
chunk = this.loc[:, this.columns[loc]]

pandas/tests/frame/test_stack_unstack.py

+14-13
Original file line number | Diff line number | Diff line change
@@ -1099,18 +1099,18 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):
10991099
"labels,data",
11001100
[
11011101
(list("xyz"), [10, 11, 12, 13, 14, 15]),
1102-
(list("zyx"), [10, 11, 12, 13, 14, 15]),
1102+
(list("zyx"), [14, 15, 12, 13, 10, 11]),
11031103
],
11041104
)
11051105
def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data):
11061106
# GH-36991
11071107
cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
11081108
cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
11091109
midx = MultiIndex.from_product([cidx, cidx2])
1110-
df = DataFrame([data], columns=midx)
1110+
df = DataFrame([sorted(data)], columns=midx)
11111111
result = df.stack([0, 1])
11121112

1113-
s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
1113+
s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered)
11141114
expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2]))
11151115

11161116
tm.assert_series_equal(result, expected)
@@ -1400,16 +1400,16 @@ def test_unstack_non_slice_like_blocks(using_array_manager):
14001400
tm.assert_frame_equal(res, expected)
14011401

14021402

1403-
def test_stack_nosort():
1404-
# GH 15105, GH 53825
1403+
def test_stack_sort_false():
1404+
# GH 15105
14051405
data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]]
14061406
df = DataFrame(
14071407
data,
14081408
columns=MultiIndex(
14091409
levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
14101410
),
14111411
)
1412-
result = df.stack(level=0)
1412+
result = df.stack(level=0, sort=False)
14131413
expected = DataFrame(
14141414
{"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]},
14151415
index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]),
@@ -1421,15 +1421,15 @@ def test_stack_nosort():
14211421
data,
14221422
columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]),
14231423
)
1424-
result = df.stack(level=0)
1424+
result = df.stack(level=0, sort=False)
14251425
tm.assert_frame_equal(result, expected)
14261426

14271427

1428-
def test_stack_nosort_multi_level():
1429-
# GH 15105, GH 53825
1428+
def test_stack_sort_false_multi_level():
1429+
# GH 15105
14301430
idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
14311431
df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx)
1432-
result = df.stack([0, 1])
1432+
result = df.stack([0, 1], sort=False)
14331433
expected_index = MultiIndex.from_tuples(
14341434
[
14351435
("cat", "weight", "kg"),
@@ -1999,12 +1999,13 @@ def __init__(self, *args, **kwargs) -> None:
19991999
),
20002000
)
20012001
@pytest.mark.parametrize("stack_lev", range(2))
2002-
def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
2002+
@pytest.mark.parametrize("sort", [True, False])
2003+
def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
20032004
# GH#16323
20042005
# deep check for 1-row case
20052006
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20062007
df = DataFrame(columns=columns, data=[range(4)])
2007-
df_stacked = df.stack(stack_lev)
2008+
df_stacked = df.stack(stack_lev, sort=sort)
20082009
for row in df.index:
20092010
for col in df.columns:
20102011
expected = df.loc[row, col]
@@ -2036,7 +2037,7 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self):
20362037
stack_lev = 1
20372038
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20382039
df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
2039-
result = df.stack(stack_lev)
2040+
result = df.stack(stack_lev, sort=True)
20402041
expected_index = MultiIndex(
20412042
levels=[[0, 1, 2, 3], [0, 1]],
20422043
codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],

0 commit comments

Comments
 (0)