Skip to content

Commit 5307062

Browse files
authored
BUG: DataFrame.stack sometimes sorting the resulting index (#53825)
* BUG: DataFrame.stack with sort=True and unsorted MultiIndex levels
* revert ignoring warnings
* BUG: DataFrame.stack sorting columns
* whatsnew
* Docstring fixup
* Merge cleanup
* WIP
* BUG: DataFrame.stack sometimes sorting the resulting index
* mypy fixups
* Remove sort argument from DataFrame.stack
1 parent 09a1db3 commit 5307062

File tree

4 files changed

+48
-53
lines changed

4 files changed

+48
-53
lines changed

doc/source/whatsnew/v2.1.0.rst

+2 -2
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,6 @@ Other enhancements
104104
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
105105
- Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
106106
- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
107-
- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
108107
- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
109108
- :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`)
110109
- :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`)
@@ -503,7 +502,8 @@ Reshaping
503502
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
504503
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
505504
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
506-
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
505+
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically in rare cases (:issue:`53786`)
506+
- Bug in :meth:`DataFrame.stack` sorting index lexicographically in rare cases (:issue:`53824`)
507507
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
508508
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
509509
-

pandas/core/frame.py

+11 -13
Original file line number | Diff line number | Diff line change
@@ -9017,7 +9017,7 @@ def pivot_table(
90179017
sort=sort,
90189018
)
90199019

9020-
def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
9020+
def stack(self, level: IndexLabel = -1, dropna: bool = True):
90219021
"""
90229022
Stack the prescribed level(s) from columns to index.
90239023
@@ -9043,8 +9043,6 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
90439043
axis can create combinations of index and column values
90449044
that are missing from the original dataframe. See Examples
90459045
section.
9046-
sort : bool, default True
9047-
Whether to sort the levels of the resulting MultiIndex.
90489046
90499047
Returns
90509048
-------
@@ -9144,15 +9142,15 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
91449142
91459143
>>> df_multi_level_cols2.stack(0)
91469144
kg m
9147-
cat height NaN 2.0
9148-
weight 1.0 NaN
9149-
dog height NaN 4.0
9150-
weight 3.0 NaN
9145+
cat weight 1.0 NaN
9146+
height NaN 2.0
9147+
dog weight 3.0 NaN
9148+
height NaN 4.0
91519149
>>> df_multi_level_cols2.stack([0, 1])
9152-
cat height m 2.0
9153-
weight kg 1.0
9154-
dog height m 4.0
9155-
weight kg 3.0
9150+
cat weight kg 1.0
9151+
height m 2.0
9152+
dog weight kg 3.0
9153+
height m 4.0
91569154
dtype: float64
91579155
91589156
**Dropping missing values**
@@ -9188,9 +9186,9 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
91889186
)
91899187

91909188
if isinstance(level, (tuple, list)):
9191-
result = stack_multiple(self, level, dropna=dropna, sort=sort)
9189+
result = stack_multiple(self, level, dropna=dropna)
91929190
else:
9193-
result = stack(self, level, dropna=dropna, sort=sort)
9191+
result = stack(self, level, dropna=dropna)
91949192

91959193
return result.__finalize__(self, method="stack")
91969194

pandas/core/reshape/reshape.py

+22 -24
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import itertools
43
from typing import (
54
TYPE_CHECKING,
65
cast,
@@ -499,7 +498,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True):
499498
if isinstance(obj.index, MultiIndex):
500499
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
501500
else:
502-
return obj.T.stack(dropna=False, sort=sort)
501+
return obj.T.stack(dropna=False)
503502
elif not isinstance(obj.index, MultiIndex):
504503
# GH 36113
505504
# Give nicer error messages when unstack a Series whose
@@ -572,7 +571,7 @@ def _unstack_extension_series(
572571
return result
573572

574573

575-
def stack(frame: DataFrame, level=-1, dropna: bool = True, sort: bool = True):
574+
def stack(frame: DataFrame, level=-1, dropna: bool = True):
576575
"""
577576
Convert DataFrame to Series with multi-level Index. Columns become the
578577
second level of the resulting hierarchical index
@@ -594,9 +593,7 @@ def factorize(index):
594593
level_num = frame.columns._get_level_number(level)
595594

596595
if isinstance(frame.columns, MultiIndex):
597-
return _stack_multi_columns(
598-
frame, level_num=level_num, dropna=dropna, sort=sort
599-
)
596+
return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
600597
elif isinstance(frame.index, MultiIndex):
601598
new_levels = list(frame.index.levels)
602599
new_codes = [lab.repeat(K) for lab in frame.index.codes]
@@ -649,13 +646,13 @@ def factorize(index):
649646
return frame._constructor_sliced(new_values, index=new_index)
650647

651648

652-
def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = True):
649+
def stack_multiple(frame: DataFrame, level, dropna: bool = True):
653650
# If all passed levels match up to column names, no
654651
# ambiguity about what to do
655652
if all(lev in frame.columns.names for lev in level):
656653
result = frame
657654
for lev in level:
658-
result = stack(result, lev, dropna=dropna, sort=sort)
655+
result = stack(result, lev, dropna=dropna)
659656

660657
# Otherwise, level numbers may change as each successive level is stacked
661658
elif all(isinstance(lev, int) for lev in level):
@@ -668,7 +665,7 @@ def stack_multiple(frame: DataFrame, level, dropna: bool = True, sort: bool = Tr
668665

669666
while level:
670667
lev = level.pop(0)
671-
result = stack(result, lev, dropna=dropna, sort=sort)
668+
result = stack(result, lev, dropna=dropna)
672669
# Decrement all level numbers greater than current, as these
673670
# have now shifted down by one
674671
level = [v if v <= lev else v - 1 for v in level]
@@ -694,7 +691,14 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
694691

695692
# Remove duplicate tuples in the MultiIndex.
696693
tuples = zip(*levs)
697-
unique_tuples = (key for key, _ in itertools.groupby(tuples))
694+
seen = set()
695+
# mypy doesn't like our trickery to get `set.add` to work in a comprehension
696+
# error: "add" of "set" does not return a value
697+
unique_tuples = (
698+
key
699+
for key in tuples
700+
if not (key in seen or seen.add(key)) # type: ignore[func-returns-value]
701+
)
698702
new_levs = zip(*unique_tuples)
699703

700704
# The dtype of each level must be explicitly set to avoid inferring the wrong type.
@@ -710,7 +714,7 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
710714

711715

712716
def _stack_multi_columns(
713-
frame: DataFrame, level_num: int = -1, dropna: bool = True, sort: bool = True
717+
frame: DataFrame, level_num: int = -1, dropna: bool = True
714718
) -> DataFrame:
715719
def _convert_level_number(level_num: int, columns: Index):
716720
"""
@@ -740,31 +744,22 @@ def _convert_level_number(level_num: int, columns: Index):
740744
roll_columns = roll_columns.swaplevel(lev1, lev2)
741745
this.columns = mi_cols = roll_columns
742746

743-
if not mi_cols._is_lexsorted() and sort:
744-
# Workaround the edge case where 0 is one of the column names,
745-
# which interferes with trying to sort based on the first
746-
# level
747-
level_to_sort = _convert_level_number(0, mi_cols)
748-
this = this.sort_index(level=level_to_sort, axis=1)
749-
mi_cols = this.columns
750-
751-
mi_cols = cast(MultiIndex, mi_cols)
752747
new_columns = _stack_multi_column_index(mi_cols)
753748

754749
# time to ravel the values
755750
new_data = {}
756751
level_vals = mi_cols.levels[-1]
757752
level_codes = unique(mi_cols.codes[-1])
758-
if sort:
759-
level_codes = np.sort(level_codes)
760753
level_vals_nan = level_vals.insert(len(level_vals), None)
761754

762755
level_vals_used = np.take(level_vals_nan, level_codes)
763756
levsize = len(level_codes)
764757
drop_cols = []
765758
for key in new_columns:
766759
try:
767-
loc = this.columns.get_loc(key)
760+
with warnings.catch_warnings():
761+
warnings.simplefilter("ignore", PerformanceWarning)
762+
loc = this.columns.get_loc(key)
768763
except KeyError:
769764
drop_cols.append(key)
770765
continue
@@ -774,9 +769,12 @@ def _convert_level_number(level_num: int, columns: Index):
774769
# but if unsorted can get a boolean
775770
# indexer
776771
if not isinstance(loc, slice):
777-
slice_len = len(loc)
772+
slice_len = loc.sum()
778773
else:
779774
slice_len = loc.stop - loc.start
775+
if loc.step is not None:
776+
# Integer division using ceiling instead of floor
777+
slice_len = -(slice_len // -loc.step)
780778

781779
if slice_len != levsize:
782780
chunk = this.loc[:, this.columns[loc]]

pandas/tests/frame/test_stack_unstack.py

+13 -14
Original file line number | Diff line number | Diff line change
@@ -1099,18 +1099,18 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):
10991099
"labels,data",
11001100
[
11011101
(list("xyz"), [10, 11, 12, 13, 14, 15]),
1102-
(list("zyx"), [14, 15, 12, 13, 10, 11]),
1102+
(list("zyx"), [10, 11, 12, 13, 14, 15]),
11031103
],
11041104
)
11051105
def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data):
11061106
# GH-36991
11071107
cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
11081108
cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
11091109
midx = MultiIndex.from_product([cidx, cidx2])
1110-
df = DataFrame([sorted(data)], columns=midx)
1110+
df = DataFrame([data], columns=midx)
11111111
result = df.stack([0, 1])
11121112

1113-
s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered)
1113+
s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
11141114
expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2]))
11151115

11161116
tm.assert_series_equal(result, expected)
@@ -1400,16 +1400,16 @@ def test_unstack_non_slice_like_blocks(using_array_manager):
14001400
tm.assert_frame_equal(res, expected)
14011401

14021402

1403-
def test_stack_sort_false():
1404-
# GH 15105
1403+
def test_stack_nosort():
1404+
# GH 15105, GH 53825
14051405
data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]]
14061406
df = DataFrame(
14071407
data,
14081408
columns=MultiIndex(
14091409
levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
14101410
),
14111411
)
1412-
result = df.stack(level=0, sort=False)
1412+
result = df.stack(level=0)
14131413
expected = DataFrame(
14141414
{"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]},
14151415
index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]),
@@ -1421,15 +1421,15 @@ def test_stack_sort_false():
14211421
data,
14221422
columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]),
14231423
)
1424-
result = df.stack(level=0, sort=False)
1424+
result = df.stack(level=0)
14251425
tm.assert_frame_equal(result, expected)
14261426

14271427

1428-
def test_stack_sort_false_multi_level():
1429-
# GH 15105
1428+
def test_stack_nosort_multi_level():
1429+
# GH 15105, GH 53825
14301430
idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
14311431
df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx)
1432-
result = df.stack([0, 1], sort=False)
1432+
result = df.stack([0, 1])
14331433
expected_index = MultiIndex.from_tuples(
14341434
[
14351435
("cat", "weight", "kg"),
@@ -1999,13 +1999,12 @@ def __init__(self, *args, **kwargs) -> None:
19991999
),
20002000
)
20012001
@pytest.mark.parametrize("stack_lev", range(2))
2002-
@pytest.mark.parametrize("sort", [True, False])
2003-
def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
2002+
def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
20042003
# GH#16323
20052004
# deep check for 1-row case
20062005
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20072006
df = DataFrame(columns=columns, data=[range(4)])
2008-
df_stacked = df.stack(stack_lev, sort=sort)
2007+
df_stacked = df.stack(stack_lev)
20092008
for row in df.index:
20102009
for col in df.columns:
20112010
expected = df.loc[row, col]
@@ -2037,7 +2036,7 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self):
20372036
stack_lev = 1
20382037
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
20392038
df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
2040-
result = df.stack(stack_lev, sort=True)
2039+
result = df.stack(stack_lev)
20412040
expected_index = MultiIndex(
20422041
levels=[[0, 1, 2, 3], [0, 1]],
20432042
codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],

0 commit comments

Comments
 (0)