diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4088736dd4150..dc8ca5ff5d367 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9158,15 +9158,22 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg 2.0 NaN m NaN 3.0 """ - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) + from pandas.core.reshape.reshape import stack_v2 - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) - else: - result = stack(self, level, dropna=dropna, sort=sort) + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." + ) + + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v2(self, level, dropna=dropna, sort=sort) return result.__finalize__(self, method="stack") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5181623c0c327..6ccd8b9c850cd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2386,6 +2386,10 @@ def reorder_levels(self, order) -> MultiIndex: names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] + result = self._reorder_ilevels(order) + return result + + def _reorder_ilevels(self, order) -> MultiIndex: if len(order) != self.nlevels: raise AssertionError( f"Length of order must be same as number of levels ({self.nlevels}), " diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6b452f7cdaecf..fe654eb1acb8e 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,14 +28,19 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.algorithms import unique +from pandas.core.algorithms import ( + factorize, + unique, +) from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, + RangeIndex, ) +from pandas.core.reshape.concat import concat from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -876,3 +881,110 @@ def _reorder_for_extension_array_stack( # c0r1, c1r1, c2r1, ...] idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() return arr.take(idx) + + +def stack_v2(frame, level: list[int], dropna: bool = True, sort: bool = True): + if frame.columns.nunique() != len(frame.columns): + raise ValueError("Columns with duplicate values are not supported in stack") + + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level)[::-1] + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] + ) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if not isinstance(idx, tuple): + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data = data.rename(0) + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + if len(buf) > 0: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns) + ratio = 0 + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = np.tile(frame.index.codes, (1, ratio)) + else: + index_levels = [frame.index.unique()] + codes = factorize(frame.index)[0] + index_codes = np.tile(codes, (1, ratio)) + index_codes = list(index_codes) + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + if dropna: + result = result.dropna(how="all") + + return result diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index a48728a778877..cb968ac148595 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1080,10 +1080,9 @@ def test_stack_full_multiIndex(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) - def test_stack_preserve_categorical_dtype(self, ordered, labels): + def test_stack_preserve_categorical_dtype(self, ordered): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() @@ -1110,8 +1109,10 @@ def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): df = DataFrame([sorted(data)], columns=midx) result = df.stack([0, 1]) - s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) - expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) + s_cidx = pd.CategoricalIndex(labels, ordered=ordered) + expected = Series( + sorted(data), index=MultiIndex.from_product([[0], s_cidx, cidx2]) + ) tm.assert_series_equal(result, expected) @@ -1311,7 +1312,7 @@ def test_stack_timezone_aware_values(): def test_stack_empty_frame(dropna): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] - expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) + expected = Series(index=MultiIndex(levels=levels, codes=[[], []])) result = DataFrame(dtype=np.float64).stack(dropna=dropna) tm.assert_series_equal(result, expected) @@ -1568,17 +1569,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): @pytest.mark.parametrize( "idx, columns, exp_idx", [ - [ - list("abab"), - ["1st", "2nd", "3rd"], - MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.tile(np.arange(3), 4), - ], - ), - ], [ list("abab"), ["1st", "2nd", "1st"], @@ -1608,12 +1598,9 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx): index=idx, columns=columns, ) - result = df.stack() - expected = Series(np.arange(12), index=exp_idx) - tm.assert_series_equal(result, expected) - assert result.index.is_unique is False - li, ri = result.index, expected.index - tm.assert_index_equal(li, ri) + msg = "Columns with duplicate values are not supported in stack" + with pytest.raises(ValueError, match=msg): + df.stack() def test_unstack_odd_failure(self): data = """day,time,smoker,sum,len @@ -2199,8 +2186,14 @@ def test_stack_nan_in_multiindex_columns(self): result = df.stack(2) expected = DataFrame( [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], - index=Index([(0, None), (0, 0), (0, 1)]), - columns=Index([(0, None), (0, 2), (0, 3)]), + index=MultiIndex( + levels=[[0], [0.0, 1.0]], + codes=[[0, 0, 0], [-1, 0, 1]], + ), + columns=MultiIndex( + levels=[[0], [2, 3]], + codes=[[0, 0, 0], [-1, 0, 1]], + ), ) tm.assert_frame_equal(result, expected) @@ -2218,20 +2211,20 @@ def test_multi_level_stack_categorical(self): expected = DataFrame( [ [0, np.nan], - [np.nan, 2], [1, np.nan], + [np.nan, 2], [np.nan, 3], [4, np.nan], - [np.nan, 6], [5, np.nan], + [np.nan, 6], [np.nan, 7], ], columns=["A", "B"], index=MultiIndex.from_arrays( [ [0] * 4 + [1] * 4, - pd.Categorical(list("aabbaabb")), - pd.Categorical(list("cdcdcdcd")), + pd.Categorical(list("abababab")), + pd.Categorical(list("ccddccdd")), ] ), ) @@ -2251,8 +2244,10 @@ def test_stack_nan_level(self): expected = DataFrame( [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], columns=Index(["A", "B"], name="Upper"), - index=MultiIndex.from_tuples( - [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] + index=MultiIndex( + levels=[[0, 1], [np.nan, "b"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["Num", "Lower"], ), ) tm.assert_frame_equal(result, expected)