From 848081ce5be6a5fc5b16b358df0ba95fabf9fde9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Jun 2023 18:37:40 -0400 Subject: [PATCH 01/12] WIP --- pandas/core/frame.py | 29 +++++++- pandas/core/indexes/multi.py | 14 ++++ pandas/core/reshape/reshape.py | 87 ++++++++++++++++++++++++ pandas/tests/frame/test_stack_unstack.py | 4 +- 4 files changed, 129 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 671924c5e9607..cd4280ec0fe7f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9154,14 +9154,37 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): from pandas.core.reshape.reshape import ( stack, stack_multiple, + new_stack, ) if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) + old_result = stack_multiple(self, level, dropna=dropna, sort=sort) else: - result = stack(self, level, dropna=dropna, sort=sort) + old_result = stack(self, level, dropna=dropna, sort=sort) + + new_level = level + if not isinstance(new_level, (tuple, list)): + new_level = [new_level] + new_level = [self.columns._get_level_number(lev) for lev in new_level] + result = new_stack(self, new_level) + if result.ndim == 2 and len(result.columns) == 1 and len(new_level) == self.columns.nlevels: + result = result.iloc[:, 0] + if result.ndim == 1: + result = result.rename(None) + if sort: + if isinstance(self.columns, MultiIndex) and not self.columns._is_lexsorted(): + result = result.sort_index() + if result.ndim == 2: + # TODO: Hack! Should we be sorting the columns? + try: + result = result[sorted(result.columns)] + except Exception: + pass + + import pandas._testing as tm + tm.assert_equal(result.sort_index(), old_result.sort_index(), check_dtype=False) - return result.__finalize__(self, method="stack") + return old_result.__finalize__(self, method="stack") def explode( self, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a1c240f72a28b..c5d8edd065a5f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2382,6 +2382,20 @@ def reorder_levels(self, order) -> MultiIndex: levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) + def _reorder_ilevels(self, order) -> MultiIndex: + if len(order) != self.nlevels: + raise AssertionError( + f"Length of order must be same as number of levels ({self.nlevels}), " + f"got {len(order)}" + ) + new_levels = [self.levels[i] for i in order] + new_codes = [self.codes[i] for i in order] + new_names = [self.names[i] for i in order] + + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) + def _get_codes_for_sorting(self) -> list[Categorical]: """ we are categorizing our codes by using the diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 3866d30e9c757..5e266ad92dfee 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -871,3 +871,90 @@ def _reorder_for_extension_array_stack( # c0r1, c1r1, c2r1, ...] idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() return arr.take(idx) + + +def new_stack(df, levels): + stack_cols = ( + df.columns + ._drop_level_numbers([k for k in range(df.columns.nlevels) if k not in levels][::-1]) + ) + _, taker = np.unique(levels, return_inverse=True) + if len(levels) > 1: + # Arrange columns in the order we want to take them + ordered_stack_cols = stack_cols._reorder_ilevels(taker) + else: + ordered_stack_cols = stack_cols + + buf = [] + for idx in stack_cols.unique(): + if not isinstance(idx, tuple): + idx = (idx,) + # Take the data from df corresponding to this idx value + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in levels else slice(None) + for k in range(df.columns.nlevels) + ) + if len(df.columns) == 1: + # TODO: Are there other cases that cause issues; e.g. one column? + data = df + else: + data = df.loc[:, column_indexer] + + # When len(levels) == df.columns.nlevels, we're stacking all columns + # and end up with a Series + if len(levels) < df.columns.nlevels: + levnums = sorted(levels)[::-1] + data.columns = data.columns._drop_level_numbers(levnums) + elif stack_cols.nlevels == 1: + from pandas.core.indexes.range import RangeIndex + data.columns = RangeIndex(0, 1) + + buf.append(data) + + from pandas.core.reshape.concat import concat + result = concat(buf) + + # Construct the correct MultiIndex by combining the input's index and + # stacked columns. + if isinstance(df.index, MultiIndex): + index_levels = [level.unique() for level in df.index.levels] + else: + index_levels = [ + df.index.get_level_values(k).values for k in range(df.index.nlevels) + ] + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + else: + column_levels = [ + ordered_stack_cols.get_level_values(e).unique() + for e in range(ordered_stack_cols.nlevels) + ] + if isinstance(df.index, MultiIndex): + index_codes = np.tile(df.index.codes, (1, len(result) // len(df))) + else: + index_codes = np.tile(np.arange(len(df)), (1, len(result) // len(df))) + index_codes = [e for e in index_codes] + if isinstance(stack_cols, MultiIndex): + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_codes = [np.arange(stack_cols.nunique())] + column_codes = [np.repeat(codes, len(df)) for codes in column_codes] + index_names = df.index.names + column_names = list(ordered_stack_cols.names) + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=index_names + column_names, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(df) + n_uniques = len(ordered_stack_cols.unique()) + idxs = ( + np.tile(len_df * np.arange(n_uniques), len_df) + + np.repeat(np.arange(len_df), n_uniques) + ) + result = result.take(idxs) + + return result \ No newline at end of file diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2818df721db34..7513ffad796a6 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1037,7 +1037,7 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) - result = df.stack(level=level, dropna=False) + result = df.stack(level=level, dropna=True) if isinstance(level, int): # Stacking a single level should not make any all-NaN rows, @@ -1052,7 +1052,7 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): df.columns = MultiIndex.from_tuples( df.columns.to_numpy(), names=df.columns.names ) - expected = df.stack(level=level, dropna=False) + expected = df.stack(level=level, dropna=True) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: From fa46971f219224f82af68e18f20453682130622b Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 13 Jun 2023 17:55:45 -0400 Subject: [PATCH 02/12] WIP --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cd4280ec0fe7f..1fdd7731a0ec9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9167,7 +9167,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): new_level = [new_level] new_level = [self.columns._get_level_number(lev) for lev in new_level] result = new_stack(self, new_level) - if result.ndim == 2 and len(result.columns) == 1 and len(new_level) == self.columns.nlevels: + if result.ndim == 2 and len(result.columns) == 1 and len(new_level) == self.columns.nlevels and not isinstance(level, (tuple, list)) and self.columns.nlevels > 1: result = result.iloc[:, 0] if result.ndim == 1: result = result.rename(None) @@ -9182,7 +9182,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): pass import pandas._testing as tm - tm.assert_equal(result.sort_index(), old_result.sort_index(), check_dtype=False) + # tm.assert_equal(result.sort_index(), old_result.sort_index(), check_dtype=False) return old_result.__finalize__(self, method="stack") From bcf9eee8cbbc3cf0f396b8013722321bfeb741b5 Mon Sep 17 00:00:00 2001 From: richard Date: Sat, 17 Jun 2023 11:10:55 -0400 Subject: [PATCH 03/12] FWV --- pandas/core/frame.py | 37 ++++---- pandas/core/reshape/reshape.py | 61 +++++++++----- pandas/tests/frame/test_stack_unstack.py | 102 ++++++++++++++++------- pandas/tests/groupby/test_groupby.py | 1 + 4 files changed, 132 insertions(+), 69 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a40d37ec5be05..ed73789508398 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9152,39 +9152,40 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): m 3.0 NaN """ from pandas.core.reshape.reshape import ( + new_stack, stack, stack_multiple, - new_stack, ) if isinstance(level, (tuple, list)): - old_result = stack_multiple(self, level, dropna=dropna, sort=sort) + stack_multiple(self, level, dropna=dropna, sort=sort) else: - old_result = stack(self, level, dropna=dropna, sort=sort) + stack(self, level, dropna=dropna, sort=sort) new_level = level if not isinstance(new_level, (tuple, list)): new_level = [new_level] new_level = [self.columns._get_level_number(lev) for lev in new_level] - result = new_stack(self, new_level) - if result.ndim == 2 and len(result.columns) == 1 and len(new_level) == self.columns.nlevels and not isinstance(level, (tuple, list)) and self.columns.nlevels > 1: + result = new_stack(self, new_level, sort=sort) + if ( + result.ndim == 2 + and ( + ( + isinstance(self.columns, MultiIndex) + and self.columns.nlevels == len(new_level) + ) + or (not isinstance(self.columns, MultiIndex)) + ) + and not result.empty + and len(result.columns) == 1 + ): result = result.iloc[:, 0] if result.ndim == 1: result = result.rename(None) - if sort: - if isinstance(self.columns, MultiIndex) and not self.columns._is_lexsorted(): - result = result.sort_index() - if result.ndim == 2: - # TODO: Hack! Should we be sorting the columns? - try: - result = result[sorted(result.columns)] - except Exception: - pass - - import pandas._testing as tm - # tm.assert_equal(result.sort_index(), old_result.sort_index(), check_dtype=False) + if dropna: + result = result.dropna(how="all") - return old_result.__finalize__(self, method="stack") + return result.__finalize__(self, method="stack") def explode( self, diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e7ff0bc8378c1..ecf9f27ff970a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,13 +28,17 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.algorithms import unique +from pandas.core.algorithms import ( + factorize, + unique, +) from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, + RangeIndex, ) from pandas.core.series import Series from pandas.core.sorting import ( @@ -882,10 +886,9 @@ def _reorder_for_extension_array_stack( return arr.take(idx) -def new_stack(df, levels): - stack_cols = ( - df.columns - ._drop_level_numbers([k for k in range(df.columns.nlevels) if k not in levels][::-1]) +def new_stack(df, levels, sort: bool = True): + stack_cols = df.columns._drop_level_numbers( + [k for k in range(df.columns.nlevels) if k not in levels][::-1] ) _, taker = np.unique(levels, return_inverse=True) if len(levels) > 1: @@ -901,14 +904,13 @@ def new_stack(df, levels): # Take the data from df corresponding to this idx value gen = iter(idx) column_indexer = tuple( - next(gen) if k in levels else slice(None) - for k in range(df.columns.nlevels) + next(gen) if k in levels else slice(None) for k in range(df.columns.nlevels) ) if len(df.columns) == 1: # TODO: Are there other cases that cause issues; e.g. one column? - data = df + data = df.copy() else: - data = df.loc[:, column_indexer] + data = df.loc[:, column_indexer].copy() # When len(levels) == df.columns.nlevels, we're stacking all columns # and end up with a Series @@ -916,22 +918,28 @@ def new_stack(df, levels): levnums = sorted(levels)[::-1] data.columns = data.columns._drop_level_numbers(levnums) elif stack_cols.nlevels == 1: - from pandas.core.indexes.range import RangeIndex - data.columns = RangeIndex(0, 1) + if data.ndim == 2: + data.columns = RangeIndex(len(data.columns)) + else: + data = data.rename(0) buf.append(data) + if len(buf) == 0: + return df + from pandas.core.reshape.concat import concat + result = concat(buf) + if len(levels) < df.columns.nlevels: + result = result[df.columns._drop_level_numbers(sorted(levels)[::-1]).unique()] # Construct the correct MultiIndex by combining the input's index and # stacked columns. if isinstance(df.index, MultiIndex): index_levels = [level.unique() for level in df.index.levels] else: - index_levels = [ - df.index.get_level_values(k).values for k in range(df.index.nlevels) - ] + index_levels = [df.index.unique()] if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels else: @@ -941,13 +949,21 @@ def new_stack(df, levels): ] if isinstance(df.index, MultiIndex): index_codes = np.tile(df.index.codes, (1, len(result) // len(df))) - else: + elif isinstance(df.index, RangeIndex): index_codes = np.tile(np.arange(len(df)), (1, len(result) // len(df))) - index_codes = [e for e in index_codes] + else: + codes = factorize(df.index)[0] + index_codes = np.tile(codes, (1, len(result) // len(df))) + index_codes = list(index_codes) if isinstance(stack_cols, MultiIndex): column_codes = ordered_stack_cols.drop_duplicates().codes + elif isinstance(stack_cols, RangeIndex): + column_codes = [np.arange(len(stack_cols))] else: - column_codes = [np.arange(stack_cols.nunique())] + # TODO: use_na_sentinel? + column_codes = [ + factorize(ordered_stack_cols.unique(), use_na_sentinel=False)[0] + ] column_codes = [np.repeat(codes, len(df)) for codes in column_codes] index_names = df.index.names column_names = list(ordered_stack_cols.names) @@ -960,10 +976,11 @@ def new_stack(df, levels): # sort result, but faster than calling sort_index since we know the order we need len_df = len(df) n_uniques = len(ordered_stack_cols.unique()) - idxs = ( - np.tile(len_df * np.arange(n_uniques), len_df) - + np.repeat(np.arange(len_df), n_uniques) - ) + if sort: + indexer = np.argsort(stack_cols.unique().to_numpy(), kind="stable") + else: + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) result = result.take(idxs) - return result \ No newline at end of file + return result diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 07a39d48e3b3c..b8bed2fe2e4c9 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -46,9 +46,12 @@ def test_stack_mixed_level(self): # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) - result = df.stack() - expected = Series(1, index=MultiIndex.from_product(levels[:2])) - tm.assert_series_equal(result, expected) + with pytest.raises( + TypeError, match="'<' not supported between instances of 'int' and 'str'" + ): + result = df.stack() + # expected = Series(1, index=MultiIndex.from_product(levels[:2])) + # tm.assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) @@ -515,10 +518,10 @@ def test_unstack_level_binding(self): expected = DataFrame( np.array( - [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64 ), index=expected_mi, - columns=Index(["a", "b"], name="third"), + columns=Index(["b", "a"], name="third"), ) tm.assert_frame_equal(result, expected) @@ -1088,10 +1091,16 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. - midx = MultiIndex.from_product([df.index, cidx]) - expected = Series([10, 11, 12], index=midx) - tm.assert_series_equal(result, expected) + if labels == list("yxz"): + midx = MultiIndex.from_product([df.index, cidx.sort_values()]) + expected = Series([11, 10, 12], index=midx) + else: + # Don't include the 2nd y + midx = MultiIndex.from_product([df.index, cidx[:2].sort_values()]) + expected = DataFrame({0: [11, 10], 1: [np.nan, 12.0]}, index=midx) + + tm.assert_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize( @@ -1537,7 +1546,8 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): unstacked = ymd.unstack() unstacked = unstacked.sort_index(axis=1, ascending=False) restacked = unstacked.stack() - tm.assert_frame_equal(restacked, ymd) + expected = ymd.sort_index(axis=1, ascending=False) + tm.assert_frame_equal(restacked, expected) # more than 2 levels in the columns unstacked = ymd.unstack(1).unstack(1) @@ -1609,11 +1619,36 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx): columns=columns, ) result = df.stack() - expected = Series(np.arange(12), index=exp_idx) - tm.assert_series_equal(result, expected) - assert result.index.is_unique is False - li, ri = result.index, expected.index - tm.assert_index_equal(li, ri) + if columns == ["1st", "2nd", "1st"] and isinstance(idx, MultiIndex): + expected = DataFrame( + { + 0: [0, 1, 3, 4, 6, 7, 9, 10], + 1: [2.0, np.nan, 5.0, np.nan, 8.0, np.nan, 11.0, np.nan], + }, + index=MultiIndex( + levels=[["a", "b"], [1, 2], ["1st", "2nd"]], + codes=[ + [0, 0, 1, 1, 0, 0, 1, 1], + [1, 1, 0, 0, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + ), + ) + elif columns == ["1st", "2nd", "1st"]: + expected = DataFrame( + { + 0: [0, 1, 3, 4, 6, 7, 9, 10], + 1: [2.0, np.nan, 5.0, np.nan, 8.0, np.nan, 11.0, np.nan], + }, + index=MultiIndex( + levels=[["a", "b"], ["1st", "2nd"]], + codes=[[0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]], + ), + ) + else: + expected = Series(np.arange(12), index=exp_idx) + + tm.assert_equal(result, expected) def test_unstack_odd_failure(self): data = """day,time,smoker,sum,len @@ -2039,12 +2074,12 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self): result = df.stack(stack_lev, sort=True) expected_index = MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], - codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]], + codes=[[1, 1, 0, 0, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]], ) expected = DataFrame( { - 0: [0, 1, 0, 1, 0, 1, 0, 1], - 1: [2, 3, 2, 3, 2, 3, 2, 3], + 0: [1, 0, 1, 0, 1, 0, 1, 0], + 1: [3, 2, 3, 2, 3, 2, 3, 2], }, index=expected_index, ) @@ -2197,9 +2232,15 @@ def test_stack_nan_in_multiindex_columns(self): ) result = df.stack(2) expected = DataFrame( - [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], - index=Index([(0, None), (0, 0), (0, 1)]), - columns=Index([(0, None), (0, 2), (0, 3)]), + [[np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0], [0.0, np.nan, np.nan]], + index=MultiIndex( + levels=[[0], [0.0, 1.0, np.nan]], + codes=[[0, 0, 0], [0, 1, 2]], + ), + columns=MultiIndex( + levels=[[0], [np.nan, 2.0, 3.0]], + codes=[[0, 0, 0], [0, 1, 2]], + ), ) tm.assert_frame_equal(result, expected) @@ -2246,15 +2287,18 @@ def test_stack_nan_level(self): index=Index([0, 1], name="Num"), dtype=np.float64, ) - result = df_nan.stack() - expected = DataFrame( - [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], - columns=Index(["A", "B"], name="Upper"), - index=MultiIndex.from_tuples( - [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] - ), - ) - tm.assert_frame_equal(result, expected) + with pytest.raises( + TypeError, match="'<' not supported between instances of 'float' and 'str'" + ): + df_nan.stack() + # expected = DataFrame( + # [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], + # columns=Index(["A", "B"], name="Upper"), + # index=MultiIndex.from_tuples( + # [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] + # ), + # ) + # tm.assert_frame_equal(result, expected) def test_unstack_categorical_columns(self): # GH 14018 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bf0b646847ed6..cacc4a6c6b5a6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -279,6 +279,7 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): expected.index = Index(range(2)) tm.assert_frame_equal(res, expected) + assert False def test_len(): From 07d46830db4dc43be3cc2c16cb4ea4966ca52b00 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 18 Jun 2023 16:07:06 -0400 Subject: [PATCH 04/12] Refinements --- pandas/core/frame.py | 28 +++++----- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/reshape/reshape.py | 66 +++++++++++++++++------- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/groupby/test_categorical.py | 12 +++-- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/groupby/test_groupby.py | 1 - pandas/tests/io/json/test_pandas.py | 2 +- 9 files changed, 76 insertions(+), 41 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ed73789508398..88a8baa38ca05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9151,16 +9151,17 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg NaN 2.0 m 3.0 NaN """ - from pandas.core.reshape.reshape import ( - new_stack, - stack, - stack_multiple, - ) + from pandas.core.reshape.reshape import new_stack - if isinstance(level, (tuple, list)): - stack_multiple(self, level, dropna=dropna, sort=sort) - else: - stack(self, level, dropna=dropna, sort=sort) + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." + ) new_level = level if not isinstance(new_level, (tuple, list)): @@ -9176,10 +9177,13 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): ) or (not isinstance(self.columns, MultiIndex)) ) - and not result.empty - and len(result.columns) == 1 + # and not result.empty + and len(result.columns) <= 1 ): - result = result.iloc[:, 0] + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] if result.ndim == 1: result = result.rename(None) if dropna: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2b1ff05f18d5e..0a3acd2a14f14 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -408,7 +408,7 @@ def _wrap_applied_output( res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(dropna=self.observed) + res_ser = res_df.stack(sort=False, dropna=self.observed) res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e447377db9e55..6f361a6a989a7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3189,7 +3189,7 @@ def describe( if obj.ndim == 1: result = described else: - result = described.unstack() + result = described.unstack(sort=False) return result.to_frame().T.iloc[:0] with com.temp_setattr(self, "as_index", True): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ecf9f27ff970a..7cef167929887 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -887,12 +887,45 @@ def _reorder_for_extension_array_stack( def new_stack(df, levels, sort: bool = True): + if df.empty: + N, K = df.shape + if len(levels) < df.index.nlevels: + # TODO: is this hit by tests? + new_columns = df.columns._drop_level_numbers(levels) + new_levels = list(df.index.levels) + new_codes = [lab.repeat(K) for lab in df.index.codes] + + clev, clab = factorize(df.columns) + new_levels.append(clev) + new_codes.append(np.tile(clab, N).ravel()) + + new_names = list(df.index.names) + new_names.append(df.columns.name) + new_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + else: + new_columns = RangeIndex(0) + levels, (ilab, clab) = zip(*map(factorize, (df.index, df.columns))) + codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[df.index.name, df.columns.name], + verify_integrity=False, + ) + result = DataFrame(index=new_index, columns=new_columns) + return result + stack_cols = df.columns._drop_level_numbers( [k for k in range(df.columns.nlevels) if k not in levels][::-1] ) - _, taker = np.unique(levels, return_inverse=True) if len(levels) > 1: # Arrange columns in the order we want to take them + _, taker = np.unique(levels, return_inverse=True) ordered_stack_cols = stack_cols._reorder_ilevels(taker) else: ordered_stack_cols = stack_cols @@ -912,8 +945,6 @@ def new_stack(df, levels, sort: bool = True): else: data = df.loc[:, column_indexer].copy() - # When len(levels) == df.columns.nlevels, we're stacking all columns - # and end up with a Series if len(levels) < df.columns.nlevels: levnums = sorted(levels)[::-1] data.columns = data.columns._drop_level_numbers(levnums) @@ -925,52 +956,47 @@ def new_stack(df, levels, sort: bool = True): buf.append(data) - if len(buf) == 0: - return df - from pandas.core.reshape.concat import concat result = concat(buf) if len(levels) < df.columns.nlevels: - result = result[df.columns._drop_level_numbers(sorted(levels)[::-1]).unique()] + desired_columns = df.columns._drop_level_numbers(sorted(levels)[::-1]).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] # Construct the correct MultiIndex by combining the input's index and # stacked columns. if isinstance(df.index, MultiIndex): index_levels = [level.unique() for level in df.index.levels] + index_codes = np.tile(df.index.codes, (1, len(result) // len(df))) else: index_levels = [df.index.unique()] + codes = factorize(df.index)[0] + index_codes = np.tile(codes, (1, len(result) // len(df))) + index_codes = list(index_codes) + if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes else: column_levels = [ ordered_stack_cols.get_level_values(e).unique() for e in range(ordered_stack_cols.nlevels) ] - if isinstance(df.index, MultiIndex): - index_codes = np.tile(df.index.codes, (1, len(result) // len(df))) - elif isinstance(df.index, RangeIndex): - index_codes = np.tile(np.arange(len(df)), (1, len(result) // len(df))) - else: - codes = factorize(df.index)[0] - index_codes = np.tile(codes, (1, len(result) // len(df))) - index_codes = list(index_codes) - if isinstance(stack_cols, MultiIndex): - column_codes = ordered_stack_cols.drop_duplicates().codes - elif isinstance(stack_cols, RangeIndex): - column_codes = [np.arange(len(stack_cols))] - else: # TODO: use_na_sentinel? column_codes = [ factorize(ordered_stack_cols.unique(), use_na_sentinel=False)[0] ] column_codes = [np.repeat(codes, len(df)) for codes in column_codes] + index_names = df.index.names column_names = list(ordered_stack_cols.names) + result.index = MultiIndex( levels=index_levels + column_levels, codes=index_codes + column_codes, names=index_names + column_names, + verify_integrity=False, ) # sort result, but faster than calling sort_index since we know the order we need diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index b8bed2fe2e4c9..c1e84e7d876d9 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1319,7 +1319,7 @@ def test_stack_timezone_aware_values(): def test_stack_empty_frame(dropna): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] - expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) + expected = Series(index=MultiIndex(levels=levels, codes=[[], []])) result = DataFrame(dtype=np.float64).stack(dropna=dropna) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c0704d9684574..0d71719dd2f98 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -218,9 +218,13 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + tm.assert_index_equal( + (desc_result.stack(sort=False).index.get_level_values(0)), exp + ) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(sort=False).index.get_level_values(1)), exp + ) def test_level_get_group(observed): @@ -653,7 +657,9 @@ def test_datetime(): exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) + tm.assert_index_equal( + (desc_result.stack(sort=False).index.get_level_values(1)), exp + ) def test_categorical_index(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 98fce9d668e44..24835856bb182 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1100,7 +1100,7 @@ def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() + expected = grouped.describe().stack(sort=False) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cacc4a6c6b5a6..bf0b646847ed6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -279,7 +279,6 @@ def test_pass_args_kwargs_duplicate_columns(tsframe, as_index): expected.index = Index(range(2)) tm.assert_frame_equal(res, expected) - assert False def test_len(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a966ad1dabcaa..a7c1edb366862 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1812,7 +1812,7 @@ def test_to_json_multiindex_escape(self): True, index=pd.date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], - ).stack() + ).stack(sort=False) result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," From db02720af2494c0a15aaf1a67a41b71a409483c8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 19 Jun 2023 17:36:08 -0400 Subject: [PATCH 05/12] Ban duplicate values in columns when stacking --- pandas/core/reshape/reshape.py | 2 + pandas/tests/frame/test_stack_unstack.py | 61 +++--------------------- 2 files changed, 9 insertions(+), 54 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 7cef167929887..2ef2caf9788f2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -887,6 +887,8 @@ def _reorder_for_extension_array_stack( def new_stack(df, levels, sort: bool = True): + if df.columns.nunique() != len(df.columns): + raise ValueError("Columns with duplicate values are not supported in stack") if df.empty: N, K = df.shape if len(levels) < df.index.nlevels: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index c1e84e7d876d9..59c9fad259d20 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1082,24 +1082,16 @@ def test_stack_full_multiIndex(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) - def test_stack_preserve_categorical_dtype(self, ordered, labels): + def test_stack_preserve_categorical_dtype(self, ordered): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. - - if labels == list("yxz"): - midx = MultiIndex.from_product([df.index, cidx.sort_values()]) - expected = Series([11, 10, 12], index=midx) - else: - # Don't include the 2nd y - midx = MultiIndex.from_product([df.index, cidx[:2].sort_values()]) - expected = DataFrame({0: [11, 10], 1: [np.nan, 12.0]}, index=midx) - + midx = MultiIndex.from_product([df.index, cidx.sort_values()]) + expected = Series([11, 10, 12], index=midx) tm.assert_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @@ -1578,17 +1570,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): @pytest.mark.parametrize( "idx, columns, exp_idx", [ - [ - list("abab"), - ["1st", "2nd", "3rd"], - MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.tile(np.arange(3), 4), - ], - ), - ], [ list("abab"), ["1st", "2nd", "1st"], @@ -1618,37 +1599,9 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx): index=idx, columns=columns, ) - result = df.stack() - if columns == ["1st", "2nd", "1st"] and isinstance(idx, MultiIndex): - expected = DataFrame( - { - 0: [0, 1, 3, 4, 6, 7, 9, 10], - 1: [2.0, np.nan, 5.0, np.nan, 8.0, np.nan, 11.0, np.nan], - }, - index=MultiIndex( - levels=[["a", "b"], [1, 2], ["1st", "2nd"]], - codes=[ - [0, 0, 1, 1, 0, 0, 1, 1], - [1, 1, 0, 0, 0, 0, 1, 1], - [0, 1, 0, 1, 0, 1, 0, 1], - ], - ), - ) - elif columns == ["1st", "2nd", "1st"]: - expected = DataFrame( - { - 0: [0, 1, 3, 4, 6, 7, 9, 10], - 1: [2.0, np.nan, 5.0, np.nan, 8.0, np.nan, 11.0, np.nan], - }, - index=MultiIndex( - levels=[["a", "b"], ["1st", "2nd"]], - codes=[[0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]], - ), - ) - else: - expected = Series(np.arange(12), index=exp_idx) - - tm.assert_equal(result, expected) + msg = "Columns with duplicate values are not supported in stack" + with pytest.raises(ValueError, match=msg): + df.stack() def test_unstack_odd_failure(self): data = """day,time,smoker,sum,len From c2980222bc39b011ff08376de5512ad6d5d12d80 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 19 Jun 2023 18:07:33 -0400 Subject: [PATCH 06/12] Some refactors --- pandas/core/frame.py | 31 +----- pandas/core/reshape/reshape.py | 189 +++++++++++++++++++-------------- 2 files changed, 114 insertions(+), 106 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 88a8baa38ca05..b946ea738366a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9151,7 +9151,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg NaN 2.0 m 3.0 NaN """ - from pandas.core.reshape.reshape import new_stack + from pandas.core.reshape.reshape import stack_v2 if ( isinstance(level, (tuple, list)) @@ -9163,31 +9163,10 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): "numbers, not a mixture of the two." ) - new_level = level - if not isinstance(new_level, (tuple, list)): - new_level = [new_level] - new_level = [self.columns._get_level_number(lev) for lev in new_level] - result = new_stack(self, new_level, sort=sort) - if ( - result.ndim == 2 - and ( - ( - isinstance(self.columns, MultiIndex) - and self.columns.nlevels == len(new_level) - ) - or (not isinstance(self.columns, MultiIndex)) - ) - # and not result.empty - and len(result.columns) <= 1 - ): - if len(result.columns) == 0: - result = Series(index=result.index) - else: - result = result.iloc[:, 0] - if result.ndim == 1: - result = result.rename(None) - if dropna: - result = result.dropna(how="all") + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v2(self, level, dropna=dropna, sort=sort) return result.__finalize__(self, method="stack") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2ef2caf9788f2..a4899cc8dd250 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -40,6 +40,7 @@ MultiIndex, RangeIndex, ) +from pandas.core.reshape.concat import concat from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -886,95 +887,129 @@ def _reorder_for_extension_array_stack( return arr.take(idx) -def new_stack(df, levels, sort: bool = True): - if df.columns.nunique() != len(df.columns): +def stack_v2(frame, level, dropna: bool = True, sort: bool = True): + if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - if df.empty: - N, K = df.shape - if len(levels) < df.index.nlevels: - # TODO: is this hit by tests? - new_columns = df.columns._drop_level_numbers(levels) - new_levels = list(df.index.levels) - new_codes = [lab.repeat(K) for lab in df.index.codes] - - clev, clab = factorize(df.columns) - new_levels.append(clev) - new_codes.append(np.tile(clab, N).ravel()) - - new_names = list(df.index.names) - new_names.append(df.columns.name) - new_index = MultiIndex( - levels=new_levels, - codes=new_codes, - names=new_names, - verify_integrity=False, + if frame.empty: + result = stack_v2_empty(frame, level) + else: + result = stack_v2_nonempty(frame, level, sort) + + if ( + result.ndim == 2 + and ( + ( + isinstance(frame.columns, MultiIndex) + and frame.columns.nlevels == len(level) ) + or (not isinstance(frame.columns, MultiIndex)) + ) + # and not result.empty + and len(result.columns) <= 1 + ): + if len(result.columns) == 0: + result = Series(index=result.index) else: - new_columns = RangeIndex(0) - levels, (ilab, clab) = zip(*map(factorize, (df.index, df.columns))) - codes = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex( - levels=levels, - codes=codes, - names=[df.index.name, df.columns.name], - verify_integrity=False, - ) - result = DataFrame(index=new_index, columns=new_columns) - return result + result = result.iloc[:, 0] + if result.ndim == 1: + result = result.rename(None) + if dropna: + result = result.dropna(how="all") + + return result + + +def stack_v2_empty(df, level): + N, K = df.shape + if len(level) < df.index.nlevels: + # TODO: is this hit by tests? + new_columns = df.columns._drop_level_numbers(level) + new_levels = list(df.index.levels) + new_codes = [lab.repeat(K) for lab in df.index.codes] - stack_cols = df.columns._drop_level_numbers( - [k for k in range(df.columns.nlevels) if k not in levels][::-1] + clev, clab = factorize(df.columns) + new_levels.append(clev) + new_codes.append(np.tile(clab, N).ravel()) + + new_names = list(df.index.names) + new_names.append(df.columns.name) + new_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + else: + new_columns = RangeIndex(0) + levels, (ilab, clab) = zip(*map(factorize, (df.index, df.columns))) + codes = ilab.repeat(K), np.tile(clab, N).ravel() + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[df.index.name, df.columns.name], + verify_integrity=False, + ) + result = DataFrame(index=new_index, columns=new_columns) + return result + + +def stack_v2_nonempty(frame, level, sort): + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level)[::-1] + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] ) - if len(levels) > 1: - # Arrange columns in the order we want to take them - _, taker = np.unique(levels, return_inverse=True) - ordered_stack_cols = stack_cols._reorder_ilevels(taker) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) else: ordered_stack_cols = stack_cols + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked buf = [] - for idx in stack_cols.unique(): - if not isinstance(idx, tuple): - idx = (idx,) - # Take the data from df corresponding to this idx value - gen = iter(idx) - column_indexer = tuple( - next(gen) if k in levels else slice(None) for k in range(df.columns.nlevels) - ) - if len(df.columns) == 1: - # TODO: Are there other cases that cause issues; e.g. one column? - data = df.copy() + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() else: - data = df.loc[:, column_indexer].copy() + # Take the data from frame corresponding to this idx value + if not isinstance(idx, tuple): + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] - if len(levels) < df.columns.nlevels: - levnums = sorted(levels)[::-1] - data.columns = data.columns._drop_level_numbers(levnums) + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) elif stack_cols.nlevels == 1: - if data.ndim == 2: - data.columns = RangeIndex(len(data.columns)) - else: + if data.ndim == 1: data = data.rename(0) - + else: + data.columns = RangeIndex(len(data.columns)) buf.append(data) - - from pandas.core.reshape.concat import concat - result = concat(buf) - if len(levels) < df.columns.nlevels: - desired_columns = df.columns._drop_level_numbers(sorted(levels)[::-1]).unique() + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() if not result.columns.equals(desired_columns): result = result[desired_columns] # Construct the correct MultiIndex by combining the input's index and # stacked columns. - if isinstance(df.index, MultiIndex): - index_levels = [level.unique() for level in df.index.levels] - index_codes = np.tile(df.index.codes, (1, len(result) // len(df))) + if isinstance(frame.index, MultiIndex): + index_levels = [level.unique() for level in frame.index.levels] + index_codes = np.tile(frame.index.codes, (1, len(result) // len(frame))) else: - index_levels = [df.index.unique()] - codes = factorize(df.index)[0] - index_codes = np.tile(codes, (1, len(result) // len(df))) + index_levels = [frame.index.unique()] + codes = factorize(frame.index)[0] + index_codes = np.tile(codes, (1, len(result) // len(frame))) index_codes = list(index_codes) if isinstance(stack_cols, MultiIndex): @@ -985,27 +1020,21 @@ def new_stack(df, levels, sort: bool = True): ordered_stack_cols.get_level_values(e).unique() for e in range(ordered_stack_cols.nlevels) ] - # TODO: use_na_sentinel? - column_codes = [ - factorize(ordered_stack_cols.unique(), use_na_sentinel=False)[0] - ] - column_codes = [np.repeat(codes, len(df)) for codes in column_codes] - - index_names = df.index.names - column_names = list(ordered_stack_cols.names) + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] result.index = MultiIndex( levels=index_levels + column_levels, codes=index_codes + column_codes, - names=index_names + column_names, + names=frame.index.names + list(ordered_stack_cols.names), verify_integrity=False, ) # sort result, but faster than calling sort_index since we know the order we need - len_df = len(df) - n_uniques = len(ordered_stack_cols.unique()) + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) if sort: - indexer = np.argsort(stack_cols.unique().to_numpy(), kind="stable") + indexer = np.argsort(stack_cols_unique.to_numpy(), kind="stable") else: indexer = np.arange(n_uniques) idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) From 4781906d878c106765532564da6e40bfb062e39d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 19 Jun 2023 18:39:38 -0400 Subject: [PATCH 07/12] Some refactors --- pandas/core/reshape/reshape.py | 104 +++++++++------------------------ 1 file changed, 29 insertions(+), 75 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index a4899cc8dd250..7b53dbe93d4f4 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -887,73 +887,10 @@ def _reorder_for_extension_array_stack( return arr.take(idx) -def stack_v2(frame, level, dropna: bool = True, sort: bool = True): +def stack_v2(frame, level: list[int], dropna: bool = True, sort: bool = True): if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - if frame.empty: - result = stack_v2_empty(frame, level) - else: - result = stack_v2_nonempty(frame, level, sort) - - if ( - result.ndim == 2 - and ( - ( - isinstance(frame.columns, MultiIndex) - and frame.columns.nlevels == len(level) - ) - or (not isinstance(frame.columns, MultiIndex)) - ) - # and not result.empty - and len(result.columns) <= 1 - ): - if len(result.columns) == 0: - result = Series(index=result.index) - else: - result = result.iloc[:, 0] - if result.ndim == 1: - result = result.rename(None) - if dropna: - result = result.dropna(how="all") - - return result - - -def stack_v2_empty(df, level): - N, K = df.shape - if len(level) < df.index.nlevels: - # TODO: is this hit by tests? - new_columns = df.columns._drop_level_numbers(level) - new_levels = list(df.index.levels) - new_codes = [lab.repeat(K) for lab in df.index.codes] - - clev, clab = factorize(df.columns) - new_levels.append(clev) - new_codes.append(np.tile(clab, N).ravel()) - new_names = list(df.index.names) - new_names.append(df.columns.name) - new_index = MultiIndex( - levels=new_levels, - codes=new_codes, - names=new_names, - verify_integrity=False, - ) - else: - new_columns = RangeIndex(0) - levels, (ilab, clab) = zip(*map(factorize, (df.index, df.columns))) - codes = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex( - levels=levels, - codes=codes, - names=[df.index.name, df.columns.name], - verify_integrity=False, - ) - result = DataFrame(index=new_index, columns=new_columns) - return result - - -def stack_v2_nonempty(frame, level, sort): # If we need to drop `level` from columns, it needs to be in descending order drop_levnums = sorted(level)[::-1] stack_cols = frame.columns._drop_level_numbers( @@ -993,7 +930,18 @@ def stack_v2_nonempty(frame, level, sort): else: data.columns = RangeIndex(len(data.columns)) buf.append(data) - result = concat(buf) + if len(buf) > 0: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns) + ratio = 0 if len(level) < frame.columns.nlevels: # concat column order may be different from dropping the levels @@ -1001,28 +949,23 @@ def stack_v2_nonempty(frame, level, sort): if not result.columns.equals(desired_columns): result = result[desired_columns] - # Construct the correct MultiIndex by combining the input's index and + # Construct the correct MultiIndex by combining the frame's index and # stacked columns. if isinstance(frame.index, MultiIndex): - index_levels = [level.unique() for level in frame.index.levels] - index_codes = np.tile(frame.index.codes, (1, len(result) // len(frame))) + index_levels = frame.index.levels + index_codes = np.tile(frame.index.codes, (1, ratio)) else: index_levels = [frame.index.unique()] codes = factorize(frame.index)[0] - index_codes = np.tile(codes, (1, len(result) // len(frame))) + index_codes = np.tile(codes, (1, ratio)) index_codes = list(index_codes) - if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels column_codes = ordered_stack_cols.drop_duplicates().codes else: - column_levels = [ - ordered_stack_cols.get_level_values(e).unique() - for e in range(ordered_stack_cols.nlevels) - ] + column_levels = [ordered_stack_cols.unique()] column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] - result.index = MultiIndex( levels=index_levels + column_levels, codes=index_codes + column_codes, @@ -1040,4 +983,15 @@ def stack_v2_nonempty(frame, level, sort): idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) result = result.take(idxs) + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + if dropna: + result = result.dropna(how="all") + return result From d4ca34c66308f25239ad533a6ff9a152378a2435 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 19 Jun 2023 18:48:34 -0400 Subject: [PATCH 08/12] Some refactors --- pandas/core/indexes/multi.py | 14 ++------------ pandas/tests/frame/test_stack_unstack.py | 4 ++-- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c5d8edd065a5f..95568af88ee6c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2369,18 +2369,8 @@ def reorder_levels(self, order) -> MultiIndex: names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] - if len(order) != self.nlevels: - raise AssertionError( - f"Length of order must be same as number of levels ({self.nlevels}), " - f"got {len(order)}" - ) - new_levels = [self.levels[i] for i in order] - new_codes = [self.codes[i] for i in order] - new_names = [self.names[i] for i in order] - - return MultiIndex( - levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False - ) + result = self._reorder_ilevels(order) + return result def _reorder_ilevels(self, order) -> MultiIndex: if len(order) != self.nlevels: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 59c9fad259d20..e1e2f76497845 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1040,7 +1040,7 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) - result = df.stack(level=level, dropna=True) + result = df.stack(level=level, dropna=False) if isinstance(level, int): # Stacking a single level should not make any all-NaN rows, @@ -1055,7 +1055,7 @@ def test_stack_partial_multiIndex(self, multiindex_columns, level): df.columns = MultiIndex.from_tuples( df.columns.to_numpy(), names=df.columns.names ) - expected = df.stack(level=level, dropna=True) + expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: From a3d56322b790eed0536fb9748bb274bb9fc54657 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Jun 2023 18:37:40 -0400 Subject: [PATCH 09/12] POC: DataFrame.stack not including NA rows --- pandas/core/frame.py | 23 +++-- pandas/core/indexes/multi.py | 4 + pandas/core/reshape/reshape.py | 114 ++++++++++++++++++++++- pandas/tests/frame/test_stack_unstack.py | 65 ++++++------- 4 files changed, 161 insertions(+), 45 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b411cfc4a4685..b946ea738366a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9151,15 +9151,22 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True): dog kg NaN 2.0 m 3.0 NaN """ - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) + from pandas.core.reshape.reshape import stack_v2 - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) - else: - result = stack(self, level, dropna=dropna, sort=sort) + if ( + isinstance(level, (tuple, list)) + and not all(lev in self.columns.names for lev in level) + and not all(isinstance(lev, int) for lev in level) + ): + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." + ) + + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.columns._get_level_number(lev) for lev in level] + result = stack_v2(self, level, dropna=dropna, sort=sort) return result.__finalize__(self, method="stack") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 5181623c0c327..6ccd8b9c850cd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2386,6 +2386,10 @@ def reorder_levels(self, order) -> MultiIndex: names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] + result = self._reorder_ilevels(order) + return result + + def _reorder_ilevels(self, order) -> MultiIndex: if len(order) != self.nlevels: raise AssertionError( f"Length of order must be same as number of levels ({self.nlevels}), " diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f6ce9955bc2bc..f76cbc88f1543 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,14 +28,19 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.algorithms import unique +from pandas.core.algorithms import ( + factorize, + unique, +) from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, + RangeIndex, ) +from pandas.core.reshape.concat import concat from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -880,3 +885,110 @@ def _reorder_for_extension_array_stack( # c0r1, c1r1, c2r1, ...] idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel() return arr.take(idx) + + +def stack_v2(frame, level: list[int], dropna: bool = True, sort: bool = True): + if frame.columns.nunique() != len(frame.columns): + raise ValueError("Columns with duplicate values are not supported in stack") + + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level)[::-1] + stack_cols = frame.columns._drop_level_numbers( + [k for k in range(frame.columns.nlevels) if k not in level][::-1] + ) + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1] + sorter = np.argsort(level) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + + stack_cols_unique = stack_cols.unique() + ordered_stack_cols_unique = ordered_stack_cols.unique() + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols_unique: + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if not isinstance(idx, tuple): + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in level else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data = data.rename(0) + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + if len(buf) > 0: + result = concat(buf) + ratio = len(result) // len(frame) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns) + ratio = 0 + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + # Construct the correct MultiIndex by combining the frame's index and + # stacked columns. + if isinstance(frame.index, MultiIndex): + index_levels = frame.index.levels + index_codes = np.tile(frame.index.codes, (1, ratio)) + else: + index_levels = [frame.index.unique()] + codes = factorize(frame.index)[0] + index_codes = np.tile(codes, (1, ratio)) + index_codes = list(index_codes) + if isinstance(stack_cols, MultiIndex): + column_levels = ordered_stack_cols.levels + column_codes = ordered_stack_cols.drop_duplicates().codes + else: + column_levels = [ordered_stack_cols.unique()] + column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] + result.index = MultiIndex( + levels=index_levels + column_levels, + codes=index_codes + column_codes, + names=frame.index.names + list(ordered_stack_cols.names), + verify_integrity=False, + ) + + # sort result, but faster than calling sort_index since we know the order we need + len_df = len(frame) + n_uniques = len(ordered_stack_cols_unique) + indexer = np.arange(n_uniques) + idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques) + result = result.take(idxs) + + # Reshape/rename if needed and dropna + if result.ndim == 2 and frame.columns.nlevels == len(level): + if len(result.columns) == 0: + result = Series(index=result.index) + else: + result = result.iloc[:, 0] + if result.ndim == 1: + result.name = None + if dropna: + result = result.dropna(how="all") + + return result diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 13d93bb0a490d..7ef8afd293180 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -516,10 +516,10 @@ def test_unstack_level_binding(self): expected = DataFrame( np.array( - [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64 ), index=expected_mi, - columns=Index(["a", "b"], name="third"), + columns=Index(["b", "a"], name="third"), ) tm.assert_frame_equal(result, expected) @@ -1080,10 +1080,9 @@ def test_stack_full_multiIndex(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) - @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) - def test_stack_preserve_categorical_dtype(self, ordered, labels): + def test_stack_preserve_categorical_dtype(self, ordered): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) + cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() @@ -1091,8 +1090,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): # it's tested elsewhere. midx = MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) - - tm.assert_series_equal(result, expected) + tm.assert_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize( @@ -1110,8 +1108,10 @@ def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data): df = DataFrame([sorted(data)], columns=midx) result = df.stack([0, 1]) - s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered) - expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2])) + s_cidx = pd.CategoricalIndex(labels, ordered=ordered) + expected = Series( + sorted(data), index=MultiIndex.from_product([[0], s_cidx, cidx2]) + ) tm.assert_series_equal(result, expected) @@ -1311,7 +1311,7 @@ def test_stack_timezone_aware_values(): def test_stack_empty_frame(dropna): # GH 36113 levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)] - expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []])) + expected = Series(index=MultiIndex(levels=levels, codes=[[], []])) result = DataFrame(dtype=np.float64).stack(dropna=dropna) tm.assert_series_equal(result, expected) @@ -1536,7 +1536,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): # columns unsorted unstacked = ymd.unstack() - unstacked = unstacked.sort_index(axis=1, ascending=False) restacked = unstacked.stack() tm.assert_frame_equal(restacked, ymd) @@ -1569,17 +1568,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): @pytest.mark.parametrize( "idx, columns, exp_idx", [ - [ - list("abab"), - ["1st", "2nd", "3rd"], - MultiIndex( - levels=[["a", "b"], ["1st", "2nd", "3rd"]], - codes=[ - np.tile(np.arange(2).repeat(3), 2), - np.tile(np.arange(3), 4), - ], - ), - ], [ list("abab"), ["1st", "2nd", "1st"], @@ -1609,12 +1597,9 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx): index=idx, columns=columns, ) - result = df.stack() - expected = Series(np.arange(12), index=exp_idx) - tm.assert_series_equal(result, expected) - assert result.index.is_unique is False - li, ri = result.index, expected.index - tm.assert_index_equal(li, ri) + msg = "Columns with duplicate values are not supported in stack" + with pytest.raises(ValueError, match=msg): + df.stack() def test_unstack_odd_failure(self): data = """day,time,smoker,sum,len @@ -2200,8 +2185,14 @@ def test_stack_nan_in_multiindex_columns(self): result = df.stack(2) expected = DataFrame( [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]], - index=Index([(0, None), (0, 0), (0, 1)]), - columns=Index([(0, None), (0, 2), (0, 3)]), + index=MultiIndex( + levels=[[0], [0.0, 1.0]], + codes=[[0, 0, 0], [-1, 0, 1]], + ), + columns=MultiIndex( + levels=[[0], [2, 3]], + codes=[[0, 0, 0], [-1, 0, 1]], + ), ) tm.assert_frame_equal(result, expected) @@ -2219,20 +2210,20 @@ def test_multi_level_stack_categorical(self): expected = DataFrame( [ [0, np.nan], - [np.nan, 2], [1, np.nan], + [np.nan, 2], [np.nan, 3], [4, np.nan], - [np.nan, 6], [5, np.nan], + [np.nan, 6], [np.nan, 7], ], columns=["A", "B"], index=MultiIndex.from_arrays( [ [0] * 4 + [1] * 4, - pd.Categorical(list("aabbaabb")), - pd.Categorical(list("cdcdcdcd")), + pd.Categorical(list("abababab")), + pd.Categorical(list("ccddccdd")), ] ), ) @@ -2252,8 +2243,10 @@ def test_stack_nan_level(self): expected = DataFrame( [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]], columns=Index(["A", "B"], name="Upper"), - index=MultiIndex.from_tuples( - [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"] + index=MultiIndex( + levels=[[0, 1], [np.nan, "b"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["Num", "Lower"], ), ) tm.assert_frame_equal(result, expected) From 5c62b3a898befb4f0f74d0d466db64b24535a86e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 23 Jun 2023 17:38:17 -0400 Subject: [PATCH 10/12] merge cleanup --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/tests/frame/test_stack_unstack.py | 23 ++++++++++------------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9bd1a61dec153..43854c5849481 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -408,7 +408,7 @@ def _wrap_applied_output( res_df = self._reindex_output(res_df) # if self.observed is False, # keep all-NaN rows created while re-indexing - res_ser = res_df.stack(sort=False, dropna=self.observed) + res_ser = res_df.stack(dropna=self.observed) res_ser.name = self.obj.name return res_ser elif isinstance(values[0], (Series, DataFrame)): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 80bdc6816131a..c094a62b22feb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3248,7 +3248,7 @@ def describe( if obj.ndim == 1: result = described else: - result = described.unstack(sort=False) + result = described.unstack() return result.to_frame().T.iloc[:0] with com.temp_setattr(self, "as_index", True): diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index f440a4031cc67..520ebcc25a3eb 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -47,12 +47,9 @@ def test_stack_mixed_level(self): # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) - with pytest.raises( - TypeError, match="'<' not supported between instances of 'int' and 'str'" - ): - result = df.stack() - # expected = Series(1, index=MultiIndex.from_product(levels[:2])) - # tm.assert_series_equal(result, expected) + result = df.stack() + expected = Series(1, index=MultiIndex.from_product(levels[:2])) + tm.assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) @@ -1091,8 +1088,9 @@ def test_stack_preserve_categorical_dtype(self, ordered): # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. - midx = MultiIndex.from_product([df.index, cidx.sort_values()]) - expected = Series([11, 10, 12], index=midx) + midx = MultiIndex.from_product([df.index, cidx]) + expected = Series([10, 11, 12], index=midx) + tm.assert_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @@ -1540,8 +1538,7 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data): # columns unsorted unstacked = ymd.unstack() restacked = unstacked.stack() - expected = ymd.sort_index(axis=1, ascending=False) - tm.assert_frame_equal(restacked, expected) + tm.assert_frame_equal(restacked, ymd) # more than 2 levels in the columns unstacked = ymd.unstack(1).unstack(1) @@ -2030,12 +2027,12 @@ def test_stack_order_with_unsorted_levels_multi_row_2(self): result = df.stack(stack_lev, sort=True) expected_index = MultiIndex( levels=[[0, 1, 2, 3], [0, 1]], - codes=[[1, 1, 0, 0, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]], + codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]], ) expected = DataFrame( { - 0: [1, 0, 1, 0, 1, 0, 1, 0], - 1: [3, 2, 3, 2, 3, 2, 3, 2], + 0: [0, 1, 0, 1, 0, 1, 0, 1], + 1: [2, 3, 2, 3, 2, 3, 2, 3], }, index=expected_index, ) From 2901557db024c2edef6e674d87f04bbb2010d69f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 23 Jun 2023 17:39:30 -0400 Subject: [PATCH 11/12] merge cleanup --- pandas/tests/groupby/test_categorical.py | 12 +++--------- pandas/tests/groupby/test_function.py | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0d71719dd2f98..c0704d9684574 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -218,13 +218,9 @@ def f(x): # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal( - (desc_result.stack(sort=False).index.get_level_values(0)), exp - ) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal( - (desc_result.stack(sort=False).index.get_level_values(1)), exp - ) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_level_get_group(observed): @@ -657,9 +653,7 @@ def test_datetime(): exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) - tm.assert_index_equal( - (desc_result.stack(sort=False).index.get_level_values(1)), exp - ) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_categorical_index(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e086df50e882..0535bafc2a907 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1097,7 +1097,7 @@ def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(sort=False) + expected = grouped.describe().stack() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 332974d674093..90c48012ccac9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1879,7 +1879,7 @@ def test_to_json_multiindex_escape(self): True, index=pd.date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], - ).stack(sort=False) + ).stack() result = df.to_json() expected = ( "{\"(Timestamp('2017-01-20 00:00:00'), 'foo')\":true," From 0f7269671cbac6a4372291fb757c237e93076309 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 23 Jun 2023 17:54:24 -0400 Subject: [PATCH 12/12] cleanup --- pandas/tests/frame/test_stack_unstack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 520ebcc25a3eb..cb968ac148595 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1091,7 +1091,7 @@ def test_stack_preserve_categorical_dtype(self, ordered): midx = MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) - tm.assert_equal(result, expected) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize(