diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 3f72d5d44f870..9659f670280ad 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -98,3 +98,6 @@ Bug Fixes - Bug in `pd.infer_freq`/`DataFrame.inferred_freq` that prevented proper sub-daily frequency inference when the index contained DST days (:issue:`8772`). - Bug where index name was still used when plotting a series with ``use_index=False`` (:issue:`8558`). + +- Bug in ``DataFrame.stack`` when stacking multiple column levels where some (or all) + of the level names are numbers (:issue:`8584`). diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5cbf392f246ed..d576f788a831f 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -525,10 +525,10 @@ def stack(frame, level=-1, dropna=True): raise ValueError(msg) # Will also convert negative level numbers and check if out of bounds. - level = frame.columns._get_level_number(level) + level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): - return _stack_multi_columns(frame, level=level, dropna=dropna) + return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_levels.append(frame.columns) @@ -595,19 +595,43 @@ def stack_multiple(frame, level, dropna=True): return result -def _stack_multi_columns(frame, level=-1, dropna=True): +def _stack_multi_columns(frame, level_num=-1, dropna=True): + def _convert_level_number(level_num, columns): + """ + Logic for converting the level number to something + we can safely pass to swaplevel: + + We generally want to convert the level number into + a level name, except when columns do not have names, + in which case we must leave as a level number + """ + if level_num in columns.names: + return columns.names[level_num] + else: + if columns.names[level_num] is None: + return level_num + else: + return columns.names[level_num] + 
this = frame.copy() # this makes life much simpler - if level != frame.columns.nlevels - 1: + if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns - for i in range(level, frame.columns.nlevels - 1): - roll_columns = roll_columns.swaplevel(i, i + 1) + for i in range(level_num, frame.columns.nlevels - 1): + # Need to check if the ints conflict with level names + lev1 = _convert_level_number(i, roll_columns) + lev2 = _convert_level_number(i + 1, roll_columns) + roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): - this = this.sortlevel(0, axis=1) + # Work around the edge case where 0 is one of the column level names, + # which interferes with trying to sort based on the first + # level + level_to_sort = _convert_level_number(0, this.columns) + this = this.sortlevel(level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: @@ -660,9 +684,9 @@ def _stack_multi_columns(frame, level=-1, dropna=True): new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
- new_levels.append(frame.columns.levels[level]) + new_levels.append(frame.columns.levels[level_num]) new_labels.append(np.tile(np.arange(levsize), N)) - new_names.append(frame.columns.names[level]) + new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 312a5df475d6e..fc031afe728dc 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12110,6 +12110,70 @@ def test_stack_ints(self): df_named.stack(level=1).stack(level=1) ) + def test_stack_mixed_levels(self): + columns = MultiIndex.from_tuples( + [('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short')], + names=['exp', 'animal', 'hair_length'] + ) + df = DataFrame(randn(4, 4), columns=columns) + + animal_hair_stacked = df.stack(level=['animal', 'hair_length']) + exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + + # GH #8584: Need to check that stacking works when a number + # is passed that is both a level name and in the range of + # the level numbers + df2 = df.copy() + df2.columns.names = ['exp', 'animal', 1] + assert_frame_equal(df2.stack(level=['animal', 1]), + animal_hair_stacked, check_names=False) + assert_frame_equal(df2.stack(level=['exp', 1]), + exp_hair_stacked, check_names=False) + + # When mixed types are passed and the ints are not level + # names, raise + self.assertRaises(ValueError, df2.stack, level=['animal', 0]) + + # GH #8584: Having 0 in the level names could raise a + # strange error about lexsort depth + df3 = df.copy() + df3.columns.names = ['exp', 'animal', 0] + assert_frame_equal(df3.stack(level=['animal', 0]), + animal_hair_stacked, check_names=False) + + def test_stack_int_level_names(self): + columns = MultiIndex.from_tuples( + [('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short')], + names=['exp', 'animal', 
'hair_length'] + ) + df = DataFrame(randn(4, 4), columns=columns) + + exp_animal_stacked = df.stack(level=['exp', 'animal']) + animal_hair_stacked = df.stack(level=['animal', 'hair_length']) + exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + + df2 = df.copy() + df2.columns.names = [0, 1, 2] + assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, + check_names=False ) + assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, + check_names=False) + assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, + check_names=False) + + # Out-of-order int column names + df3 = df.copy() + df3.columns.names = [2, 0, 1] + assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, + check_names=False) + assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, + check_names=False) + assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, + check_names=False) + + def test_unstack_bool(self): df = DataFrame([False, False], index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),