From 4ae90aeff521c10c417ce7751c79b11ec02d5db0 Mon Sep 17 00:00:00 2001 From: onesandzeroes Date: Fri, 14 Nov 2014 10:22:02 +1100 Subject: [PATCH] BUG: Stacking with multiple mixed int/str levels Add test case for mixed type stacking Used wrong var name in the assert Method to swap levels assuming ints are level numbers Fix _stack_multi_columns to deal with mixed strs/ints Extra testcases Add fix to the release notes Convert to label before swaplevel if possible Revert "Method to swap levels assuming ints are level numbers" This reverts commit 61f96fd3cb23cda9f9c7a6837b145ebd247a55cc. More test cases Use _convert_level_number() to sort columns --- doc/source/whatsnew/v0.15.2.txt | 3 ++ pandas/core/reshape.py | 42 +++++++++++++++++----- pandas/tests/test_frame.py | 64 +++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 3f72d5d44f870..9659f670280ad 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -98,3 +98,6 @@ Bug Fixes - Bug in `pd.infer_freq`/`DataFrame.inferred_freq` that prevented proper sub-daily frequency inference when the index contained DST days (:issue:`8772`). - Bug where index name was still used when plotting a series with ``use_index=False`` (:issue:`8558`). + +- Bug in ``DataFrame.stack`` when stacking multiple column levels at once, where some (or all) + of the level names are integers (:issue:`8584`). diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5cbf392f246ed..d576f788a831f 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -525,10 +525,10 @@ def stack(frame, level=-1, dropna=True): raise ValueError(msg) # Will also convert negative level numbers and check if out of bounds. 
- level = frame.columns._get_level_number(level) + level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): - return _stack_multi_columns(frame, level=level, dropna=dropna) + return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_levels.append(frame.columns) @@ -595,19 +595,43 @@ def stack_multiple(frame, level, dropna=True): return result -def _stack_multi_columns(frame, level=-1, dropna=True): +def _stack_multi_columns(frame, level_num=-1, dropna=True): + def _convert_level_number(level_num, columns): + """ + Logic for converting the level number to something + we can safely pass to swaplevel: + + We generally want to convert the level number into + a level name, except when columns do not have names, + in which case we must leave as a level number + """ + if level_num in columns.names: + return columns.names[level_num] + else: + if columns.names[level_num] is None: + return level_num + else: + return columns.names[level_num] + this = frame.copy() # this makes life much simpler - if level != frame.columns.nlevels - 1: + if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns - for i in range(level, frame.columns.nlevels - 1): - roll_columns = roll_columns.swaplevel(i, i + 1) + for i in range(level_num, frame.columns.nlevels - 1): + # Need to check if the ints conflict with level names + lev1 = _convert_level_number(i, roll_columns) + lev2 = _convert_level_number(i + 1, roll_columns) + roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): - this = this.sortlevel(0, axis=1) + # Workaround the edge case where 0 is one of the column names, + # which interferes with trying to sort based on the first + # level + level_to_sort = _convert_level_number(0, this.columns) + this = this.sortlevel(level_to_sort, axis=1) # tuple 
list excluding level for grouping columns if len(frame.columns.levels) > 2: @@ -660,9 +684,9 @@ def _stack_multi_columns(frame, level=-1, dropna=True): new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? - new_levels.append(frame.columns.levels[level]) + new_levels.append(frame.columns.levels[level_num]) new_labels.append(np.tile(np.arange(levsize), N)) - new_names.append(frame.columns.names[level]) + new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 312a5df475d6e..fc031afe728dc 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12110,6 +12110,70 @@ def test_stack_ints(self): df_named.stack(level=1).stack(level=1) ) + def test_stack_mixed_levels(self): + columns = MultiIndex.from_tuples( + [('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short')], + names=['exp', 'animal', 'hair_length'] + ) + df = DataFrame(randn(4, 4), columns=columns) + + animal_hair_stacked = df.stack(level=['animal', 'hair_length']) + exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + + # GH #8584: Need to check that stacking works when a number + # is passed that is both a level name and in the range of + # the level numbers + df2 = df.copy() + df2.columns.names = ['exp', 'animal', 1] + assert_frame_equal(df2.stack(level=['animal', 1]), + animal_hair_stacked, check_names=False) + assert_frame_equal(df2.stack(level=['exp', 1]), + exp_hair_stacked, check_names=False) + + # When mixed types are passed and the ints are not level + # names, raise + self.assertRaises(ValueError, df2.stack, level=['animal', 0]) + + # GH #8584: Having 0 in the level names could raise a + # strange error about lexsort depth + df3 = df.copy() + df3.columns.names = ['exp', 'animal', 0] + assert_frame_equal(df3.stack(level=['animal', 0]), 
+ animal_hair_stacked, check_names=False) + + def test_stack_int_level_names(self): + columns = MultiIndex.from_tuples( + [('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short')], + names=['exp', 'animal', 'hair_length'] + ) + df = DataFrame(randn(4, 4), columns=columns) + + exp_animal_stacked = df.stack(level=['exp', 'animal']) + animal_hair_stacked = df.stack(level=['animal', 'hair_length']) + exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + + df2 = df.copy() + df2.columns.names = [0, 1, 2] + assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, + check_names=False ) + assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, + check_names=False) + assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, + check_names=False) + + # Out-of-order int column names + df3 = df.copy() + df3.columns.names = [2, 0, 1] + assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, + check_names=False) + assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, + check_names=False) + assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, + check_names=False) + + def test_unstack_bool(self): df = DataFrame([False, False], index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),