diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index db68c0eb224e2..ab9018da4c41a 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -160,10 +160,34 @@ the level numbers: stacked.unstack('second') +.. _reshaping.stack_multiple: + You may also stack or unstack more than one level at a time by passing a list of levels, in which case the end result is as if each level in the list were processed individually. +.. ipython:: python + + columns = MultiIndex.from_tuples([ + ('A', 'cat', 'long'), ('B', 'cat', 'long'), + ('A', 'dog', 'short'), ('B', 'dog', 'short') + ], + names=['exp', 'animal', 'hair_length'] + ) + df = DataFrame(randn(4, 4), columns=columns) + df + + df.stack(level=['animal', 'hair_length']) + +The list of levels can contain either level names or level numbers (but +not a mixture of the two). + +.. ipython:: python + + # df.stack(level=['animal', 'hair_length']) + # from above is equivalent to: + df.stack(level=[1, 2]) + These functions are intelligent about handling missing data and do not expect each subgroup within the hierarchical index to have the same set of labels. They also can handle the index being unsorted (but you can make it sorted by diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 5e3f97944c243..aa57004a70e29 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -30,6 +30,11 @@ users upgrade to this version. API changes ~~~~~~~~~~~ +- Passing multiple levels to ``DataFrame.stack()`` will now work when multiple level + numbers are passed (:issue:`7660`), and will raise a ``ValueError`` when the + levels aren't all level names or all level numbers. See + :ref:`Reshaping by stacking and unstacking <reshaping.stack_multiple>`. + .. 
_whatsnew_0150.cat: Categoricals in Series/DataFrame diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4f558dda756dd..04fe9e8d35359 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3311,13 +3311,10 @@ def stack(self, level=-1, dropna=True): ------- stacked : DataFrame or Series """ - from pandas.core.reshape import stack + from pandas.core.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): - result = self - for lev in level: - result = stack(result, lev, dropna=dropna) - return result + return stack_multiple(self, level, dropna=dropna) else: return stack(self, level, dropna=dropna) diff --git a/pandas/core/index.py b/pandas/core/index.py index 6927d5a732440..81602d5240a08 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2490,6 +2490,12 @@ def _get_level_number(self, level): raise KeyError('Level %s not found' % str(level)) elif level < 0: level += self.nlevels + if level < 0: + orig_level = level - self.nlevels + raise IndexError( + 'Too many levels: Index has only %d levels, ' + '%d is not a valid level number' % (self.nlevels, orig_level) + ) # Note: levels are zero-based elif level >= self.nlevels: raise IndexError('Too many levels: Index has only %d levels, ' diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 43784e15ab163..b014ede6e65a8 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -513,9 +513,7 @@ def stack(frame, level=-1, dropna=True): "names are not unique.".format(level)) raise ValueError(msg) - if isinstance(level, int) and level < 0: - level += frame.columns.nlevels - + # Will also convert negative level numbers and check if out of bounds. 
level = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): @@ -547,6 +545,45 @@ def stack(frame, level=-1, dropna=True): return Series(new_values, index=new_index) +def stack_multiple(frame, level, dropna=True): + # If all passed levels match up to column names, no + # ambiguity about what to do + if all(lev in frame.columns.names for lev in level): + result = frame + for lev in level: + result = stack(result, lev, dropna=dropna) + + # Otherwise, level numbers may change as each successive level is stacked + elif all(isinstance(lev, int) for lev in level): + # As each stack is done, the level numbers decrease, so we need + # to account for that when level is a sequence of ints + result = frame + # _get_level_number() checks level numbers are in range and converts + # negative numbers to positive + level = [frame.columns._get_level_number(lev) for lev in level] + + # Can't iterate directly through level as we might need to change + # values as we go + for index in range(len(level)): + lev = level[index] + result = stack(result, lev, dropna=dropna) + # Decrement all level numbers greater than current, as these + # have now shifted down by one + updated_level = [] + for other in level: + if other > lev: + updated_level.append(other - 1) + else: + updated_level.append(other) + level = updated_level + + else: + raise ValueError("level should contain all level names or all level numbers, " + "not a mixture of the two.") + + return result + + def _stack_multi_columns(frame, level=-1, dropna=True): this = frame.copy() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index df00edc46eed2..c4783bc49f0ce 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11725,6 +11725,29 @@ def test_stack_unstack(self): assert_frame_equal(unstacked_cols.T, self.frame) assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) + def test_stack_ints(self): + df = DataFrame( + np.random.randn(30, 27), + 
columns=MultiIndex.from_tuples( + list(itertools.product(range(3), repeat=3)) + ) + ) + assert_frame_equal( + df.stack(level=[1, 2]), + df.stack(level=1).stack(level=1) + ) + assert_frame_equal( + df.stack(level=[-2, -1]), + df.stack(level=1).stack(level=1) + ) + + df_named = df.copy() + df_named.columns.set_names(range(3), inplace=True) + assert_frame_equal( + df_named.stack(level=[1, 2]), + df_named.stack(level=1).stack(level=1) + ) + def test_unstack_bool(self): df = DataFrame([False, False], index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]), diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d8e17c4d1d290..5c0e500b243c9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -834,6 +834,12 @@ def test_count_level_corner(self): columns=df.columns).fillna(0).astype(np.int64) assert_frame_equal(result, expected) + def test_get_level_number_out_of_bounds(self): + with assertRaisesRegexp(IndexError, "Too many levels"): + self.frame.index._get_level_number(2) + with assertRaisesRegexp(IndexError, "not a valid level number"): + self.frame.index._get_level_number(-3) + def test_unstack(self): # just check that it works for now unstacked = self.ymd.unstack() @@ -1005,6 +1011,22 @@ def test_stack_unstack_multiple(self): expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') assert_frame_equal(unstacked, expected.ix[:, unstacked.columns]) + def test_stack_names_and_numbers(self): + unstacked = self.ymd.unstack(['year', 'month']) + + # Can't use mixture of names and numbers to stack + with assertRaisesRegexp(ValueError, "level should contain"): + unstacked.stack([0, 'month']) + + def test_stack_multiple_out_of_bounds(self): + # nlevels == 3 + unstacked = self.ymd.unstack(['year', 'month']) + + with assertRaisesRegexp(IndexError, "Too many levels"): + unstacked.stack([2, 3]) + with assertRaisesRegexp(IndexError, "not a valid level number"): + unstacked.stack([-4, -3]) + def 
test_unstack_period_series(self): # GH 4342 idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',