Skip to content

BUG: Fix for passing multiple ints as levels in DataFrame.stack() (#7660) #7770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,34 @@ the level numbers:

stacked.unstack('second')

.. _reshaping.stack_multiple:

You may also stack or unstack more than one level at a time by passing a list
of levels, in which case the end result is as if each level in the list were
processed individually.

.. ipython:: python

columns = MultiIndex.from_tuples([
('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')
],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(randn(4, 4), columns=columns)
df

df.stack(level=['animal', 'hair_length'])

The list of levels can contain either level names or level numbers (but
not a mixture of the two).

.. ipython:: python

# df.stack(level=['animal', 'hair_length'])
# from above is equivalent to:
df.stack(level=[1, 2])

These functions are intelligent about handling missing data and do not expect
each subgroup within the hierarchical index to have the same set of labels.
They also can handle the index being unsorted (but you can make it sorted by
Expand Down
5 changes: 5 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ users upgrade to this version.
API changes
~~~~~~~~~~~

- Passing a list of levels to ``DataFrame.stack()`` now works when the levels
are given as level numbers (:issue:`7660`), and raises a ``ValueError`` when
the levels aren't all level names or all level numbers. See
:ref:`Reshaping by stacking and unstacking <reshaping.stack_multiple>`.

.. _whatsnew_0150.cat:

Categoricals in Series/DataFrame
Expand Down
7 changes: 2 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3311,13 +3311,10 @@ def stack(self, level=-1, dropna=True):
-------
stacked : DataFrame or Series
"""
from pandas.core.reshape import stack
from pandas.core.reshape import stack, stack_multiple

if isinstance(level, (tuple, list)):
result = self
for lev in level:
result = stack(result, lev, dropna=dropna)
return result
return stack_multiple(self, level, dropna=dropna)
else:
return stack(self, level, dropna=dropna)

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2490,6 +2490,12 @@ def _get_level_number(self, level):
raise KeyError('Level %s not found' % str(level))
elif level < 0:
level += self.nlevels
if level < 0:
orig_level = level - self.nlevels
raise IndexError(
'Too many levels: Index has only %d levels, '
'%d is not a valid level number' % (self.nlevels, orig_level)
)
# Note: levels are zero-based
elif level >= self.nlevels:
raise IndexError('Too many levels: Index has only %d levels, '
Expand Down
43 changes: 40 additions & 3 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,9 +513,7 @@ def stack(frame, level=-1, dropna=True):
"names are not unique.".format(level))
raise ValueError(msg)

if isinstance(level, int) and level < 0:
level += frame.columns.nlevels

# Will also convert negative level numbers and check if out of bounds.
level = frame.columns._get_level_number(level)

if isinstance(frame.columns, MultiIndex):
Expand Down Expand Up @@ -547,6 +545,45 @@ def stack(frame, level=-1, dropna=True):
return Series(new_values, index=new_index)


def stack_multiple(frame, level, dropna=True):
    """Stack several column levels of ``frame``, one level at a time.

    Parameters
    ----------
    frame : object whose ``columns`` provide ``names`` and
        ``_get_level_number``
    level : sequence made up entirely of level names, or entirely of
        level numbers
    dropna : forwarded unchanged to every ``stack`` call

    Returns
    -------
    The result of the final ``stack`` call (``frame`` itself when ``level``
    is empty).

    Raises
    ------
    ValueError
        If ``level`` is neither all level names nor all level numbers.
    """
    # When every entry is a column level name there is no ambiguity about
    # what to do: stack them in the order given.
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for name in level:
            result = stack(result, name, dropna=dropna)
        return result

    # Anything that is not purely level numbers at this point is a mixture
    # of names and numbers, which is rejected rather than guessed at.
    if not all(isinstance(lev, int) for lev in level):
        raise ValueError("level should contain all level names or all level numbers, "
                         "not a mixture of the two.")

    # _get_level_number() range-checks each number and turns negative
    # numbers into their positive equivalents.
    pending = [frame.columns._get_level_number(lev) for lev in level]

    result = frame
    while pending:
        current, pending = pending[0], pending[1:]
        result = stack(result, current, dropna=dropna)
        # Stacking removes one column level, so every not-yet-stacked level
        # number above the one just stacked shifts down by one.
        pending = [num - 1 if num > current else num for num in pending]

    return result


def _stack_multi_columns(frame, level=-1, dropna=True):
this = frame.copy()

Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11725,6 +11725,29 @@ def test_stack_unstack(self):
assert_frame_equal(unstacked_cols.T, self.frame)
assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)

def test_stack_ints(self):
    """Stacking a list of level numbers equals stacking them one by one."""
    column_tuples = list(itertools.product(range(3), repeat=3))
    df = DataFrame(np.random.randn(30, 27),
                   columns=MultiIndex.from_tuples(column_tuples))

    # The same two levels, addressed by positive and by negative numbers.
    for levels in ([1, 2], [-2, -1]):
        assert_frame_equal(df.stack(level=levels),
                           df.stack(level=1).stack(level=1))

    # Same check once the column levels are named 0, 1 and 2.
    df_named = df.copy()
    df_named.columns.set_names(range(3), inplace=True)
    assert_frame_equal(df_named.stack(level=[1, 2]),
                       df_named.stack(level=1).stack(level=1))

def test_unstack_bool(self):
df = DataFrame([False, False],
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,12 @@ def test_count_level_corner(self):
columns=df.columns).fillna(0).astype(np.int64)
assert_frame_equal(result, expected)

def test_get_level_number_out_of_bounds(self):
    """Out-of-range level numbers raise IndexError in both directions."""
    failing_cases = (
        (2, "Too many levels"),
        (-3, "not a valid level number"),
    )
    for bad_level, message in failing_cases:
        with assertRaisesRegexp(IndexError, message):
            self.frame.index._get_level_number(bad_level)

def test_unstack(self):
# just check that it works for now
unstacked = self.ymd.unstack()
Expand Down Expand Up @@ -1005,6 +1011,22 @@ def test_stack_unstack_multiple(self):
expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all')
assert_frame_equal(unstacked, expected.ix[:, unstacked.columns])

def test_stack_names_and_numbers(self):
    """A level list mixing names and numbers is rejected by stack()."""
    wide = self.ymd.unstack(['year', 'month'])
    mixed_levels = [0, 'month']

    # A level number and a level name cannot be combined in one call.
    with assertRaisesRegexp(ValueError, "level should contain"):
        wide.stack(mixed_levels)

def test_stack_multiple_out_of_bounds(self):
    """Out-of-range level numbers in a level list raise IndexError."""
    # nlevels == 3
    wide = self.ymd.unstack(['year', 'month'])

    failing_cases = (
        ([2, 3], "Too many levels"),
        ([-4, -3], "not a valid level number"),
    )
    for bad_levels, message in failing_cases:
        with assertRaisesRegexp(IndexError, message):
            wide.stack(bad_levels)

def test_unstack_period_series(self):
# GH 4342
idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02',
Expand Down