Skip to content

ENH/API: DataFrame.stack() support for level=None, sequentially=True/False, and NaN level values. #9023

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 65 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3718,7 +3718,7 @@ def pivot(self, index=None, columns=None, values=None):
from pandas.core.reshape import pivot
return pivot(self, index=index, columns=columns, values=values)

def stack(self, level=-1, dropna=True):
def stack(self, level=-1, dropna=True, sequentially=True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we think we'll want to change the default eventually, let's default to None here for now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite understand. You want sequentially to default to None? What would be the meaning?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. My comment here doesn't make sense to me anymore.

"""
Pivot a level of the (possibly hierarchical) column labels, returning a
DataFrame (or Series in the case of an object with a single level of
Expand All @@ -3728,11 +3728,15 @@ def stack(self, level=-1, dropna=True):

Parameters
----------
level : int, string, or list of these, default last level
Level(s) to stack, can pass level name
level : int, string, list of these, or None; default -1 (last level)
Level(s) to stack, can pass level name(s).
None specifies all column levels, i.e. list(range(columns.nlevels)).
dropna : boolean, default True
Whether to drop rows in the resulting Frame/Series with no valid
values
sequentially : boolean, default True
When level is a list (or None), whether the multiple column levels
should be stacked sequentially (if True) or simultaneously (if False).

Examples
----------
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add examples here showing the use of level=None and sequentially=False (though they might be too long for a docstring). The docs will also need to be updated for these features; the docstring could then simply point to them.

Expand All @@ -3751,14 +3755,20 @@ def stack(self, level=-1, dropna=True):
-------
stacked : DataFrame or Series
"""
from pandas.core.reshape import stack, stack_multiple
from pandas.core.reshape import stack_levels_sequentially, stack_multi_levels_simultaneously

if isinstance(level, (tuple, list)):
return stack_multiple(self, level, dropna=dropna)
level_nums = self.columns._get_level_numbers(level, allow_mixed_names_and_numbers=False)
if level_nums == []:
if dropna:
return self.dropna(axis=0, how='all')
else:
return self
elif (not sequentially) and isinstance(self.columns, MultiIndex):
return stack_multi_levels_simultaneously(self, level_nums, dropna=dropna)
else:
return stack(self, level, dropna=dropna)
return stack_levels_sequentially(self, level_nums, dropna=dropna)

def unstack(self, level=-1):
def unstack(self, level=-1, dropna=False, sequentially=False):
"""
Pivot a level of the (necessarily hierarchical) index labels, returning
a DataFrame having a new level of column labels whose inner-most level
Expand All @@ -3769,8 +3779,15 @@ def unstack(self, level=-1):

Parameters
----------
level : int, string, or list of these, default -1 (last level)
Level(s) of index to unstack, can pass level name
level : int, string, list of these, or None; default -1 (last level)
Level(s) of index to unstack, can pass level name(s).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dropna needs to be added to the Parameters list.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do.

None specifies all index levels, i.e. list(range(index.nlevels)).
dropna : boolean, default False
Whether to drop columns in the resulting Frame/Series with no valid
values
sequentially : boolean, default False
When level is a list (or None), whether the multiple index levels
should be unstacked sequentially (if True) or simultaneously (if False).

See also
--------
Expand Down Expand Up @@ -3812,7 +3829,44 @@ def unstack(self, level=-1):
unstacked : DataFrame or Series
"""
from pandas.core.reshape import unstack
return unstack(self, level)

level_nums = self.index._get_level_numbers(level, allow_mixed_names_and_numbers=False)
if level_nums == []:
if dropna:
return self.dropna(axis=1, how='all')
else:
return self
if sequentially and isinstance(level_nums, list) and (len(level_nums) > 1):
result = self
# Adjust level_nums to account for the fact that levels move "up"
# as a result of unstacking earlier levels.
adjusted_level_nums = [x - sum((y < x) for y in level_nums[:i])
for i, x in enumerate(level_nums)]
for level_num in adjusted_level_nums:
result = unstack(result, level_num)
else:
result = unstack(self, level_nums)

if isinstance(result, DataFrame):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can probably just do convert_objects(), and it should handle most/all of this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. convert_objects() just works with object columns, but here I (may) need to convert float columns to int.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic needs to be rethought.
What you are doing is very fragile.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the idea is to try to restore the original dtypes (e.g. an integer or boolean type) unless that is impossible because NaNs were introduced. Previously the original dtypes were not restored -- see, for example, my change to pandas/tools/tests/test_pivot.py below. I'd welcome a cleaner way of doing this.

# fix dtypes, if necessary
desired_dtypes = self.dtypes.values.repeat(len(result.columns) // len(self.columns))
result_dtypes = result.dtypes.values
for i, c in enumerate(result.columns):
if result_dtypes[i] != desired_dtypes[i]:
if result_dtypes[i] == np.object:
# use default Series constructor to set type
result[c] = Series(result[c].values.tolist(), index=result.index)
else:
# try to convert type directly
result[c] = result[c].astype(desired_dtypes[i], raise_on_error=False)
# drop empty columns, if necessary
if dropna:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't use inplace internally (I really don't like it even externally!)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, will change.

result = result.dropna(axis=1, how='all')
else:
if dropna:
result = result.dropna()

return result

#----------------------------------------------------------------------
# Time series-related
Expand Down
103 changes: 76 additions & 27 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,7 +1033,7 @@ def _validate_index_level(self, level):
verification must be done like in MultiIndex.

"""
if isinstance(level, int):
if com.is_integer(level):
if level < 0 and level != -1:
raise IndexError("Too many levels: Index has only 1 level,"
" %d is not a valid level number" % (level,))
Expand All @@ -1045,10 +1045,44 @@ def _validate_index_level(self, level):
raise KeyError('Level %s must be same as name (%s)'
% (level, self.name))

def _get_level_number(self, level):
def _get_level_number(self, level, ignore_names=False):
"""
Returns level number corresponding to level.
If level is a level name and ignore_names is False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you indent the continuation lines? It's probably not PEP8, but it makes the separate items easier to read.

the level number corresponding to such level name is returned.
Otherwise level must be a number.
If level is a non-negative number, it is returned.
If level is a negative number, its sum with self.nlevels is returned.
"""
if ignore_names and (not com.is_integer(level)):
raise KeyError('Level %s not found' % str(level))
self._validate_index_level(level)
return 0

def _get_level_numbers(self, levels, allow_mixed_names_and_numbers=False):
"""
Returns level numbers corresponding to levels.
If levels is None, a list of all level numbers is returned.
If levels is a single number or level name,
then a single number is returned (using _get_level_number()).
If levels is a list of numbers or level names,
then a list of numbers is returned (each using _get_level_number()).
If allow_mixed_names_and_numbers is False, then levels must be
either all level numbers or all level names.
"""
if levels is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a short explanation of what this returns (e.g. it's a list).

return list(range(self.nlevels))
elif isinstance(levels, (list, tuple, set)):
if (not allow_mixed_names_and_numbers) and (not all(lev in self.names for lev in levels)):
if all(isinstance(lev, int) for lev in levels):
return type(levels)(self._get_level_number(level, ignore_names=True) for level in levels)
else:
raise ValueError("level should contain all level names or all level numbers, "
"not a mixture of the two.")
return type(levels)(self._get_level_number(level) for level in levels)
else:
return self._get_level_number(levels)

@cache_readonly
def inferred_type(self):
""" return a string of the type inferred from the values """
Expand Down Expand Up @@ -4294,28 +4328,38 @@ def _from_elements(values, labels=None, levels=None, names=None,
sortorder=None):
return MultiIndex(levels, labels, names, sortorder=sortorder)

def _get_level_number(self, level):
try:
def _get_level_number(self, level, ignore_names=False):
"""
Returns level number corresponding to level.
If level is a level name and ignore_names is False,
the level number corresponding to such level name is returned.
Otherwise level must be a number.
If level is a non-negative number, it is returned.
If level is a negative number, its sum with self.nlevels is returned.
"""
if not ignore_names:
count = self.names.count(level)
if count > 1:
raise ValueError('The name %s occurs multiple times, use a '
'level number' % level)
level = self.names.index(level)
except ValueError:
if not isinstance(level, int):
raise KeyError('Level %s not found' % str(level))
elif level < 0:
level += self.nlevels
if level < 0:
orig_level = level - self.nlevels
raise IndexError(
'Too many levels: Index has only %d levels, '
'%d is not a valid level number' % (self.nlevels, orig_level)
)
# Note: levels are zero-based
elif level >= self.nlevels:
raise IndexError('Too many levels: Index has only %d levels, '
'not %d' % (self.nlevels, level + 1))
try:
return self.names.index(level)
except ValueError:
pass
if not com.is_integer(level):
raise KeyError('Level %s not found' % str(level))
elif level < 0:
level += self.nlevels
if level < 0:
orig_level = level - self.nlevels
raise IndexError(
'Too many levels: Index has only %d levels, '
'%d is not a valid level number' % (self.nlevels, orig_level)
)
# Note: levels are zero-based
elif level >= self.nlevels:
raise IndexError('Too many levels: Index has only %d levels, '
'not %d' % (self.nlevels, level + 1))
return level

_tuples = None
Expand Down Expand Up @@ -4891,14 +4935,16 @@ def _drop_from_level(self, labels, level):

return self[mask]

def droplevel(self, level=0):
def droplevel(self, level=0, ignore_names=False):
"""
Return Index with requested level removed. If MultiIndex has only 2
levels, the result will be of Index type not MultiIndex.

Parameters
----------
level : int/level name or list thereof
ignore_names : boolean, default False
If True, level must be an int or list thereof

Notes
-----
Expand All @@ -4916,7 +4962,7 @@ def droplevel(self, level=0):
new_labels = list(self.labels)
new_names = list(self.names)

levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1]
levnums = sorted((self._get_level_number(lev, ignore_names) for lev in levels), reverse=True)

for i in levnums:
new_levels.pop(i)
Expand All @@ -4929,6 +4975,9 @@ def droplevel(self, level=0):
mask = new_labels[0] == -1
result = new_levels[0].take(new_labels[0])
if mask.any():
if result.is_integer():
# cannot store NaNs in an integer index, so promote to Float64Index
result = Float64Index(result.values, name=result.name)
result = result.putmask(mask, np.nan)

result.name = new_names[0]
Expand Down Expand Up @@ -5539,7 +5588,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels):

else:

loc = level_index.get_loc(key)
loc = -1 if com.is_float(key) and np.isnan(key) else level_index.get_loc(key)
if level > 0 or self.lexsort_depth == 0:
return np.array(labels == loc,dtype=bool)
else:
Expand Down Expand Up @@ -6050,7 +6099,7 @@ def _trim_front(strings):


def _sanitize_and_check(indexes):
kinds = list(set([type(index) for index in indexes]))
kinds = list(set(type(index) for index in indexes))

if list in kinds:
if len(kinds) > 1:
Expand All @@ -6071,11 +6120,11 @@ def _get_consensus_names(indexes):

# find the non-none names, need to tupleify to make
# the set hashable, then reverse on return
consensus_names = set([
consensus_names = set(
tuple(i.names) for i in indexes if all(n is not None for n in i.names)
])
)
if len(consensus_names) == 1:
return list(list(consensus_names)[0])
return list(consensus_names.pop())
return [None] * indexes[0].nlevels


Expand Down
Loading