-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH/API: DataFrame.stack() support for level=None, sequentially=True/False, and NaN level values. #9023
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH/API: DataFrame.stack() support for level=None, sequentially=True/False, and NaN level values. #9023
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3718,7 +3718,7 @@ def pivot(self, index=None, columns=None, values=None): | |
from pandas.core.reshape import pivot | ||
return pivot(self, index=index, columns=columns, values=values) | ||
|
||
def stack(self, level=-1, dropna=True): | ||
def stack(self, level=-1, dropna=True, sequentially=True): | ||
""" | ||
Pivot a level of the (possibly hierarchical) column labels, returning a | ||
DataFrame (or Series in the case of an object with a single level of | ||
|
@@ -3728,11 +3728,15 @@ def stack(self, level=-1, dropna=True): | |
|
||
Parameters | ||
---------- | ||
level : int, string, or list of these, default last level | ||
Level(s) to stack, can pass level name | ||
level : int, string, list of these, or None; default -1 (last level) | ||
Level(s) to stack, can pass level name(s). | ||
None specifies all column levels, i.e. list(range(columns.nlevels)). | ||
dropna : boolean, default True | ||
Whether to drop rows in the resulting Frame/Series with no valid | ||
values | ||
sequentially : boolean, default True | ||
When level is a list (or None), whether the multiple column levels | ||
should be stacked sequentially (if True) or simultaneously (if False). | ||
|
||
Examples | ||
---------- | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe add to the examples here for use of |
||
|
@@ -3751,14 +3755,20 @@ def stack(self, level=-1, dropna=True): | |
------- | ||
stacked : DataFrame or Series | ||
""" | ||
from pandas.core.reshape import stack, stack_multiple | ||
from pandas.core.reshape import stack_levels_sequentially, stack_multi_levels_simultaneously | ||
|
||
if isinstance(level, (tuple, list)): | ||
return stack_multiple(self, level, dropna=dropna) | ||
level_nums = self.columns._get_level_numbers(level, allow_mixed_names_and_numbers=False) | ||
if level_nums == []: | ||
if dropna: | ||
return self.dropna(axis=0, how='all') | ||
else: | ||
return self | ||
elif (not sequentially) and isinstance(self.columns, MultiIndex): | ||
return stack_multi_levels_simultaneously(self, level_nums, dropna=dropna) | ||
else: | ||
return stack(self, level, dropna=dropna) | ||
return stack_levels_sequentially(self, level_nums, dropna=dropna) | ||
|
||
def unstack(self, level=-1): | ||
def unstack(self, level=-1, dropna=False, sequentially=False): | ||
""" | ||
Pivot a level of the (necessarily hierarchical) index labels, returning | ||
a DataFrame having a new level of column labels whose inner-most level | ||
|
@@ -3769,8 +3779,15 @@ def unstack(self, level=-1): | |
|
||
Parameters | ||
---------- | ||
level : int, string, or list of these, default -1 (last level) | ||
Level(s) of index to unstack, can pass level name | ||
level : int, string, list of these, or None; default -1 (last level) | ||
Level(s) of index to unstack, can pass level name(s). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do. |
||
None specifies all index levels, i.e. list(range(index.nlevels)). | ||
dropna : boolean, default False | ||
Whether to drop columns in the resulting Frame/Series with no valid | ||
values | ||
sequentially : boolean, default False | |
When level is a list (or None), whether the multiple index levels | |
should be unstacked sequentially (if True) or simultaneously (if False). | |
|
||
See also | ||
-------- | ||
|
@@ -3812,7 +3829,44 @@ def unstack(self, level=-1): | |
unstacked : DataFrame or Series | ||
""" | ||
from pandas.core.reshape import unstack | ||
return unstack(self, level) | ||
|
||
level_nums = self.index._get_level_numbers(level, allow_mixed_names_and_numbers=False) | ||
if level_nums == []: | ||
if dropna: | ||
return self.dropna(axis=1, how='all') | ||
else: | ||
return self | ||
if sequentially and isinstance(level_nums, list) and (len(level_nums) > 1): | ||
result = self | ||
# Adjust level_nums to account for the fact that levels move "up" | |
# as a result of unstacking earlier levels. | |
adjusted_level_nums = [x - sum((y < x) for y in level_nums[:i]) | ||
for i, x in enumerate(level_nums)] | ||
for level_num in adjusted_level_nums: | ||
result = unstack(result, level_num) | ||
else: | ||
result = unstack(self, level_nums) | ||
|
||
if isinstance(result, DataFrame): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. you can prob just to There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think so. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This logic needs to be rethought. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, the idea is to try to restore the original |
||
# fix dtypes, if necessary | ||
desired_dtypes = self.dtypes.values.repeat(len(result.columns) // len(self.columns)) | ||
result_dtypes = result.dtypes.values | ||
for i, c in enumerate(result.columns): | ||
if result_dtypes[i] != desired_dtypes[i]: | ||
if result_dtypes[i] == np.object: | ||
# use default Series constructor to set type | ||
result[c] = Series(result[c].values.tolist(), index=result.index) | ||
else: | ||
# try to convert type directly | ||
result[c] = result[c].astype(desired_dtypes[i], raise_on_error=False) | ||
# drop empty columns, if necessary | ||
if dropna: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't use inplace internally (I really don't like it even externally!). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, will change. |
||
result = result.dropna(axis=1, how='all') | ||
else: | ||
if dropna: | ||
result = result.dropna() | ||
|
||
return result | ||
|
||
#---------------------------------------------------------------------- | ||
# Time series-related | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1033,7 +1033,7 @@ def _validate_index_level(self, level): | |
verification must be done like in MultiIndex. | ||
|
||
""" | ||
if isinstance(level, int): | ||
if com.is_integer(level): | ||
if level < 0 and level != -1: | ||
raise IndexError("Too many levels: Index has only 1 level," | ||
" %d is not a valid level number" % (level,)) | ||
|
@@ -1045,10 +1045,44 @@ def _validate_index_level(self, level): | |
raise KeyError('Level %s must be same as name (%s)' | ||
% (level, self.name)) | ||
|
||
def _get_level_number(self, level): | ||
def _get_level_number(self, level, ignore_names=False): | ||
""" | ||
Returns level number corresponding to level. | ||
If level is a level name and ignore_names is False, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you indent the second parts of the lines? It's probably not PEP8, but it makes the separate items easier to read. |
||
the level number corresponding to such level name is returned. | ||
Otherwise level must be a number. | ||
If level is a positive number, it is returned. | ||
If level is a negative number, its sum with self.nlevels is returned. | ||
""" | ||
if ignore_names and (not com.is_integer(level)): | ||
raise KeyError('Level %s not found' % str(level)) | ||
self._validate_index_level(level) | ||
return 0 | ||
|
||
def _get_level_numbers(self, levels, allow_mixed_names_and_numbers=False): | ||
""" | ||
Returns level numbers corresponding to levels. | ||
If levels is None, a list of all level numbers is returned. | ||
If levels is a single number or level name, | ||
then a single number is returned (using _get_level_number()). | ||
If levels is a list of numbers or level names, | ||
then a list of numbers is returned (each using _get_level_number()). | ||
If allow_mixed_names_and_numbers is False, then levels must be | ||
either all level numbers or all level names. | ||
""" | ||
if levels is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Put a little explanation of what this returns (e.g. it's a list). |
||
return list(range(self.nlevels)) | ||
elif isinstance(levels, (list, tuple, set)): | ||
if (not allow_mixed_names_and_numbers) and (not all(lev in self.names for lev in levels)): | ||
if all(isinstance(lev, int) for lev in levels): | ||
return type(levels)(self._get_level_number(level, ignore_names=True) for level in levels) | ||
else: | ||
raise ValueError("level should contain all level names or all level numbers, " | ||
"not a mixture of the two.") | ||
return type(levels)(self._get_level_number(level) for level in levels) | ||
else: | ||
return self._get_level_number(levels) | ||
|
||
@cache_readonly | ||
def inferred_type(self): | ||
""" return a string of the type inferred from the values """ | ||
|
@@ -4294,28 +4328,38 @@ def _from_elements(values, labels=None, levels=None, names=None, | |
sortorder=None): | ||
return MultiIndex(levels, labels, names, sortorder=sortorder) | ||
|
||
def _get_level_number(self, level): | ||
try: | ||
def _get_level_number(self, level, ignore_names=False): | ||
""" | ||
Returns level number corresponding to level. | ||
If level is a level name and ignore_names is False, | ||
the level number corresponding to such level name is returned. | ||
Otherwise level must be a number. | ||
If level is a positive number, it is returned. | ||
If level is a negative number, its sum with self.nlevels is returned. | ||
""" | ||
if not ignore_names: | ||
count = self.names.count(level) | ||
if count > 1: | ||
raise ValueError('The name %s occurs multiple times, use a ' | ||
'level number' % level) | ||
level = self.names.index(level) | ||
except ValueError: | ||
if not isinstance(level, int): | ||
raise KeyError('Level %s not found' % str(level)) | ||
elif level < 0: | ||
level += self.nlevels | ||
if level < 0: | ||
orig_level = level - self.nlevels | ||
raise IndexError( | ||
'Too many levels: Index has only %d levels, ' | ||
'%d is not a valid level number' % (self.nlevels, orig_level) | ||
) | ||
# Note: levels are zero-based | ||
elif level >= self.nlevels: | ||
raise IndexError('Too many levels: Index has only %d levels, ' | ||
'not %d' % (self.nlevels, level + 1)) | ||
try: | ||
return self.names.index(level) | ||
except ValueError: | ||
pass | ||
if not com.is_integer(level): | ||
raise KeyError('Level %s not found' % str(level)) | ||
elif level < 0: | ||
level += self.nlevels | ||
if level < 0: | ||
orig_level = level - self.nlevels | ||
raise IndexError( | ||
'Too many levels: Index has only %d levels, ' | ||
'%d is not a valid level number' % (self.nlevels, orig_level) | ||
) | ||
# Note: levels are zero-based | ||
elif level >= self.nlevels: | ||
raise IndexError('Too many levels: Index has only %d levels, ' | ||
'not %d' % (self.nlevels, level + 1)) | ||
return level | ||
|
||
_tuples = None | ||
|
@@ -4891,14 +4935,16 @@ def _drop_from_level(self, labels, level): | |
|
||
return self[mask] | ||
|
||
def droplevel(self, level=0): | ||
def droplevel(self, level=0, ignore_names=False): | ||
""" | ||
Return Index with requested level removed. If MultiIndex has only 2 | ||
levels, the result will be of Index type not MultiIndex. | ||
|
||
Parameters | ||
---------- | ||
level : int/level name or list thereof | ||
ignore_names : boolean, default False | |
If True, level must be an int or list thereof | |
|
||
Notes | ||
----- | ||
|
@@ -4916,7 +4962,7 @@ def droplevel(self, level=0): | |
new_labels = list(self.labels) | ||
new_names = list(self.names) | ||
|
||
levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] | ||
levnums = sorted((self._get_level_number(lev, ignore_names) for lev in levels), reverse=True) | ||
|
||
for i in levnums: | ||
new_levels.pop(i) | ||
|
@@ -4929,6 +4975,9 @@ def droplevel(self, level=0): | |
mask = new_labels[0] == -1 | ||
result = new_levels[0].take(new_labels[0]) | ||
if mask.any(): | ||
if result.is_integer(): | ||
# cannot store NaNs in an integer index, so promote to Float64Index | ||
result = Float64Index(result.values, name=result.name) | ||
result = result.putmask(mask, np.nan) | ||
|
||
result.name = new_names[0] | ||
|
@@ -5539,7 +5588,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): | |
|
||
else: | ||
|
||
loc = level_index.get_loc(key) | ||
loc = -1 if com.is_float(key) and np.isnan(key) else level_index.get_loc(key) | ||
if level > 0 or self.lexsort_depth == 0: | ||
return np.array(labels == loc,dtype=bool) | ||
else: | ||
|
@@ -6050,7 +6099,7 @@ def _trim_front(strings): | |
|
||
|
||
def _sanitize_and_check(indexes): | ||
kinds = list(set([type(index) for index in indexes])) | ||
kinds = list(set(type(index) for index in indexes)) | ||
|
||
if list in kinds: | ||
if len(kinds) > 1: | ||
|
@@ -6071,11 +6120,11 @@ def _get_consensus_names(indexes): | |
|
||
# find the non-none names, need to tupleify to make | ||
# the set hashable, then reverse on return | ||
consensus_names = set([ | ||
consensus_names = set( | ||
tuple(i.names) for i in indexes if all(n is not None for n in i.names) | ||
]) | ||
) | ||
if len(consensus_names) == 1: | ||
return list(list(consensus_names)[0]) | ||
return list(consensus_names.pop()) | ||
return [None] * indexes[0].nlevels | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we think we'll want to change the default eventually, let's default to `None` here for now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't quite understand. You want sequentially to default to `None`? What would be the meaning?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm. My comment here doesn't make sense to me anymore.