POC: DataFrame.stack not including NA rows #53756
Changes from all commits
@@ -28,14 +28,19 @@
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.algorithms import unique
from pandas.core.algorithms import (
    factorize,
    unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
@@ -876,3 +881,110 @@ def _reorder_for_extension_array_stack(
    # c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)


def stack_v2(frame, level: list[int], dropna: bool = True, sort: bool = True):
    if frame.columns.nunique() != len(frame.columns):
        raise ValueError("Columns with duplicate values are not supported in stack")

    # If we need to drop `level` from columns, it needs to be in descending order
    drop_levnums = sorted(level)[::-1]
    stack_cols = frame.columns._drop_level_numbers(
        [k for k in range(frame.columns.nlevels) if k not in level][::-1]
    )
    if len(level) > 1:
        # Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
        sorter = np.argsort(level)
        ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
    else:
        ordered_stack_cols = stack_cols

    stack_cols_unique = stack_cols.unique()
    ordered_stack_cols_unique = ordered_stack_cols.unique()

    # Grab data for each unique index to be stacked
    buf = []
    for idx in stack_cols_unique:
        if len(frame.columns) == 1:
            data = frame.copy()
        else:
            # Take the data from frame corresponding to this idx value
            if not isinstance(idx, tuple):
                idx = (idx,)
            gen = iter(idx)
            column_indexer = tuple(
                next(gen) if k in level else slice(None)
                for k in range(frame.columns.nlevels)
            )
            data = frame.loc[:, column_indexer]

        if len(level) < frame.columns.nlevels:
            data.columns = data.columns._drop_level_numbers(drop_levnums)
        elif stack_cols.nlevels == 1:
            if data.ndim == 1:
                data = data.rename(0)
            else:
                data.columns = RangeIndex(len(data.columns))
        buf.append(data)

    if len(buf) > 0:
        result = concat(buf)
        ratio = len(result) // len(frame)
    else:
        # input is empty
        if len(level) < frame.columns.nlevels:
            # concat column order may be different from dropping the levels
            new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        else:
            new_columns = [0]
        result = DataFrame(columns=new_columns)
        ratio = 0

    if len(level) < frame.columns.nlevels:
        # concat column order may be different from dropping the levels
        desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
        if not result.columns.equals(desired_columns):
            result = result[desired_columns]

    # Construct the correct MultiIndex by combining the frame's index and
    # stacked columns.

Comment on lines +948 to +949:

Reviewer: You might be able to go back to your original simpler implementation, now that MultiIndex.append / concat performance has been improved (#53697).

Author: I would have to guess that constructing a MultiIndex once is faster than constructing a MultiIndex many times and concat'ing them together. In any case, the complexity really arose from getting this to pass our suite of tests. The tests we have for stack/unstack are quite thorough.

Reviewer: Yes, indeed, just reusing the levels and repeating or tiling the existing codes should always be (at least a bit) faster than concatting.

    if isinstance(frame.index, MultiIndex):
        index_levels = frame.index.levels
        index_codes = np.tile(frame.index.codes, (1, ratio))
    else:
        index_levels = [frame.index.unique()]
        codes = factorize(frame.index)[0]
        index_codes = np.tile(codes, (1, ratio))
    index_codes = list(index_codes)
    if isinstance(stack_cols, MultiIndex):
        column_levels = ordered_stack_cols.levels
        column_codes = ordered_stack_cols.drop_duplicates().codes
    else:
        column_levels = [ordered_stack_cols.unique()]
        column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
    column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
    result.index = MultiIndex(
        levels=index_levels + column_levels,
        codes=index_codes + column_codes,
        names=frame.index.names + list(ordered_stack_cols.names),
        verify_integrity=False,
    )

    # sort result, but faster than calling sort_index since we know the order we need
    len_df = len(frame)
    n_uniques = len(ordered_stack_cols_unique)
    indexer = np.arange(n_uniques)
    idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
    result = result.take(idxs)
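    # e.g. with len_df == 2 rows and n_uniques == 3 stacked column values,
    # concat produced [r0c0, r1c0, r0c1, r1c1, r0c2, r1c2]; idxs is
    # [0, 2, 4, 1, 3, 5], so the take above reorders the rows to
    # [r0c0, r0c1, r0c2, r1c0, r1c1, r1c2].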

    # Reshape/rename if needed and dropna
    if result.ndim == 2 and frame.columns.nlevels == len(level):
        if len(result.columns) == 0:
            result = Series(index=result.index)
        else:
            result = result.iloc[:, 0]
    if result.ndim == 1:
        result.name = None
    if dropna:
        result = result.dropna(how="all")

    return result
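
A standalone sketch, not part of the PR, of the trade-off discussed in the review thread on lines +948 to +949: building the stacked MultiIndex once from the existing levels with tiled/repeated codes, versus appending one small MultiIndex per stacked column. The index values, column labels, and names below are invented purely for illustration.

import numpy as np
import pandas as pd

index = pd.Index(["x", "y", "z"], name="row")  # frame's original index
columns = pd.Index(["a", "b"], name="col")     # column level being stacked

# One-shot construction: reuse the existing levels and build the codes by
# tiling the row positions and repeating the column positions.
mi_once = pd.MultiIndex(
    levels=[index, columns],
    codes=[
        np.tile(np.arange(len(index)), len(columns)),    # x, y, z, x, y, z
        np.repeat(np.arange(len(columns)), len(index)),  # a, a, a, b, b, b
    ],
    names=["row", "col"],
    verify_integrity=False,
)

# Same values obtained by concatenating one small MultiIndex per stacked column.
blocks = [
    pd.MultiIndex.from_product([index, [col]], names=["row", "col"])
    for col in columns
]
mi_concat = blocks[0].append(blocks[1:])

assert mi_once.equals(mi_concat)

The assert only checks that both constructions yield the same values; the thread's point is that the one-shot version avoids building and concatenating many intermediate indexes.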

Review thread on the duplicate-columns check:

Reviewer: Is this a new restriction, or just moving up from the previous stack_multiple implementation? (It certainly sounds like a good restriction, though.)

Author: No - this already exists in the current implementation.
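
A small example of the restriction discussed in this thread, based on the nunique() check at the top of stack_v2. It assumes the POC branch is installed; the import path for stack_v2 is a guess at where the function lives in this PR.

import pandas as pd

# Import path is an assumption about the POC branch layout.
from pandas.core.reshape.reshape import stack_v2

# Duplicate column labels trip the nunique() check at the top of stack_v2.
df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "a"])
try:
    stack_v2(df, level=[0])
except ValueError as err:
    print(err)  # Columns with duplicate values are not supported in stack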