Skip to content

POC: DataFrame.stack not including NA rows #53756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 15 commits into from
23 changes: 15 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9158,15 +9158,22 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
dog kg 2.0 NaN
m NaN 3.0
"""
from pandas.core.reshape.reshape import (
stack,
stack_multiple,
)
from pandas.core.reshape.reshape import stack_v2

if isinstance(level, (tuple, list)):
result = stack_multiple(self, level, dropna=dropna, sort=sort)
else:
result = stack(self, level, dropna=dropna, sort=sort)
if (
isinstance(level, (tuple, list))
and not all(lev in self.columns.names for lev in level)
and not all(isinstance(lev, int) for lev in level)
):
raise ValueError(
"level should contain all level names or all level "
"numbers, not a mixture of the two."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a new restriction, or just moving up from the previous stack_multiple implementation?

(it certainly sounds like a good restriction, though)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it is not a new restriction - this check already exists in the current implementation.

)

if not isinstance(level, (tuple, list)):
level = [level]
level = [self.columns._get_level_number(lev) for lev in level]
result = stack_v2(self, level, dropna=dropna, sort=sort)

return result.__finalize__(self, method="stack")

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2386,6 +2386,10 @@ def reorder_levels(self, order) -> MultiIndex:
names=['y', 'x'])
"""
order = [self._get_level_number(i) for i in order]
result = self._reorder_ilevels(order)
return result

def _reorder_ilevels(self, order) -> MultiIndex:
if len(order) != self.nlevels:
raise AssertionError(
f"Length of order must be same as number of levels ({self.nlevels}), "
Expand Down
114 changes: 113 additions & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,19 @@
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.algorithms import unique
from pandas.core.algorithms import (
factorize,
unique,
)
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
Index,
MultiIndex,
RangeIndex,
)
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
from pandas.core.sorting import (
compress_group_index,
Expand Down Expand Up @@ -876,3 +881,110 @@ def _reorder_for_extension_array_stack(
# c0r1, c1r1, c2r1, ...]
idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
return arr.take(idx)


def stack_v2(frame, level: list[int], dropna: bool = True, sort: bool = True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the sort keyword currently ignored in this implementation?

if frame.columns.nunique() != len(frame.columns):
raise ValueError("Columns with duplicate values are not supported in stack")

# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level)[::-1]
stack_cols = frame.columns._drop_level_numbers(
[k for k in range(frame.columns.nlevels) if k not in level][::-1]
)
if len(level) > 1:
# Arrange columns in the order we want to take them, e.g. level=[2, 0, 1]
sorter = np.argsort(level)
ordered_stack_cols = stack_cols._reorder_ilevels(sorter)
else:
ordered_stack_cols = stack_cols

stack_cols_unique = stack_cols.unique()
ordered_stack_cols_unique = ordered_stack_cols.unique()

# Grab data for each unique index to be stacked
buf = []
for idx in stack_cols_unique:
if len(frame.columns) == 1:
data = frame.copy()
else:
# Take the data from frame corresponding to this idx value
if not isinstance(idx, tuple):
idx = (idx,)
gen = iter(idx)
column_indexer = tuple(
next(gen) if k in level else slice(None)
for k in range(frame.columns.nlevels)
)
data = frame.loc[:, column_indexer]

if len(level) < frame.columns.nlevels:
data.columns = data.columns._drop_level_numbers(drop_levnums)
elif stack_cols.nlevels == 1:
if data.ndim == 1:
data = data.rename(0)
else:
data.columns = RangeIndex(len(data.columns))
buf.append(data)
if len(buf) > 0:
result = concat(buf)
ratio = len(result) // len(frame)
else:
# input is empty
if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
new_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
else:
new_columns = [0]
result = DataFrame(columns=new_columns)
ratio = 0

if len(level) < frame.columns.nlevels:
# concat column order may be different from dropping the levels
desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]

# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
Comment on lines +948 to +949
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might be able to go back to your original, simpler implementation now that MultiIndex.append / concat performance has been improved (#53697).
I didn't check whether your custom version here is still faster, though; that PR implemented the improvement I suggested after profiling this stack use case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have to guess that constructing a MultiIndex once is faster than constructing a MultiIndex many times and concatenating them together. In any case, the complexity really arose from getting this to pass our suite of tests. The tests we have for stack/unstack are quite thorough.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, indeed, just reusing the levels and repeating or tiling the existing codes should always be (at least a bit) faster than concatenating.

if isinstance(frame.index, MultiIndex):
index_levels = frame.index.levels
index_codes = np.tile(frame.index.codes, (1, ratio))
else:
index_levels = [frame.index.unique()]
codes = factorize(frame.index)[0]
index_codes = np.tile(codes, (1, ratio))
index_codes = list(index_codes)
if isinstance(stack_cols, MultiIndex):
column_levels = ordered_stack_cols.levels
column_codes = ordered_stack_cols.drop_duplicates().codes
else:
column_levels = [ordered_stack_cols.unique()]
column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]]
column_codes = [np.repeat(codes, len(frame)) for codes in column_codes]
result.index = MultiIndex(
levels=index_levels + column_levels,
codes=index_codes + column_codes,
names=frame.index.names + list(ordered_stack_cols.names),
verify_integrity=False,
)

# sort result, but faster than calling sort_index since we know the order we need
len_df = len(frame)
n_uniques = len(ordered_stack_cols_unique)
indexer = np.arange(n_uniques)
idxs = np.tile(len_df * indexer, len_df) + np.repeat(np.arange(len_df), n_uniques)
result = result.take(idxs)

# Reshape/rename if needed and dropna
if result.ndim == 2 and frame.columns.nlevels == len(level):
if len(result.columns) == 0:
result = Series(index=result.index)
else:
result = result.iloc[:, 0]
if result.ndim == 1:
result.name = None
if dropna:
result = result.dropna(how="all")

return result
57 changes: 26 additions & 31 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,10 +1080,9 @@ def test_stack_full_multiIndex(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize("labels", [list("yxz"), list("yxy")])
def test_stack_preserve_categorical_dtype(self, ordered, labels):
def test_stack_preserve_categorical_dtype(self, ordered):
# GH13854
cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered)
cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered)
df = DataFrame([[10, 11, 12]], columns=cidx)
result = df.stack()

Expand All @@ -1110,8 +1109,10 @@ def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data):
df = DataFrame([sorted(data)], columns=midx)
result = df.stack([0, 1])

s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered)
expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2]))
s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
expected = Series(
sorted(data), index=MultiIndex.from_product([[0], s_cidx, cidx2])
)

tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -1311,7 +1312,7 @@ def test_stack_timezone_aware_values():
def test_stack_empty_frame(dropna):
# GH 36113
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
expected = Series(index=MultiIndex(levels=levels, codes=[[], []]))
result = DataFrame(dtype=np.float64).stack(dropna=dropna)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -1568,17 +1569,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data):
@pytest.mark.parametrize(
"idx, columns, exp_idx",
[
[
list("abab"),
["1st", "2nd", "3rd"],
MultiIndex(
levels=[["a", "b"], ["1st", "2nd", "3rd"]],
codes=[
np.tile(np.arange(2).repeat(3), 2),
np.tile(np.arange(3), 4),
],
),
],
[
list("abab"),
["1st", "2nd", "1st"],
Expand Down Expand Up @@ -1608,12 +1598,9 @@ def test_stack_duplicate_index(self, idx, columns, exp_idx):
index=idx,
columns=columns,
)
result = df.stack()
expected = Series(np.arange(12), index=exp_idx)
tm.assert_series_equal(result, expected)
assert result.index.is_unique is False
li, ri = result.index, expected.index
tm.assert_index_equal(li, ri)
msg = "Columns with duplicate values are not supported in stack"
with pytest.raises(ValueError, match=msg):
df.stack()

def test_unstack_odd_failure(self):
data = """day,time,smoker,sum,len
Expand Down Expand Up @@ -2199,8 +2186,14 @@ def test_stack_nan_in_multiindex_columns(self):
result = df.stack(2)
expected = DataFrame(
[[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]],
index=Index([(0, None), (0, 0), (0, 1)]),
columns=Index([(0, None), (0, 2), (0, 3)]),
index=MultiIndex(
levels=[[0], [0.0, 1.0]],
codes=[[0, 0, 0], [-1, 0, 1]],
),
columns=MultiIndex(
levels=[[0], [2, 3]],
codes=[[0, 0, 0], [-1, 0, 1]],
),
)
tm.assert_frame_equal(result, expected)

Expand All @@ -2218,20 +2211,20 @@ def test_multi_level_stack_categorical(self):
expected = DataFrame(
[
[0, np.nan],
[np.nan, 2],
[1, np.nan],
[np.nan, 2],
[np.nan, 3],
[4, np.nan],
[np.nan, 6],
[5, np.nan],
[np.nan, 6],
[np.nan, 7],
],
columns=["A", "B"],
index=MultiIndex.from_arrays(
[
[0] * 4 + [1] * 4,
pd.Categorical(list("aabbaabb")),
pd.Categorical(list("cdcdcdcd")),
pd.Categorical(list("abababab")),
pd.Categorical(list("ccddccdd")),
]
),
)
Expand All @@ -2251,8 +2244,10 @@ def test_stack_nan_level(self):
expected = DataFrame(
[[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]],
columns=Index(["A", "B"], name="Upper"),
index=MultiIndex.from_tuples(
[(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"]
index=MultiIndex(
levels=[[0, 1], [np.nan, "b"]],
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=["Num", "Lower"],
),
)
tm.assert_frame_equal(result, expected)
Expand Down