Skip to content

Commit ca20ea9

Browse files
authored
PERF: future_stack=True with non-MultiIndex columns (#58817)
* PERF: stack on non-MultiIndex columns * WIP * Use reshape instead of ravel * arrays -> blocks * Update test * whatsnew
1 parent f8d6296 commit ca20ea9

File tree

3 files changed

+34
-14
lines changed

3 files changed

+34
-14
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,7 @@ Performance improvements
623623
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
624624
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
625625
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
626+
- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame's columns are not a :class:`MultiIndex` (:issue:`58391`)
626627
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
627628
- Performance improvement in :meth:`to_hdf` by avoiding unnecessary reopenings of the HDF5 file, which speeds up adding data to files with a very large number of groups (:issue:`58248`)
628629
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)

pandas/core/reshape/reshape.py

+24-13
Original file line numberDiff line numberDiff line change
@@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
936936
[k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
937937
)
938938

939-
result = stack_reshape(frame, level, set_levels, stack_cols)
939+
result: Series | DataFrame
940+
if not isinstance(frame.columns, MultiIndex):
941+
# GH#58817 Fast path when we're stacking the columns of a non-MultiIndex.
942+
# When columns are homogeneous EAs, we pass through object
943+
# dtype but this is still slightly faster than the normal path.
944+
if len(frame.columns) > 0 and frame._is_homogeneous_type:
945+
dtype = frame._mgr.blocks[0].dtype
946+
else:
947+
dtype = None
948+
result = frame._constructor_sliced(
949+
frame._values.reshape(-1, order="F"), dtype=dtype
950+
)
951+
else:
952+
result = stack_reshape(frame, level, set_levels, stack_cols)
940953

941954
# Construct the correct MultiIndex by combining the frame's index and
942955
# stacked columns.
@@ -1018,6 +1031,8 @@ def stack_reshape(
10181031
-------
10191032
The data behind the stacked DataFrame.
10201033
"""
1034+
# non-MultiIndex takes a fast path.
1035+
assert isinstance(frame.columns, MultiIndex)
10211036
# If we need to drop `level` from columns, it needs to be in descending order
10221037
drop_levnums = sorted(level, reverse=True)
10231038

@@ -1027,18 +1042,14 @@ def stack_reshape(
10271042
if len(frame.columns) == 1:
10281043
data = frame.copy(deep=False)
10291044
else:
1030-
if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple):
1031-
# GH#57750 - if the frame is an Index with tuples, .loc below will fail
1032-
column_indexer = idx
1033-
else:
1034-
# Take the data from frame corresponding to this idx value
1035-
if len(level) == 1:
1036-
idx = (idx,)
1037-
gen = iter(idx)
1038-
column_indexer = tuple(
1039-
next(gen) if k in set_levels else slice(None)
1040-
for k in range(frame.columns.nlevels)
1041-
)
1045+
# Take the data from frame corresponding to this idx value
1046+
if len(level) == 1:
1047+
idx = (idx,)
1048+
gen = iter(idx)
1049+
column_indexer = tuple(
1050+
next(gen) if k in set_levels else slice(None)
1051+
for k in range(frame.columns.nlevels)
1052+
)
10421053
data = frame.loc[:, column_indexer]
10431054

10441055
if len(level) < frame.columns.nlevels:

pandas/tests/extension/base/reshaping.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.core.dtypes.dtypes import NumpyEADtype
7+
68
import pandas as pd
79
import pandas._testing as tm
810
from pandas.api.extensions import ExtensionArray
@@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack):
266268
expected = expected.astype(object)
267269

268270
if isinstance(expected, pd.Series):
269-
assert result.dtype == df.iloc[:, 0].dtype
271+
if future_stack and isinstance(data.dtype, NumpyEADtype):
272+
# GH#58817 future_stack=True constructs the result specifying the dtype
273+
# using the dtype of the input; we thus get the underlying
274+
# NumPy dtype as the result instead of the NumpyExtensionArray
275+
assert result.dtype == df.iloc[:, 0].to_numpy().dtype
276+
else:
277+
assert result.dtype == df.iloc[:, 0].dtype
270278
else:
271279
assert all(result.dtypes == df.iloc[:, 0].dtype)
272280

0 commit comments

Comments
 (0)