From 1ba1486845eff5a9767c64859e82ed5e34f1947d Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 19 May 2024 18:11:50 -0400 Subject: [PATCH 1/6] PERF: stack on non-MultiIndex columns --- pandas/core/reshape/reshape.py | 156 ++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 60 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 01cc85ceff181..16196ef92e417 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -925,75 +925,28 @@ def _reorder_for_extension_array_stack( def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if frame.columns.nunique() != len(frame.columns): raise ValueError("Columns with duplicate values are not supported in stack") - - # If we need to drop `level` from columns, it needs to be in descending order set_levels = set(level) - drop_levnums = sorted(level, reverse=True) stack_cols = frame.columns._drop_level_numbers( [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) - if len(level) > 1: - # Arrange columns in the order we want to take them, e.g. 
level=[2, 0, 1] - sorter = np.argsort(level) - assert isinstance(stack_cols, MultiIndex) - ordered_stack_cols = stack_cols._reorder_ilevels(sorter) - else: - ordered_stack_cols = stack_cols - - stack_cols_unique = stack_cols.unique() - ordered_stack_cols_unique = ordered_stack_cols.unique() - - # Grab data for each unique index to be stacked - buf = [] - for idx in stack_cols_unique: - if len(frame.columns) == 1: - data = frame.copy() - else: - if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): - # GH#57750 - if the frame is an Index with tuples, .loc below will fail - column_indexer = idx - else: - # Take the data from frame corresponding to this idx value - if len(level) == 1: - idx = (idx,) - gen = iter(idx) - column_indexer = tuple( - next(gen) if k in set_levels else slice(None) - for k in range(frame.columns.nlevels) - ) - data = frame.loc[:, column_indexer] - - if len(level) < frame.columns.nlevels: - data.columns = data.columns._drop_level_numbers(drop_levnums) - elif stack_cols.nlevels == 1: - if data.ndim == 1: - data.name = 0 - else: - data.columns = RangeIndex(len(data.columns)) - buf.append(data) result: Series | DataFrame - if len(buf) > 0 and not frame.empty: - result = concat(buf, ignore_index=True) - ratio = len(result) // len(frame) - else: - # input is empty - if len(level) < frame.columns.nlevels: - # concat column order may be different from dropping the levels - new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not isinstance(frame.columns, MultiIndex): + # Fast path when we're stacking the columns of a non-MultiIndex. + # When columns are homogeneous EAs, we pass through object + # dtype but this is still faster than the normal path. 
+ if len(frame.columns) > 0 and frame._is_homogeneous_type: + dtype = frame._mgr.arrays[0].dtype else: - new_columns = [0] - result = DataFrame(columns=new_columns, dtype=frame._values.dtype) - ratio = 0 - - if len(level) < frame.columns.nlevels: - # concat column order may be different from dropping the levels - desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() - if not result.columns.equals(desired_columns): - result = result[desired_columns] + dtype = None + result = Series(frame._values.ravel(order="F"), dtype=dtype) + else: + result = stack_reshape(frame, level, set_levels, stack_cols) # Construct the correct MultiIndex by combining the frame's index and # stacked columns. + ratio = 0 if frame.empty else len(result) // len(frame) + index_levels: list | FrozenList if isinstance(frame.index, MultiIndex): index_levels = frame.index.levels @@ -1002,12 +955,22 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: codes, uniques = factorize(frame.index, use_na_sentinel=False) index_levels = [uniques] index_codes = list(np.tile(codes, (1, ratio))) + + if len(level) > 1: + # Arrange columns in the order we want to take them, e.g. 
level=[2, 0, 1] + sorter = np.argsort(level) + assert isinstance(stack_cols, MultiIndex) + ordered_stack_cols = stack_cols._reorder_ilevels(sorter) + else: + ordered_stack_cols = stack_cols + ordered_stack_cols_unique = ordered_stack_cols.unique() if isinstance(ordered_stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels column_codes = ordered_stack_cols.drop_duplicates().codes else: - column_levels = [ordered_stack_cols.unique()] + column_levels = [ordered_stack_cols_unique] column_codes = [factorize(ordered_stack_cols_unique, use_na_sentinel=False)[0]] + # error: Incompatible types in assignment (expression has type "list[ndarray[Any, # dtype[Any]]]", variable has type "FrozenList") column_codes = [np.repeat(codes, len(frame)) for codes in column_codes] # type: ignore[assignment] @@ -1035,3 +998,76 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: result.name = None return result + + +def stack_reshape( + frame: DataFrame, level: list[int], set_levels: set[int], stack_cols: Index +) -> Series | DataFrame: + """Reshape the data of a frame for stack. + + This function takes care of most of the work that stack needs to do. Caller + will sort the result once the appropriate index is set. + + Parameters + ---------- + frame: DataFrame + DataFrame that is to be stacked. + level: list of ints. + Levels of the columns to stack. + set_levels: set of ints. + Same as level, but as a set. + stack_cols: Index. + Columns of the result when the DataFrame is stacked. + + Returns + ------- + The data behind the stacked DataFrame. + """ + # non-MultiIndex takes a fast path. 
+ assert isinstance(frame.columns, MultiIndex) + # If we need to drop `level` from columns, it needs to be in descending order + drop_levnums = sorted(level, reverse=True) + + # Grab data for each unique index to be stacked + buf = [] + for idx in stack_cols.unique(): + if len(frame.columns) == 1: + data = frame.copy() + else: + # Take the data from frame corresponding to this idx value + if len(level) == 1: + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in set_levels else slice(None) + for k in range(frame.columns.nlevels) + ) + data = frame.loc[:, column_indexer] + + if len(level) < frame.columns.nlevels: + data.columns = data.columns._drop_level_numbers(drop_levnums) + elif stack_cols.nlevels == 1: + if data.ndim == 1: + data.name = 0 + else: + data.columns = RangeIndex(len(data.columns)) + buf.append(data) + + if len(buf) > 0 and not frame.empty: + result = concat(buf, ignore_index=True) + else: + # input is empty + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + new_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + else: + new_columns = [0] + result = DataFrame(columns=new_columns, dtype=frame._values.dtype) + + if len(level) < frame.columns.nlevels: + # concat column order may be different from dropping the levels + desired_columns = frame.columns._drop_level_numbers(drop_levnums).unique() + if not result.columns.equals(desired_columns): + result = result[desired_columns] + + return result From bf5d60bcb4a559446ff2cfb88d23d69cb4b53ec7 Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 20 May 2024 21:32:05 -0400 Subject: [PATCH 2/6] WIP --- pandas/core/reshape/reshape.py | 2 +- pandas/tests/extension/base/reshaping.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 16196ef92e417..9ec0be22164e0 100644 --- a/pandas/core/reshape/reshape.py +++ 
b/pandas/core/reshape/reshape.py @@ -939,7 +939,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: dtype = frame._mgr.arrays[0].dtype else: dtype = None - result = Series(frame._values.ravel(order="F"), dtype=dtype) + result = frame._constructor_sliced(frame._values.ravel(order="F"), dtype=dtype) else: result = stack_reshape(frame, level, set_levels, stack_cols) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 489cd15644d04..4d3358115d9eb 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -266,7 +266,7 @@ def test_stack(self, data, columns, future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype + assert result.dtype == df.iloc[:, 0].dtype, f'{type(result.dtype), result.dtype} vs {type(df.iloc[:, 0].dtype), df.iloc[:, 0].dtype}' else: assert all(result.dtypes == df.iloc[:, 0].dtype) From ea4d614003c5b9b41058e300a65ab7e294750464 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 23 May 2024 22:07:05 -0400 Subject: [PATCH 3/6] Use reshape instead of ravel --- pandas/core/reshape/reshape.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9ec0be22164e0..4b05f724922c3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -939,7 +939,9 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: dtype = frame._mgr.arrays[0].dtype else: dtype = None - result = frame._constructor_sliced(frame._values.ravel(order="F"), dtype=dtype) + result = frame._constructor_sliced( + frame._values.reshape(-1, order="F"), dtype=dtype + ) else: result = stack_reshape(frame, level, set_levels, stack_cols) From 348fbebc3634693cbb267efeb029066ae83b3aa8 Mon Sep 17 00:00:00 2001 From: richard Date: Thu, 23 May 2024 22:46:40 -0400 Subject: [PATCH 4/6] arrays 
-> blocks --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 76ecccfd7a4ad..d94069032c541 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -942,7 +942,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: # When columns are homogeneous EAs, we pass through object # dtype but this is still faster than the normal path. if len(frame.columns) > 0 and frame._is_homogeneous_type: - dtype = frame._mgr.arrays[0].dtype + dtype = frame._mgr.blocks[0].dtype else: dtype = None result = frame._constructor_sliced( From de694868e5e87b798c1dfcf3cde3e4aa254a2774 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 2 Jun 2024 10:07:03 -0400 Subject: [PATCH 5/6] Update test --- pandas/core/reshape/reshape.py | 2 +- pandas/tests/extension/base/reshaping.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d94069032c541..09d84255c04ba 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -940,7 +940,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: if not isinstance(frame.columns, MultiIndex): # Fast path when we're stacking the columns of a non-MultiIndex. # When columns are homogeneous EAs, we pass through object - # dtype but this is still faster than the normal path. + # dtype but this is still slightly faster than the normal path. 
if len(frame.columns) > 0 and frame._is_homogeneous_type: dtype = frame._mgr.blocks[0].dtype else: diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 4d3358115d9eb..c105a232b5fa4 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import NumpyEADtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import ExtensionArray @@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype, f'{type(result.dtype), result.dtype} vs {type(df.iloc[:, 0].dtype), df.iloc[:, 0].dtype}' + if future_stack and isinstance(data.dtype, NumpyEADtype): + # future_stack=True constructs the result specifying the dtype + # using the dtype of the input; we thus get the underlying + # NumPy dtype as the result instead of the NumpyExtensionArray + assert result.dtype == df.iloc[:, 0].to_numpy().dtype + else: + assert result.dtype == df.iloc[:, 0].dtype else: assert all(result.dtypes == df.iloc[:, 0].dtype) From 8feda0e984c04d7be6e7d1481314990eb0964b8f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 2 Jun 2024 10:13:43 -0400 Subject: [PATCH 6/6] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/reshape.py | 2 +- pandas/tests/extension/base/reshaping.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 865996bdf8892..be89b08d2b3f0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -432,6 +432,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. 
(:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame's columns are not a :class:`MultiIndex` (:issue:`58391`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 09d84255c04ba..5cb2edc1fa912 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -938,7 +938,7 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: result: Series | DataFrame if not isinstance(frame.columns, MultiIndex): - # Fast path when we're stacking the columns of a non-MultiIndex. + # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex. # When columns are homogeneous EAs, we pass through object # dtype but this is still slightly faster than the normal path. 
if len(frame.columns) > 0 and frame._is_homogeneous_type: diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c105a232b5fa4..e6887d80cf8c1 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -269,7 +269,7 @@ def test_stack(self, data, columns, future_stack): if isinstance(expected, pd.Series): if future_stack and isinstance(data.dtype, NumpyEADtype): - # future_stack=True constructs the result specifying the dtype + # GH#58817 future_stack=True constructs the result specifying the dtype # using the dtype of the input; we thus get the underlying # NumPy dtype as the result instead of the NumpyExtensionArray assert result.dtype == df.iloc[:, 0].to_numpy().dtype