From 976d4f35b27218b03ab0e379ecee5ede46469007 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 May 2023 10:43:06 -0700 Subject: [PATCH 1/7] Start adding sort to unstack --- pandas/core/frame.py | 6 ++- pandas/core/reshape/reshape.py | 55 ++++++++++++++++-------- pandas/core/series.py | 8 +++- pandas/tests/frame/test_stack_unstack.py | 17 ++++++++ 4 files changed, 65 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 19564afc41b49..f11514dfb46dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9298,7 +9298,7 @@ def explode( return result.__finalize__(self, method="explode") - def unstack(self, level: Level = -1, fill_value=None): + def unstack(self, level: Level = -1, fill_value=None, sort: bool = True): """ Pivot a level of the (necessarily hierarchical) index labels. @@ -9314,6 +9314,8 @@ def unstack(self, level: Level = -1, fill_value=None): Level(s) of index to unstack, can pass level name. fill_value : int, str or dict Replace NaN with this value if the unstack produces missing values. + sort: bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
Returns ------- @@ -9361,7 +9363,7 @@ def unstack(self, level: Level = -1, fill_value=None): """ from pandas.core.reshape.reshape import unstack - result = unstack(self, level, fill_value) + result = unstack(self, level, fill_value, sort) return result.__finalize__(self, method="unstack") diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 65fd9137313f1..6955678f28d6f 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -28,6 +28,7 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos +from pandas.core.algorithms import unique from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame @@ -101,8 +102,11 @@ class _Unstacker: unstacked : DataFrame """ - def __init__(self, index: MultiIndex, level: Level, constructor) -> None: + def __init__( + self, index: MultiIndex, level: Level, constructor, sort: bool = True + ) -> None: self.constructor = constructor + self.sort = sort self.index = index.remove_unused_levels() @@ -118,11 +122,15 @@ def __init__(self, index: MultiIndex, level: Level, constructor) -> None: self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + if not self.sort: + unique_codes = unique(self.index.codes[self.level]) + self.removed_level = self.removed_level.take(unique_codes) + self.removed_level_full = self.removed_level.take(unique_codes) # Bug fix GH 20601 # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. 
- # We want to check and raise an error before this happens + # We want to check and raise a warning before this happens num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size @@ -163,13 +171,17 @@ def _indexer_and_to_sort( @cache_readonly def sorted_labels(self) -> list[np.ndarray]: indexer, to_sort = self._indexer_and_to_sort - return [line.take(indexer) for line in to_sort] + if self.sort: + return [line.take(indexer) for line in to_sort] + return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - indexer, _ = self._indexer_and_to_sort + if self.sort: + indexer, _ = self._indexer_and_to_sort - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values + return values def _make_selectors(self): new_levels = self.new_index_levels @@ -358,7 +370,12 @@ def _repeater(self) -> np.ndarray: @cache_readonly def new_index(self) -> MultiIndex: # Does not depend on values or value_columns - result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] + if self.sort: + result_codes = [ + lab.take(self.compressor) for lab in self.sorted_labels[:-1] + ] + else: + result_codes = self.sorted_labels[:-1] # construct the new index if len(self.new_index_levels) == 1: @@ -375,7 +392,9 @@ def new_index(self) -> MultiIndex: ) -def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None): +def _unstack_multiple( + data: Series | DataFrame, clocs, fill_value=None, sort: bool = True +): if len(clocs) == 0: return data @@ -420,7 +439,7 @@ def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None): dummy = data.copy() dummy.index = dummy_index - unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort) new_levels = clevels new_names = cnames new_codes = recons_codes @@ -429,7
+448,7 @@ def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None): result = data while clocs: val = clocs.pop(0) - result = result.unstack(val, fill_value=fill_value) + result = result.unstack(val, fill_value=fill_value, sort=sort) clocs = [v if v < val else v - 1 for v in clocs] return result @@ -438,7 +457,9 @@ def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None): dummy_df = data.copy(deep=False) dummy_df.index = dummy_index - unstacked = dummy_df.unstack("__placeholder__", fill_value=fill_value) + unstacked = dummy_df.unstack( + "__placeholder__", fill_value=fill_value, sort=sort + ) if isinstance(unstacked, Series): unstcols = unstacked.index else: @@ -463,12 +484,12 @@ def _unstack_multiple(data: Series | DataFrame, clocs, fill_value=None): return unstacked -def unstack(obj: Series | DataFrame, level, fill_value=None): +def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, # and isn't needed for a single level - return _unstack_multiple(obj, level, fill_value=fill_value) + return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort) else: level = level[0] @@ -478,9 +499,9 @@ def unstack(obj: Series | DataFrame, level, fill_value=None): if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): - return _unstack_frame(obj, level, fill_value=fill_value) + return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(dropna=False) + return obj.T.stack(dropna=False, sort=sort) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose @@ -490,9 +511,9 @@ def unstack(obj: Series | DataFrame, level, fill_value=None): ) else: if is_1d_only_ea_dtype(obj.dtype): - return _unstack_extension_series(obj, level, fill_value) + return _unstack_extension_series(obj, level, fill_value, sort=sort) unstacker = 
_Unstacker( - obj.index, level=level, constructor=obj._constructor_expanddim + obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort ) return unstacker.get_result( obj._values, value_columns=None, fill_value=fill_value diff --git a/pandas/core/series.py b/pandas/core/series.py index 569a95aff1de8..00dc6fcdcd11d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4238,7 +4238,9 @@ def explode(self, ignore_index: bool = False) -> Series: return self._constructor(values, index=index, name=self.name, copy=False) - def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFrame: + def unstack( + self, level: IndexLabel = -1, fill_value: Hashable = None, sort: bool = True + ) -> DataFrame: """ Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. @@ -4248,6 +4250,8 @@ def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFr Level(s) to unstack, can pass level name. fill_value : scalar value, default None Value to use when replacing NaN values. + sort: bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
Returns ------- @@ -4282,7 +4286,7 @@ def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFr """ from pandas.core.reshape.reshape import unstack - return unstack(self, level, fill_value) + return unstack(self, level, fill_value, sort) # ---------------------------------------------------------------------- # function application diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 889c44522f7bb..78cb127926823 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1206,6 +1206,23 @@ def test_unstack_swaplevel_sortlevel(self, level): tm.assert_frame_equal(result, expected) +def test_unstack_series_sort_false(): + # GH 15105 + index = MultiIndex.from_tuples( + [("one", "z", "b"), ("one", "y", "a"), ("two", "z", "b"), ("two", "y", "a")] + ) + ser = Series(np.arange(1.0, 5.0), index=index) + result = ser.unstack(level=-1, sort=False) + expected = DataFrame( + [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]], + columns=["b", "a"], + index=MultiIndex.from_tuples( + [("one", "z"), ("one", "y"), ("two", "z"), ("two", "y")] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
data = Series(["a", "b", "c", "a"], dtype="object") From b2ab54cf079abfc2d3864ab2d700026ed9d74005 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 May 2023 15:04:55 -0700 Subject: [PATCH 2/7] Fix compressor --- pandas/core/reshape/reshape.py | 12 +++++------- pandas/tests/frame/test_stack_unstack.py | 12 ++++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6955678f28d6f..2a9a22a0c45d0 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -206,7 +206,10 @@ def _make_selectors(self): self.group_index = comp_index self.mask = mask - self.compressor = comp_index.searchsorted(np.arange(ngroups)) + if self.sort: + self.compressor = comp_index.searchsorted(np.arange(ngroups)) + else: + self.compressor = np.sort(np.unique(comp_index, return_index=True)[1]) @cache_readonly def mask_all(self) -> bool: @@ -370,12 +373,7 @@ def _repeater(self) -> np.ndarray: @cache_readonly def new_index(self) -> MultiIndex: # Does not depend on values or value_columns - if self.sort: - result_codes = [ - lab.take(self.compressor) for lab in self.sorted_labels[:-1] - ] - else: - result_codes = self.sorted_labels[:-1] + result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] # construct the new index if len(self.new_index_levels) == 1: diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 78cb127926823..dc8884205ccef 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1209,7 +1209,7 @@ def test_unstack_swaplevel_sortlevel(self, level): def test_unstack_series_sort_false(): # GH 15105 index = MultiIndex.from_tuples( - [("one", "z", "b"), ("one", "y", "a"), ("two", "z", "b"), ("two", "y", "a")] + [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) ser = 
Series(np.arange(1.0, 5.0), index=index) result = ser.unstack(level=-1, sort=False) @@ -1217,11 +1217,19 @@ def test_unstack_series_sort_false(): [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]], columns=["b", "a"], index=MultiIndex.from_tuples( - [("one", "z"), ("one", "y"), ("two", "z"), ("two", "y")] + [("two", "z"), ("two", "y"), ("one", "z"), ("one", "y")] ), ) tm.assert_frame_equal(result, expected) + result = ser.unstack(level=[1, 2], sort=False) + expected = DataFrame( + [[1.0, 2.0], [3.0, 4.0]], + index=["two", "one"], + columns=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + ) + tm.assert_frame_equal(result, expected) + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. From af44b3ddf68f2302d4ed8e1b167435644d30d496 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 May 2023 16:48:49 -0700 Subject: [PATCH 3/7] Add testing for dataframe --- pandas/core/reshape/reshape.py | 18 ++++++++++++----- pandas/tests/frame/test_stack_unstack.py | 25 ++++++++++++++++++------ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2a9a22a0c45d0..c6160da2ff854 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -125,7 +125,7 @@ def __init__( if not self.sort: unique_codes = unique(self.index.codes[self.level]) self.removed_level = self.removed_level.take(unique_codes) - self.removed_level_full = self.removed_level.take(unique_codes) + self.removed_level_full = self.removed_level_full.take(unique_codes) # Bug fix GH 20601 # If the data frame is too big, the number of unique index combination @@ -518,9 +518,13 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): ) -def _unstack_frame(obj: DataFrame, level, fill_value=None) -> DataFrame: +def _unstack_frame( + obj: DataFrame, level, fill_value=None, sort: bool = True +) -> DataFrame: 
assert isinstance(obj.index, MultiIndex) # checked by caller - unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor) + unstacker = _Unstacker( + obj.index, level=level, constructor=obj._constructor, sort=sort + ) if not obj._can_fast_transpose: mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) @@ -531,7 +535,9 @@ def _unstack_frame(obj: DataFrame, level, fill_value=None) -> DataFrame: ) -def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame: +def _unstack_extension_series( + series: Series, level, fill_value, sort: bool +) -> DataFrame: """ Unstack an ExtensionArray-backed Series. @@ -547,6 +553,8 @@ def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame: The user-level (not physical storage) fill value to use for missing values introduced by the reshape. Passed to ``series.values.take``. + sort : bool + Whether to sort the resulting MultiIndex levels Returns ------- @@ -556,7 +564,7 @@ def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame: """ # Defer to the logic in ExtensionBlock._unstack df = series.to_frame() - result = df.unstack(level=level, fill_value=fill_value) + result = df.unstack(level=level, fill_value=fill_value, sort=sort) # equiv: result.droplevel(level=0, axis=1) # but this avoids an extra copy diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index dc8884205ccef..7903103561a59 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1206,27 +1206,40 @@ def test_unstack_swaplevel_sortlevel(self, level): tm.assert_frame_equal(result, expected) -def test_unstack_series_sort_false(): +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_unstack_sort_false(frame_or_series, dtype): # GH 15105 index = MultiIndex.from_tuples( [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")] ) - ser = Series(np.arange(1.0, 5.0),
index=index) - result = ser.unstack(level=-1, sort=False) + obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype) + result = obj.unstack(level=-1, sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "b"), (0, "a")]) + else: + expected_columns = ["b", "a"] expected = DataFrame( [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]], - columns=["b", "a"], + columns=expected_columns, index=MultiIndex.from_tuples( [("two", "z"), ("two", "y"), ("one", "z"), ("one", "y")] ), + dtype=dtype, ) tm.assert_frame_equal(result, expected) - result = ser.unstack(level=[1, 2], sort=False) + result = obj.unstack(level=[1, 2], sort=False) + + if frame_or_series is DataFrame: + expected_columns = MultiIndex.from_tuples([(0, "z", "b"), (0, "y", "a")]) + else: + expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")]) expected = DataFrame( [[1.0, 2.0], [3.0, 4.0]], index=["two", "one"], - columns=MultiIndex.from_tuples([("z", "b"), ("y", "a")]), + columns=expected_columns, + dtype=dtype, ) tm.assert_frame_equal(result, expected) From 9973c69326a66f70a9e18b17994e8597fc2739f1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 May 2023 16:53:21 -0700 Subject: [PATCH 4/7] Add whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e1ac9e3309de7..76c522260fd26 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -97,6 +97,7 @@ Other enhancements - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) +- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting 
:class:`MultiIndex` levels are sorted (:issue:`15105`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) From 83746d88792c0597d1b089b36ae3dc595170e9af Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 May 2023 11:55:25 -0700 Subject: [PATCH 5/7] remove sort for now --- pandas/core/reshape/reshape.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c6160da2ff854..0e37018596c14 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -499,7 +499,9 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - return obj.T.stack(dropna=False, sort=sort) + # TODO: Add sort=sort once https://github.com/pandas-dev/pandas/pull/53282 + # is merged + return obj.T.stack(dropna=False) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose From 20ebf7803cefb81b4f2e3099c2431951e5bdd485 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 19 May 2023 14:16:54 -0700 Subject: [PATCH 6/7] Fix param --- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d404bd0b9aa5c..eb34fcca5c617 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9313,7 +9313,7 @@ def unstack(self, level: Level = -1, fill_value=None, sort: bool = True): Level(s) of index to unstack, can pass level name. fill_value : int, str or dict Replace NaN with this value if the unstack produces missing values. 
- sort: bool, default True + sort : bool, default True Sort the level(s) in the resulting MultiIndex columns. Returns diff --git a/pandas/core/series.py b/pandas/core/series.py index b868e264ff117..cb3edbfd6cfc3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4286,7 +4286,7 @@ def unstack( Level(s) to unstack, can pass level name. fill_value : scalar value, default None Value to use when replacing NaN values. - sort: bool, default True + sort : bool, default True Sort the level(s) in the resulting MultiIndex columns. Returns From 9d4dc644515ce313e98dec6c4137995f9e12f760 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 May 2023 13:52:52 -0700 Subject: [PATCH 7/7] Add sort=sort --- pandas/core/reshape/reshape.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2338ebe6ba4e2..3866d30e9c757 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -499,9 +499,7 @@ def unstack(obj: Series | DataFrame, level, fill_value=None, sort: bool = True): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value, sort=sort) else: - # TODO: Add sort=sort once https://github.com/pandas-dev/pandas/pull/53282 - # is merged - return obj.T.stack(dropna=False) + return obj.T.stack(dropna=False, sort=sort) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose