From 23c033922e03cfc9c2d66239fe4cc316838effdd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 6 Sep 2022 21:27:50 +0200 Subject: [PATCH 1/3] Backport PR #48412: BUG: safe_sort losing MultiIndex dtypes --- pandas/core/algorithms.py | 28 +++++++++++++++++++++++----- pandas/tests/test_sorting.py | 13 +++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 80fe74ccee583..1aaf389b673cf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,6 +14,7 @@ Sequence, cast, final, + overload, ) import warnings @@ -101,6 +102,7 @@ Categorical, DataFrame, Index, + MultiIndex, Series, ) from pandas.core.arrays import ( @@ -1792,7 +1794,7 @@ def safe_sort( na_sentinel: int = -1, assume_unique: bool = False, verify: bool = True, -) -> np.ndarray | tuple[np.ndarray, np.ndarray]: +) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]: """ Sort ``values`` and reorder corresponding ``codes``. @@ -1821,7 +1823,7 @@ def safe_sort( Returns ------- - ordered : ndarray + ordered : ndarray or MultiIndex Sorted ``values`` new_codes : ndarray Reordered ``codes``; returned when ``codes`` is not None. @@ -1839,6 +1841,7 @@ def safe_sort( raise TypeError( "Only list-like objects are allowed to be passed to safe_sort as values" ) + original_values = values if not isinstance(values, (np.ndarray, ABCExtensionArray)): # don't convert to string types @@ -1850,6 +1853,7 @@ def safe_sort( values = np.asarray(values, dtype=dtype) # type: ignore[arg-type] sorter = None + ordered: np.ndarray | MultiIndex if ( not is_extension_array_dtype(values) @@ -1865,7 +1869,7 @@ def safe_sort( # which would work, but which fails for special case of 1d arrays # with tuples. if values.size and isinstance(values[0], tuple): - ordered = _sort_tuples(values) + ordered = _sort_tuples(values, original_values) else: ordered = _sort_mixed(values) @@ -1927,19 +1931,33 @@ def _sort_mixed(values) -> np.ndarray: ) -def _sort_tuples(values: np.ndarray) -> np.ndarray: +@overload +def _sort_tuples(values: np.ndarray, original_values: np.ndarray) -> np.ndarray: + ... + + +@overload +def _sort_tuples(values: np.ndarray, original_values: MultiIndex) -> MultiIndex: + ... + + +def _sort_tuples( + values: np.ndarray, original_values: np.ndarray | MultiIndex +) -> np.ndarray | MultiIndex: """ Convert array of tuples (1d) to array or array (2d). We need to keep the columns separately as they contain different types and nans (can't use `np.sort` as it may fail when str and nan are mixed in a column as types cannot be compared). + We have to apply the indexer to the original values to keep the dtypes in + case of MultiIndexes """ from pandas.core.internals.construction import to_arrays from pandas.core.sorting import lexsort_indexer arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) - return values[indexer] + return original_values[indexer] def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike: diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 396c4d82d01fc..537792ea8263c 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -11,6 +11,7 @@ ) from pandas import ( + NA, DataFrame, MultiIndex, Series, @@ -510,3 +511,15 @@ def test_mixed_str_nan(): result = safe_sort(values) expected = np.array([np.nan, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) + + +def test_safe_sort_multiindex(): + # GH#48412 + arr1 = Series([2, 1, NA, NA], dtype="Int64") + arr2 = [2, 1, 3, 3] + midx = MultiIndex.from_arrays([arr1, arr2]) + result = safe_sort(midx) + expected = MultiIndex.from_arrays( + [Series([1, 2, NA, NA], dtype="Int64"), [1, 2, 3, 3]] + ) + tm.assert_index_equal(result, expected) From f92af3b7eb53e08372ee6c92454762ebe0383f73 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 9 Oct 2022 22:31:19 +0200 Subject: [PATCH 2/3] Update algorithms.py --- pandas/core/algorithms.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1aaf389b673cf..3954b912005ed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1842,6 +1842,7 @@ def safe_sort( "Only list-like objects are allowed to be passed to safe_sort as values" ) original_values = values + is_mi = isinstance(original_values, ABCMultiIndex) if not isinstance(values, (np.ndarray, ABCExtensionArray)): # don't convert to string types @@ -1863,7 +1864,11 @@ def safe_sort( else: try: sorter = values.argsort() - ordered = values.take(sorter) + if is_mi: + # Operate on original object instead of casted array (MultiIndex) + ordered = original_values.take(sorter) + else: + ordered = values.take(sorter) except TypeError: # Previous sorters failed or were not applicable, try `_sort_mixed` # which would work, but which fails for special case of 1d arrays From 7d9bee5b038776c6ec2ff6c0c9e0efd2376d40ab Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 10 Oct 2022 14:58:01 +0200 Subject: [PATCH 3/3] Update algorithms.py --- pandas/core/algorithms.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3954b912005ed..fc2050dcc7f4d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1865,10 +1865,10 @@ def safe_sort( try: sorter = values.argsort() if is_mi: - # Operate on original object instead of casted array (MultiIndex) - ordered = original_values.take(sorter) - else: - ordered = values.take(sorter) + # Operate on original object instead of casted array (MultiIndex) + ordered = original_values.take(sorter) + else: + ordered = values.take(sorter) except TypeError: # Previous sorters failed or were not applicable, try `_sort_mixed` # which would work, but which fails for special case of 1d arrays