diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 12d35288d1ee6..e6a114b491670 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -28,6 +28,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) - @@ -108,6 +109,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is backed by an extension dtype (:issue:`51574`) - Performance improvement in :meth:`read_orc` when reading a remote URI file path. (:issue:`51609`) +- Performance improvement in :meth:`MultiIndex.sortlevel` when ``ascending`` is a list (:issue:`51612`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index acebe8a498f03..4f89686e5a820 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1893,7 +1893,11 @@ def _get_level_number(self, level) -> int: return 0 def sortlevel( - self, level=None, ascending: bool | list[bool] = True, sort_remaining=None + self, + level=None, + ascending: bool | list[bool] = True, + sort_remaining=None, + na_position: str_t = "first", ): """ For internal compatibility with the Index API. @@ -1904,6 +1908,11 @@ def sortlevel( ---------- ascending : bool, default True False to sort in descending order + na_position : {'first' or 'last'}, default 'first' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. 
+ + .. versionadded:: 2.1.0 level, sort_remaining are compat parameters @@ -1925,7 +1934,9 @@ def sortlevel( if not isinstance(ascending, bool): raise TypeError("ascending must be a bool value") - return self.sort_values(return_indexer=True, ascending=ascending) + return self.sort_values( + return_indexer=True, ascending=ascending, na_position=na_position + ) def _get_level_values(self, level) -> Index: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b381752818ba0..857d1fb3f6554 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -94,7 +94,6 @@ from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( get_group_index, - indexer_from_factorized, lexsort_indexer, ) @@ -2367,6 +2366,7 @@ def sortlevel( level: IndexLabel = 0, ascending: bool | list[bool] = True, sort_remaining: bool = True, + na_position: str = "first", ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: """ Sort MultiIndex at the requested level. @@ -2383,6 +2383,11 @@ def sortlevel( False to sort in descending order. Can also be a list to specify a directed ordering. sort_remaining : sort by the remaining levels after level + na_position : {'first' or 'last'}, default 'first' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. 
versionadded:: 2.1.0 Returns ------- @@ -2428,40 +2433,21 @@ def sortlevel( ] sortorder = None + codes = [self.codes[lev] for lev in level] # we have a directed ordering via ascending if isinstance(ascending, list): if not len(level) == len(ascending): raise ValueError("level must have same length as ascending") - - indexer = lexsort_indexer( - [self.codes[lev] for lev in level], orders=ascending + elif sort_remaining: + codes.extend( + [self.codes[lev] for lev in range(len(self.levels)) if lev not in level] ) - - # level ordering else: - codes = list(self.codes) - shape = list(self.levshape) - - # partition codes and shape - primary = tuple(codes[lev] for lev in level) - primshp = tuple(shape[lev] for lev in level) - - # Reverse sorted to retain the order of - # smaller indices that needs to be removed - for lev in sorted(level, reverse=True): - codes.pop(lev) - shape.pop(lev) - - if sort_remaining: - primary += primary + tuple(codes) - primshp += primshp + tuple(shape) - else: - sortorder = level[0] - - indexer = indexer_from_factorized(primary, primshp, compress=False) + sortorder = level[0] - if not ascending: - indexer = indexer[::-1] + indexer = lexsort_indexer( + codes, orders=ascending, na_position=na_position, codes_given=True + ) indexer = ensure_platform_int(indexer) new_codes = [level_codes.take(indexer) for level_codes in self.codes] diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 970c9998f5f5d..1430dd22c32db 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -83,11 +83,14 @@ def get_indexer_indexer( if level is not None: _, indexer = target.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining + level, + ascending=ascending, + sort_remaining=sort_remaining, + na_position=na_position, ) elif isinstance(target, ABCMultiIndex): indexer = lexsort_indexer( - target._get_codes_for_sorting(), orders=ascending, na_position=na_position + target.codes, orders=ascending, na_position=na_position, 
codes_given=True ) else: # Check monotonic-ness before sort an index (GH 11080) @@ -302,7 +305,11 @@ def indexer_from_factorized( def lexsort_indexer( - keys, orders=None, na_position: str = "last", key: Callable | None = None + keys, + orders=None, + na_position: str = "last", + key: Callable | None = None, + codes_given: bool = False, ) -> npt.NDArray[np.intp]: """ Performs lexical sorting on a set of keys @@ -321,6 +328,8 @@ def lexsort_indexer( Determines placement of NA elements in the sorted list ("last" or "first") key : Callable, optional Callable key function applied to every element in keys before sorting + codes_given : bool, default False + Avoid categorical materialization if codes are already provided. Returns ------- @@ -338,15 +347,27 @@ def lexsort_indexer( keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): - cat = Categorical(k, ordered=True) - if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") - n = len(cat.categories) - codes = cat.codes.copy() + if codes_given: + mask = k == -1 + codes = k.copy() + n = len(codes) + mask_n = n + if mask.any(): + n -= 1 + + else: + cat = Categorical(k, ordered=True) + n = len(cat.categories) + codes = cat.codes.copy() + mask = cat.codes == -1 + if mask.any(): + mask_n = n + 1 + else: + mask_n = n - mask = cat.codes == -1 if order: # ascending if na_position == "last": codes = np.where(mask, n, codes) @@ -357,10 +378,8 @@ def lexsort_indexer( codes = np.where(mask, n, n - codes - 1) elif na_position == "first": codes = np.where(mask, 0, n - codes) - if mask.any(): - n += 1 - shape.append(n) + shape.append(mask_n) labels.append(codes) return indexer_from_factorized(labels, tuple(shape)) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 8384fcb4de5aa..a47a988da65c5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -907,3 
+907,10 @@ def test_sort_index_multiindex_sparse_column(self): result = expected.sort_index(level=0) tm.assert_frame_equal(result, expected) + + def test_sort_index_na_position(self): + # GH#51612 + df = DataFrame([1, 2], index=MultiIndex.from_tuples([(1, 1), (1, pd.NA)])) + expected = df.copy() + result = df.sort_index(level=[0, 1], na_position="last") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 2746c12b120ef..9392ff9368c6a 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -76,6 +76,14 @@ def test_sortlevel_deterministic(): assert sorted_idx.equals(expected[::-1]) +def test_sortlevel_na_position(): + # GH#51612 + midx = MultiIndex.from_tuples([(1, np.nan), (1, 1)]) + result = midx.sortlevel(level=[0, 1], na_position="last")[0] + expected = MultiIndex.from_tuples([(1, 1), (1, np.nan)]) + tm.assert_index_equal(result, expected) + + def test_numpy_argsort(idx): result = np.argsort(idx) expected = idx.argsort() diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 6dbe61decfc73..cdd3c2c2754b9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1262,6 +1262,13 @@ def test_sortlevel(self): result = index.sortlevel(ascending=False) tm.assert_index_equal(result[0], expected) + def test_sortlevel_na_position(self): + # GH#51612 + idx = Index([1, np.nan]) + result = idx.sortlevel(na_position="first")[0] + expected = Index([np.nan, 1]) + tm.assert_index_equal(result, expected) + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ