From 5338e66e4d8828574b7d5d55396ea34b7e9d2cd3 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Sep 2022 05:36:45 -0400 Subject: [PATCH 01/12] use lexsort_indexer in MultiIndex.argsort --- pandas/core/indexes/multi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 13c3b9200371f..0fb7adc71c3a5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2226,9 +2226,9 @@ def append(self, other): def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: if len(args) == 0 and len(kwargs) == 0: - # np.lexsort is significantly faster than self._values.argsort() - values = [self._get_level_values(i) for i in reversed(range(self.nlevels))] - return np.lexsort(values) + # lexsort is significantly faster than self._values.argsort() + target = self._sort_levels_monotonic() + return lexsort_indexer(target._get_codes_for_sorting()) return self._values.argsort(*args, **kwargs) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) From bd50149511202aab272525b6b5bcfb6577b9ff31 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Sep 2022 06:07:02 -0400 Subject: [PATCH 02/12] add test --- .../tests/indexing/multiindex/test_sorted.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 2214aaa9cfbdb..41307894d4eb1 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -2,9 +2,11 @@ import pytest from pandas import ( + NA, DataFrame, MultiIndex, Series, + array, ) import pandas._testing as tm @@ -86,6 +88,25 @@ def test_sort_values_key(self): tm.assert_frame_equal(result, expected) + def test_sort_values_with_na(self): + arrays = [ + array([2, 1, NA], dtype="Int64"), + array([1, 2, 3], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + index = index.sort_values() + result = DataFrame(range(3), index=index) + + arrays = [ + array([1, 2, NA], dtype="Int64"), + array([2, 1, 3], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + index = index.sort_values() + expected = DataFrame(range(3), index=index) + + tm.assert_frame_equal(result, expected) + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T From 911ded08fab89ac9ca6b10f7bd6b5809cd440bd0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Sep 2022 06:19:05 -0400 Subject: [PATCH 03/12] whatsnew --- doc/source/whatsnew/v1.6.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index ee5085fd9ad89..bd3bdc1bf9a9a 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -167,6 +167,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`) - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) - From 8d3fdad9eea57a871ab91f750d856b70241b5ef5 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 12 Sep 2022 20:03:07 -0400 Subject: [PATCH 04/12] add argsort test --- pandas/tests/indexing/multiindex/test_sorted.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 41307894d4eb1..3e22a4fd722f1 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -88,9 +88,19 @@ def test_sort_values_key(self): tm.assert_frame_equal(result, expected) + def test_argsort_with_na(self): + arrays = [ + array([2, NA, 1], dtype="Int64"), + array([1, 2, 3], dtype="Int64"), + ] + index = MultiIndex.from_arrays(arrays) + result = index.argsort() + expected = np.array([2, 0, 1]) + tm.assert_numpy_array_equal(result, expected) + def test_sort_values_with_na(self): arrays = [ - array([2, 1, NA], dtype="Int64"), + array([2, NA, 1], dtype="Int64"), array([1, 2, 3], dtype="Int64"), ] index = MultiIndex.from_arrays(arrays) @@ -99,10 +109,9 @@ def test_sort_values_with_na(self): arrays = [ array([1, 2, NA], dtype="Int64"), - array([2, 1, 3], dtype="Int64"), + array([3, 1, 2], dtype="Int64"), ] index = MultiIndex.from_arrays(arrays) - index = index.sort_values() expected = DataFrame(range(3), index=index) tm.assert_frame_equal(result, expected) From adb0e8b7bb3b0773a5f36bfd3c8a12cd787e114f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 13 Sep 2022 21:13:27 -0400 Subject: [PATCH 05/12] remove runtime warning expectation from test --- pandas/tests/indexes/multi/test_setops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index ce310a75e8e45..237181b8be5fb 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -379,8 +379,7 @@ def test_union_sort_other_incomparable(): idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - with tm.assert_produces_warning(RuntimeWarning): - result = idx.union(idx[:1]) + result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=False From 7de6268189efa8ddde074259fa80da13a0314971 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 14 Sep 2022 05:43:52 -0400 Subject: [PATCH 06/12] fix test --- pandas/tests/indexing/multiindex/test_sorted.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 3e22a4fd722f1..0bc5b6958f07c 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -95,7 +95,7 @@ def test_argsort_with_na(self): ] index = MultiIndex.from_arrays(arrays) result = index.argsort() - expected = np.array([2, 0, 1]) + expected = np.array([2, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) def test_sort_values_with_na(self): From 80047dcab6c9da77329a59b8efa38c45f6494108 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 14 Sep 2022 05:48:19 -0400 Subject: [PATCH 07/12] optionally raise if values are uncomparable --- pandas/core/indexes/multi.py | 7 ++++--- pandas/tests/indexes/multi/test_setops.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b440f176bdb9d..d54be65ea9992 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1933,7 +1933,7 @@ def _lexsort_depth(self) -> int: return self.sortorder return _lexsort_depth(self.codes, self.nlevels) - def _sort_levels_monotonic(self) -> MultiIndex: + def _sort_levels_monotonic(self, raise_if_unsortable=False) -> MultiIndex: """ This is an *internal* function. @@ -1980,7 +1980,8 @@ def _sort_levels_monotonic(self) -> MultiIndex: # indexer to reorder the levels indexer = lev.argsort() except TypeError: - pass + if raise_if_unsortable: + raise else: lev = lev.take(indexer) @@ -2227,7 +2228,7 @@ def append(self, other): def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: if len(args) == 0 and len(kwargs) == 0: # lexsort is significantly faster than self._values.argsort() - target = self._sort_levels_monotonic() + target = self._sort_levels_monotonic(raise_if_unsortable=True) return lexsort_indexer(target._get_codes_for_sorting()) return self._values.argsort(*args, **kwargs) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 237181b8be5fb..ce310a75e8e45 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -379,7 +379,8 @@ def test_union_sort_other_incomparable(): idx = MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None - result = idx.union(idx[:1]) + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) tm.assert_index_equal(result, idx) # sort=False From d4916a4d0b32c7d24a39e748802869231074f9a2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 18 Sep 2022 17:51:35 -0400 Subject: [PATCH 08/12] add test --- pandas/tests/indexes/multi/test_sorting.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 6fd1781beeda4..c73bc51215aae 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -280,3 +280,12 @@ def test_remove_unused_levels_with_nan(): result = idx.levels expected = FrozenList([["a", np.nan], [4]]) assert str(result) == str(expected) + + +def test_sort_values_nan(): + midx = MultiIndex(levels=[["A", "B", "C"], ["D"]], codes=[[1, 0, 2], [-1, -1, 0]]) + result = midx.sort_values() + expected = MultiIndex( + levels=[["A", "B", "C"], ["D"]], codes=[[0, 1, 2], [-1, -1, 0]] + ) + tm.assert_index_equal(result, expected) From ef974f3ab04ed82067cf96cd4d15311d14bce732 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 20 Sep 2022 05:42:55 -0400 Subject: [PATCH 09/12] add test --- pandas/core/indexes/multi.py | 6 +++--- pandas/tests/indexes/multi/test_sorting.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4b1743444db7d..f89c920ba6095 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1940,7 +1940,7 @@ def _lexsort_depth(self) -> int: return self.sortorder return _lexsort_depth(self.codes, self.nlevels) - def _sort_levels_monotonic(self, raise_if_unsortable: bool = False) -> MultiIndex: + def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIndex: """ This is an *internal* function. @@ -1987,7 +1987,7 @@ def _sort_levels_monotonic(self, raise_if_unsortable: bool = False) -> MultiInde # indexer to reorder the levels indexer = lev.argsort() except TypeError: - if raise_if_unsortable: + if raise_if_incomparable: raise else: lev = lev.take(indexer) @@ -2235,7 +2235,7 @@ def append(self, other): def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: if len(args) == 0 and len(kwargs) == 0: # lexsort is significantly faster than self._values.argsort() - target = self._sort_levels_monotonic(raise_if_unsortable=True) + target = self._sort_levels_monotonic(raise_if_incomparable=True) return lexsort_indexer(target._get_codes_for_sorting()) return self._values.argsort(*args, **kwargs) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index c73bc51215aae..03ee630a2037f 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -14,6 +14,7 @@ Index, MultiIndex, RangeIndex, + Timestamp, ) import pandas._testing as tm from pandas.core.indexes.frozen import FrozenList @@ -289,3 +290,20 @@ def test_sort_values_nan(): levels=[["A", "B", "C"], ["D"]], codes=[[0, 1, 2], [-1, -1, 0]] ) tm.assert_index_equal(result, expected) + + +def test_sort_values_monotonic_incomparable(): + mi = MultiIndex.from_arrays( + [ + [1, Timestamp("2000-01-01")], + [3, 4], + ] + ) + + match = "'<' not supported between instances of 'Timestamp' and 'int'" + with pytest.raises(TypeError, match=match): + mi._sort_levels_monotonic(raise_if_incomparable=True) + + result = mi._sort_levels_monotonic() + expected = mi + tm.assert_index_equal(result, expected) From 28cbf70475c803b50c81cec45a4bb04d3ad560e6 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 20 Sep 2022 06:20:31 -0400 Subject: [PATCH 10/12] update test --- pandas/tests/indexes/multi/test_sorting.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 03ee630a2037f..3f2d727f7b069 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -292,18 +292,13 @@ def test_sort_values_nan(): tm.assert_index_equal(result, expected) -def test_sort_values_monotonic_incomparable(): +def test_sort_values_incomparable(): mi = MultiIndex.from_arrays( [ [1, Timestamp("2000-01-01")], [3, 4], ] ) - match = "'<' not supported between instances of 'Timestamp' and 'int'" with pytest.raises(TypeError, match=match): - mi._sort_levels_monotonic(raise_if_incomparable=True) - - result = mi._sort_levels_monotonic() - expected = mi - tm.assert_index_equal(result, expected) + mi.sort_values() From 528b3440832e83511c04395e6c4ba65f90ad876f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 23 Sep 2022 18:54:34 -0400 Subject: [PATCH 11/12] add gh refs --- pandas/tests/indexes/multi/test_sorting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 3f2d727f7b069..29919abf6600f 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -284,6 +284,7 @@ def test_remove_unused_levels_with_nan(): def test_sort_values_nan(): + # GH48495, GH48626 midx = MultiIndex(levels=[["A", "B", "C"], ["D"]], codes=[[1, 0, 2], [-1, -1, 0]]) result = midx.sort_values() expected = MultiIndex( From d03627517f38ee27a970cf80045d1aeeb93786bb Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 6 Oct 2022 06:26:16 -0400 Subject: [PATCH 12/12] add gh refs --- pandas/tests/indexes/multi/test_sorting.py | 1 + pandas/tests/indexing/multiindex/test_sorted.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 29919abf6600f..3f364473270fb 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -294,6 +294,7 @@ def test_sort_values_nan(): def test_sort_values_incomparable(): + # GH48495 mi = MultiIndex.from_arrays( [ [1, Timestamp("2000-01-01")], diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 0bc5b6958f07c..1d2fd6586337b 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -89,6 +89,7 @@ def test_sort_values_key(self): tm.assert_frame_equal(result, expected) def test_argsort_with_na(self): + # GH48495 arrays = [ array([2, NA, 1], dtype="Int64"), array([1, 2, 3], dtype="Int64"), @@ -99,6 +100,7 @@ def test_argsort_with_na(self): tm.assert_numpy_array_equal(result, expected) def test_sort_values_with_na(self): + # GH48495 arrays = [ array([2, NA, 1], dtype="Int64"), array([1, 2, 3], dtype="Int64"),