From 66f809e482551f209517d1b69e21fbc6133a0182 Mon Sep 17 00:00:00 2001 From: Line Pedersen Date: Tue, 4 Apr 2017 14:43:04 +0200 Subject: [PATCH] BUG GH14784 na_position doesn't work for sort_index() with MultiIndex --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 3 +- pandas/core/indexes/multi.py | 16 +++++++++ pandas/core/series.py | 4 ++- pandas/tests/test_multilevel.py | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 79 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6b6f532ed2323..63693b4583ff4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -1564,6 +1564,7 @@ Indexing - Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) +- Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`) I/O ^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 153042d4a09c9..7fbfa7962c2c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3352,7 +3352,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer labels = labels._sort_levels_monotonic() - indexer = lexsort_indexer(labels.labels, orders=ascending, + indexer = lexsort_indexer(labels._get_labels_for_sorting(), + orders=ascending, na_position=na_position) else: from pandas.core.sorting import nargsort diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6d9a9aa691f66..92baf9d289cd2 
100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1635,6 +1635,22 @@ def reorder_levels(self, order): def __getslice__(self, i, j): return self.__getitem__(slice(i, j)) + def _get_labels_for_sorting(self): + """ + we are categorizing our labels by using the + available categories (all, not just observed) + excluding any missing ones (-1); this is in preparation + for sorting, where we need to disambiguate that -1 is not + a valid value + """ + from pandas.core.categorical import Categorical + + return [Categorical.from_codes(label, + np.arange(np.array(label).max() + 1, + dtype=label.dtype), + ordered=True) + for label in self.labels] + def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ Sort MultiIndex at the requested level. The result will respect the diff --git a/pandas/core/series.py b/pandas/core/series.py index 8a2351527856d..e0364ad629c5d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1753,7 +1753,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer labels = index._sort_levels_monotonic() - indexer = lexsort_indexer(labels.labels, orders=ascending) + indexer = lexsort_indexer(labels._get_labels_for_sorting(), + orders=ascending, + na_position=na_position) else: from pandas.core.sorting import nargsort diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 24bbf895508d7..f7e7ab6b190b5 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2634,3 +2634,60 @@ def test_sort_non_lexsorted(self): with pytest.raises(UnsortedIndexError): result.loc[pd.IndexSlice['B':'C', 'a':'c'], :] + + def test_sort_index_nan(self): + tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] + mi = MultiIndex.from_tuples(tuples) + + df = DataFrame(np.arange(16).reshape(4, 4), + index=mi, columns=list('ABCD')) + s = Series(np.arange(4), index=mi) + 
+ df2 = DataFrame({ + 'date': pd.to_datetime([ + '20121002', '20121007', '20130130', '20130202', '20130305', + '20121002', '20121207', '20130130', '20130202', '20130305', + '20130202', '20130305' + ]), + 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + 'whole_cost': [1790, np.nan, 280, 259, np.nan, 623, 90, 312, + np.nan, 301, 359, 801], + 'cost': [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12] + }).set_index(['date', 'user_id']) + + # sorting frame, default nan position is last + result = df.sort_index() + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position last + result = df.sort_index(na_position='last') + expected = df.iloc[[3, 0, 2, 1], :] + tm.assert_frame_equal(result, expected) + + # sorting frame, nan position first + result = df.sort_index(na_position='first') + expected = df.iloc[[1, 2, 3, 0], :] + tm.assert_frame_equal(result, expected) + + # sorting frame with removed rows + result = df2.dropna().sort_index() + expected = df2.sort_index().dropna() + tm.assert_frame_equal(result, expected) + + # sorting series, default nan position is last + result = s.sort_index() + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position last + result = s.sort_index(na_position='last') + expected = s.iloc[[3, 0, 2, 1]] + tm.assert_series_equal(result, expected) + + # sorting series, nan position first + result = s.sort_index(na_position='first') + expected = s.iloc[[1, 2, 3, 0]] + tm.assert_series_equal(result, expected) + +