Skip to content

Commit ba6de64

Browse files
jreback authored and pcluo committed
BUG: fix degenerate MultiIndex sorting (pandas-dev#16092)
xref pandas-dev#15694 closes pandas-dev#15797
1 parent e83a6f7 commit ba6de64

File tree

6 files changed

+36
-16
lines changed

6 files changed

+36
-16
lines changed

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ DataFrame.sort_index changes
870870
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
871871

872872
In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
873-
This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
873+
This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`)
874874

875875
This is *unchanged* from prior versions, but shown for illustration purposes:
876876

pandas/core/frame.py

+3
Original file line numberDiff line numberDiff line change
@@ -3364,6 +3364,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33643364
axis=baxis,
33653365
convert=False, verify=False)
33663366

3367+
# reconstruct axis if needed
3368+
new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
3369+
33673370
if inplace:
33683371
return self._update_inplace(new_data)
33693372
else:

pandas/core/indexes/base.py

+4
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,10 @@ def _update_inplace(self, result, **kwargs):
465465
# guard when called from IndexOpsMixin
466466
raise TypeError("Index can't be updated inplace")
467467

468+
def _sort_levels_monotonic(self):
469+
""" Return this index unchanged (compat with MultiIndex); lets callers invoke ``_sort_levels_monotonic()`` on any index without a type check. """
470+
return self
471+
468472
_index_shared_docs['_get_grouper_for_level'] = """
469473
Get index grouper corresponding to an index level
470474

pandas/core/reshape/reshape.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -650,8 +650,15 @@ def _convert_level_number(level_num, columns):
650650
drop_cols = []
651651
for key in unique_groups:
652652
loc = this.columns.get_loc(key)
653-
slice_len = loc.stop - loc.start
653+
654654
# can make more efficient?
655+
# we almost always return a slice
656+
# but if unsorted can get a boolean
657+
# indexer
658+
if not isinstance(loc, slice):
659+
slice_len = len(loc)
660+
else:
661+
slice_len = loc.stop - loc.start
655662

656663
if slice_len == 0:
657664
drop_cols.append(key)

pandas/core/series.py

+1
Original file line numberDiff line numberDiff line change
@@ -1773,6 +1773,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
17731773

17741774
indexer = _ensure_platform_int(indexer)
17751775
new_index = index.take(indexer)
1776+
new_index = new_index._sort_levels_monotonic()
17761777

17771778
new_values = self._values.take(indexer)
17781779
result = self._constructor(new_values, index=new_index)

pandas/tests/test_multilevel.py

+19-14
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from pandas.core.index import Index, MultiIndex
1212
from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp
1313

14-
from pandas.core.common import UnsortedIndexError
1514
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
1615
import pandas.core.common as com
1716
import pandas.util.testing as tm
@@ -938,7 +937,7 @@ def test_stack_mixed_dtype(self):
938937
df = df.sort_index(level=1, axis=1)
939938

940939
stacked = df.stack()
941-
result = df['foo'].stack()
940+
result = df['foo'].stack().sort_index()
942941
tm.assert_series_equal(stacked['foo'], result, check_names=False)
943942
self.assertIs(result.name, None)
944943
self.assertEqual(stacked['bar'].dtype, np.float_)
@@ -2456,11 +2455,11 @@ def test_frame_getitem_not_sorted2(self):
24562455

24572456
assert df2_original.index.equals(df2.index)
24582457
expected = df2.sort_index()
2459-
assert not expected.index.is_lexsorted()
2458+
assert expected.index.is_lexsorted()
24602459
assert expected.index.is_monotonic
24612460

24622461
result = df2.sort_index(level=0)
2463-
assert not result.index.is_lexsorted()
2462+
assert result.index.is_lexsorted()
24642463
assert result.index.is_monotonic
24652464
tm.assert_frame_equal(result, expected)
24662465

@@ -2536,8 +2535,7 @@ def test_sort_index_and_reconstruction(self):
25362535
concatted = pd.concat([df, df], keys=[0.8, 0.5])
25372536
result = concatted.sort_index()
25382537

2539-
# this will be monotonic, but not lexsorted!
2540-
assert not result.index.is_lexsorted()
2538+
assert result.index.is_lexsorted()
25412539
assert result.index.is_monotonic
25422540

25432541
tm.assert_frame_equal(result, expected)
@@ -2576,7 +2574,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
25762574
levels=[['a', 'b'], ['aa', 'bb']],
25772575
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
25782576
result = df.sort_index()
2579-
assert not result.index.is_lexsorted()
2577+
assert result.index.is_lexsorted()
25802578
assert result.index.is_monotonic
25812579

25822580
tm.assert_frame_equal(result, expected)
@@ -2618,22 +2616,29 @@ def my_func(group):
26182616
def test_sort_non_lexsorted(self):
26192617
# degenerate case where we sort but don't
26202618
# have a satisfying result :<
2621-
2619+
# GH 15797
26222620
idx = MultiIndex([['A', 'B', 'C'],
26232621
['c', 'b', 'a']],
26242622
[[0, 1, 2, 0, 1, 2],
26252623
[0, 2, 1, 1, 0, 2]])
26262624

2627-
df = DataFrame({'col': range(len(idx))}, index=idx)
2625+
df = DataFrame({'col': range(len(idx))},
2626+
index=idx,
2627+
dtype='int64')
26282628
assert df.index.is_lexsorted() is False
26292629
assert df.index.is_monotonic is False
26302630

2631-
result = df.sort_index()
2632-
assert result.index.is_lexsorted() is False
2633-
assert result.index.is_monotonic is True
2631+
sorted = df.sort_index()
2632+
assert sorted.index.is_lexsorted() is True
2633+
assert sorted.index.is_monotonic is True
26342634

2635-
with pytest.raises(UnsortedIndexError):
2636-
result.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
2635+
expected = DataFrame(
2636+
{'col': [1, 4, 5, 2]},
2637+
index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
2638+
('C', 'a'), ('C', 'b')]),
2639+
dtype='int64')
2640+
result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
2641+
tm.assert_frame_equal(result, expected)
26372642

26382643
def test_sort_index_nan(self):
26392644
# GH 14784

0 commit comments

Comments
 (0)