Skip to content

Commit ae3777e

Browse files
committed
BUG: construct MultiIndex identically from levels/labels when concatting
closes pandas-dev#15622 closes pandas-dev#15687 closes pandas-dev#14015 closes pandas-dev#13431
1 parent cd24fa9 commit ae3777e

File tree

13 files changed

+375
-24
lines changed

13 files changed

+375
-24
lines changed

asv_bench/benchmarks/timeseries.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,10 @@ def setup(self):
292292
self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
293293
self.ts3 = Series(1, index=self.rng3)
294294

295-
def time_sort_index(self):
295+
def time_sort_index_monotonic(self):
296+
self.ts2.sort_index()
297+
298+
def time_sort_index_non_monotonic(self):
296299
self.ts.sort_index()
297300

298301
def time_timeseries_slice_minutely(self):

doc/source/whatsnew/v0.20.0.txt

+73-1
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,78 @@ If indicated, a deprecation warning will be issued if you reference that module.
689689
"pandas._hash", "pandas.tools.libhash", ""
690690
"pandas._window", "pandas.core.libwindow", ""
691691

692+
.. _whatsnew_0200.api_breaking.sort_index:
693+
694+
DataFrame.sort_index changes
695+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
696+
697+
In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
698+
This would happen with ``lexsorted``, but non-monotonic, levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
699+
700+
This is *unchanged* from prior versions, but is shown for illustration purposes:
701+
702+
.. ipython:: python
703+
704+
df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)]))
705+
df
706+
707+
.. ipython:: python
708+
709+
df.index.is_lexsorted()
710+
df.index.is_monotonic
711+
712+
Sorting works as expected
713+
714+
.. ipython:: python
715+
716+
df.sort_index()
717+
718+
.. ipython:: python
719+
720+
df.sort_index().index.is_lexsorted()
721+
df.sort_index().index.is_monotonic
722+
723+
However, this example, which has a non-monotonic 2nd level, doesn't behave as desired.
724+
725+
.. ipython:: python
726+
df = pd.DataFrame({'value': [1, 2, 3, 4]},
727+
index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
728+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
729+
730+
Previous Behavior:
731+
732+
.. code-block:: ipython
733+
734+
In [11]: df.sort_index()
735+
Out[11]:
736+
value
737+
a bb 1
738+
aa 2
739+
b bb 3
740+
aa 4
741+
742+
In [14]: df.sort_index().index.is_lexsorted()
743+
Out[14]: True
744+
745+
In [15]: df.sort_index().index.is_monotonic
746+
Out[15]: False
747+
748+
New Behavior:
749+
750+
.. ipython:: python
751+
752+
df.sort_index()
753+
df.sort_index().index.is_lexsorted()
754+
df.sort_index().index.is_monotonic
755+
756+
Previous Behavior:
757+
758+
.. code-block:: ipython
759+
760+
New Behavior:
761+
762+
.. ipython:: python
763+
692764

693765
.. _whatsnew_0200.api_breaking.groupby_describe:
694766

@@ -928,7 +1000,7 @@ Performance Improvements
9281000
- Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied
9291001
function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
9301002
- Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`).
931-
1003+
- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)
9321004

9331005
.. _whatsnew_0200.bug_fixes:
9341006

pandas/core/frame.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -3322,6 +3322,10 @@ def trans(v):
33223322
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33233323
kind='quicksort', na_position='last', sort_remaining=True,
33243324
by=None):
3325+
3326+
# TODO: this can be combined with Series.sort_index impl as
3327+
# almost identical
3328+
33253329
inplace = validate_bool_kwarg(inplace, 'inplace')
33263330
# 10726
33273331
if by is not None:
@@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33353339
axis = self._get_axis_number(axis)
33363340
labels = self._get_axis(axis)
33373341

3338-
# sort by the index
3339-
if level is not None:
3342+
if level:
33403343

33413344
new_axis, indexer = labels.sortlevel(level, ascending=ascending,
33423345
sort_remaining=sort_remaining)
@@ -3346,17 +3349,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33463349

33473350
# make sure that the axis is lexsorted to start
33483351
# if not we need to reconstruct to get the correct indexer
3349-
if not labels.is_lexsorted():
3350-
labels = MultiIndex.from_tuples(labels.values)
3352+
labels = labels._reconstruct(sort=True)
33513353

33523354
indexer = lexsort_indexer(labels.labels, orders=ascending,
33533355
na_position=na_position)
33543356
else:
33553357
from pandas.core.sorting import nargsort
33563358

3357-
# GH11080 - Check monotonic-ness before sort an index
3358-
# if monotonic (already sorted), return None or copy() according
3359-
# to 'inplace'
3359+
# Check monotonic-ness before sort an index
3360+
# GH11080
33603361
if ((ascending and labels.is_monotonic_increasing) or
33613362
(not ascending and labels.is_monotonic_decreasing)):
33623363
if inplace:
@@ -3367,8 +3368,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33673368
indexer = nargsort(labels, kind=kind, ascending=ascending,
33683369
na_position=na_position)
33693370

3371+
baxis = self._get_block_manager_axis(axis)
33703372
new_data = self._data.take(indexer,
3371-
axis=self._get_block_manager_axis(axis),
3373+
axis=baxis,
33723374
convert=False, verify=False)
33733375

33743376
if inplace:

pandas/core/groupby.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1882,6 +1882,13 @@ def get_group_levels(self):
18821882
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
18831883
}
18841884

1885+
def _is_builtin_func(self, arg):
1886+
"""
1887+
if we define an builtin function for this argument, return it,
1888+
otherwise return the arg
1889+
"""
1890+
return SelectionMixin._builtin_table.get(arg, arg)
1891+
18851892
def _get_cython_function(self, kind, how, values, is_numeric):
18861893

18871894
dtype_str = values.dtype.name
@@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func):
21072114
# avoids object / Series creation overhead
21082115
dummy = obj._get_values(slice(None, 0)).to_dense()
21092116
indexer = get_group_index_sorter(group_index, ngroups)
2110-
obj = obj.take(indexer, convert=False)
2117+
obj = obj.take(indexer, convert=False).to_dense()
21112118
group_index = algorithms.take_nd(
21122119
group_index, indexer, allow_fill=False)
21132120
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,

pandas/core/reshape.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from pandas.sparse.libsparse import IntIndex
2323

2424
from pandas.core.categorical import Categorical, _factorize_from_iterable
25-
from pandas.core.sorting import (get_group_index, compress_group_index,
26-
decons_obs_group_ids)
25+
from pandas.core.sorting import (get_group_index, get_compressed_ids,
26+
compress_group_index, decons_obs_group_ids)
2727

2828
import pandas.core.algorithms as algos
2929
from pandas._libs import algos as _algos, reshape as _reshape
@@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
494494
return unstacker.get_result()
495495

496496

497-
def get_compressed_ids(labels, sizes):
498-
ids = get_group_index(labels, sizes, sort=True, xnull=False)
499-
return compress_group_index(ids, sort=True)
500-
501-
502497
def stack(frame, level=-1, dropna=True):
503498
"""
504499
Convert DataFrame to Series with multi-level Index. Columns become the

pandas/core/series.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -1751,17 +1751,31 @@ def _try_kind_sort(arr):
17511751
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
17521752
kind='quicksort', na_position='last', sort_remaining=True):
17531753

1754+
# TODO: this can be combined with DataFrame.sort_index impl as
1755+
# almost identical
17541756
inplace = validate_bool_kwarg(inplace, 'inplace')
17551757
axis = self._get_axis_number(axis)
17561758
index = self.index
1757-
if level is not None:
1759+
1760+
if level:
17581761
new_index, indexer = index.sortlevel(level, ascending=ascending,
17591762
sort_remaining=sort_remaining)
17601763
elif isinstance(index, MultiIndex):
17611764
from pandas.core.sorting import lexsort_indexer
1762-
indexer = lexsort_indexer(index.labels, orders=ascending)
1765+
labels = index._reconstruct(sort=True)
1766+
indexer = lexsort_indexer(labels.labels, orders=ascending)
17631767
else:
17641768
from pandas.core.sorting import nargsort
1769+
1770+
# Check monotonic-ness before sort an index
1771+
# GH11080
1772+
if ((ascending and index.is_monotonic_increasing) or
1773+
(not ascending and index.is_monotonic_decreasing)):
1774+
if inplace:
1775+
return
1776+
else:
1777+
return self.copy()
1778+
17651779
indexer = nargsort(index, kind=kind, ascending=ascending,
17661780
na_position=na_position)
17671781

pandas/core/sorting.py

+5
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def maybe_lift(lab, size): # pormote nan values
9393
return loop(list(labels), list(shape))
9494

9595

96+
def get_compressed_ids(labels, sizes):
97+
ids = get_group_index(labels, sizes, sort=True, xnull=False)
98+
return compress_group_index(ids, sort=True)
99+
100+
96101
def is_int64_overflow_possible(shape):
97102
the_prod = long(1)
98103
for x in shape:

pandas/indexes/multi.py

+50-2
Original file line numberDiff line numberDiff line change
@@ -1173,9 +1173,57 @@ def from_product(cls, iterables, sortorder=None, names=None):
11731173

11741174
labels, levels = _factorize_from_iterables(iterables)
11751175
labels = cartesian_product(labels)
1176+
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11761177

1177-
return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
1178-
names=names)
1178+
def _reconstruct(self, sort=False):
1179+
"""
1180+
reconstruct the MultiIndex
1181+
1182+
The MultiIndex will have the same outward appearance (e.g. values)
1183+
and will also .equals()
1184+
1185+
Parameters
1186+
----------
1187+
sort: boolean, default False
1188+
monotonically sort the levels
1189+
1190+
Returns
1191+
-------
1192+
MultiIndex
1193+
1194+
"""
1195+
new_levels = []
1196+
new_labels = []
1197+
1198+
if sort:
1199+
1200+
if self.is_monotonic:
1201+
return self
1202+
1203+
for lev, lab in zip(self.levels, self.labels):
1204+
1205+
if lev.is_monotonic:
1206+
new_levels.append(lev)
1207+
new_labels.append(lab)
1208+
continue
1209+
1210+
# indexer to reorder the levels
1211+
indexer = lev.argsort()
1212+
lev = lev.take(indexer)
1213+
1214+
# indexer to reorder the labels
1215+
ri = lib.get_reverse_indexer(indexer, len(indexer))
1216+
lab = algos.take_1d(ri, lab)
1217+
1218+
new_levels.append(lev)
1219+
new_labels.append(lab)
1220+
1221+
else:
1222+
return self
1223+
1224+
return MultiIndex(new_levels, new_labels,
1225+
names=self.names, sortorder=self.sortorder,
1226+
verify_integrity=False)
11791227

11801228
@property
11811229
def nlevels(self):

pandas/tests/indexes/test_multi.py

+53
Original file line numberDiff line numberDiff line change
@@ -2411,6 +2411,59 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414+
def test_reconstruct_sort(self):
2415+
2416+
# starts off lexsorted & monotonic
2417+
mi = MultiIndex.from_arrays([
2418+
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2419+
])
2420+
assert mi.is_lexsorted()
2421+
assert mi.is_monotonic
2422+
2423+
recons = mi._reconstruct(sort=True)
2424+
assert recons.is_lexsorted()
2425+
assert recons.is_monotonic
2426+
assert mi is recons
2427+
2428+
assert mi.equals(recons)
2429+
assert Index(mi.values).equals(Index(recons.values))
2430+
2431+
recons = mi._reconstruct(sort=False)
2432+
assert recons.is_lexsorted()
2433+
assert recons.is_monotonic
2434+
assert mi is recons
2435+
2436+
assert mi.equals(recons)
2437+
assert Index(mi.values).equals(Index(recons.values))
2438+
2439+
# cannot convert to lexsorted
2440+
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
2441+
('x', 'b'), ('y', 'a'), ('z', 'b')],
2442+
names=['one', 'two'])
2443+
assert not mi.is_lexsorted()
2444+
assert not mi.is_monotonic
2445+
2446+
recons = mi._reconstruct(sort=True)
2447+
assert not recons.is_lexsorted()
2448+
assert not recons.is_monotonic
2449+
2450+
assert mi.equals(recons)
2451+
assert Index(mi.values).equals(Index(recons.values))
2452+
2453+
# cannot convert to lexsorted
2454+
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
2455+
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
2456+
names=['col1', 'col2'])
2457+
assert not mi.is_lexsorted()
2458+
assert not mi.is_monotonic
2459+
2460+
recons = mi._reconstruct(sort=True)
2461+
assert not recons.is_lexsorted()
2462+
assert not recons.is_monotonic
2463+
2464+
assert mi.equals(recons)
2465+
assert Index(mi.values).equals(Index(recons.values))
2466+
24142467
def test_isin(self):
24152468
values = [('foo', 2), ('bar', 3), ('quux', 4)]
24162469

pandas/tests/series/test_analytics.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,7 @@ def test_unstack(self):
16001600
labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
16011601
expected = DataFrame({'bar': s.values},
16021602
index=exp_index).sort_index(level=0)
1603-
unstacked = s.unstack(0)
1603+
unstacked = s.unstack(0).sort_index()
16041604
assert_frame_equal(unstacked, expected)
16051605

16061606
# GH5873

0 commit comments

Comments
 (0)