Skip to content

Commit a6f352c

Browse files
committed
BUG: construct MultiIndex identically from levels/labels when concatting
closes pandas-dev#15622 closes pandas-dev#15687 closes pandas-dev#14015 closes pandas-dev#13431
1 parent 37e5f78 commit a6f352c

File tree

13 files changed

+267
-24
lines changed

13 files changed

+267
-24
lines changed

asv_bench/benchmarks/timeseries.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,10 @@ def setup(self):
292292
self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
293293
self.ts3 = Series(1, index=self.rng3)
294294

295-
def time_sort_index(self):
295+
def time_sort_index_monotonic(self):
296+
self.ts2.sort_index()
297+
298+
def time_sort_index_non_monotonic(self):
296299
self.ts.sort_index()
297300

298301
def time_timeseries_slice_minutely(self):

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ Performance Improvements
785785
- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
786786
- Improved performance when using ``.unstack()`` (:issue:`15503`)
787787
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
788-
788+
- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)
789789

790790
.. _whatsnew_0200.bug_fixes:
791791

@@ -818,6 +818,7 @@ Bug Fixes
818818
- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
819819
- Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`)
820820

821+
- Bug in ``DataFrame.sort_index()`` that would not sort a lexsorted, but non-monotonic, ``MultiIndex`` (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
821822

822823
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
823824

pandas/core/frame.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -3365,6 +3365,10 @@ def sort(self, columns=None, axis=0, ascending=True, inplace=False,
33653365
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33663366
kind='quicksort', na_position='last', sort_remaining=True,
33673367
by=None):
3368+
3369+
# TODO: this can be combined with Series.sort_index impl as
3370+
# almost identical
3371+
33683372
inplace = validate_bool_kwarg(inplace, 'inplace')
33693373
# 10726
33703374
if by is not None:
@@ -3378,8 +3382,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33783382
axis = self._get_axis_number(axis)
33793383
labels = self._get_axis(axis)
33803384

3381-
# sort by the index
3382-
if level is not None:
3385+
if level:
33833386

33843387
new_axis, indexer = labels.sortlevel(level, ascending=ascending,
33853388
sort_remaining=sort_remaining)
@@ -3389,17 +3392,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33893392

33903393
# make sure that the axis is lexsorted to start
33913394
# if not we need to reconstruct to get the correct indexer
3392-
if not labels.is_lexsorted():
3393-
labels = MultiIndex.from_tuples(labels.values)
3395+
labels = labels._reconstruct_as_sorted()
33943396

33953397
indexer = lexsort_indexer(labels.labels, orders=ascending,
33963398
na_position=na_position)
33973399
else:
33983400
from pandas.core.sorting import nargsort
33993401

3400-
# GH11080 - Check monotonic-ness before sort an index
3401-
# if monotonic (already sorted), return None or copy() according
3402-
# to 'inplace'
3402+
# Check monotonic-ness before sorting an index
3403+
# GH11080
34033404
if ((ascending and labels.is_monotonic_increasing) or
34043405
(not ascending and labels.is_monotonic_decreasing)):
34053406
if inplace:
@@ -3410,8 +3411,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
34103411
indexer = nargsort(labels, kind=kind, ascending=ascending,
34113412
na_position=na_position)
34123413

3414+
baxis = self._get_block_manager_axis(axis)
34133415
new_data = self._data.take(indexer,
3414-
axis=self._get_block_manager_axis(axis),
3416+
axis=baxis,
34153417
convert=False, verify=False)
34163418

34173419
if inplace:

pandas/core/groupby.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1808,6 +1808,13 @@ def get_group_levels(self):
18081808
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
18091809
}
18101810

1811+
def _is_builtin_func(self, arg):
1812+
"""
1813+
if we define a builtin function for this argument, return it,
1814+
otherwise return the arg
1815+
"""
1816+
return SelectionMixin._builtin_table.get(arg, arg)
1817+
18111818
def _get_cython_function(self, kind, how, values, is_numeric):
18121819

18131820
dtype_str = values.dtype.name
@@ -2033,7 +2040,7 @@ def _aggregate_series_fast(self, obj, func):
20332040
# avoids object / Series creation overhead
20342041
dummy = obj._get_values(slice(None, 0)).to_dense()
20352042
indexer = get_group_index_sorter(group_index, ngroups)
2036-
obj = obj.take(indexer, convert=False)
2043+
obj = obj.take(indexer, convert=False).to_dense()
20372044
group_index = algorithms.take_nd(
20382045
group_index, indexer, allow_fill=False)
20392046
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,

pandas/core/reshape.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from pandas.sparse.libsparse import IntIndex
2323

2424
from pandas.core.categorical import Categorical, _factorize_from_iterable
25-
from pandas.core.sorting import (get_group_index, compress_group_index,
26-
decons_obs_group_ids)
25+
from pandas.core.sorting import (get_group_index, get_compressed_ids,
26+
compress_group_index, decons_obs_group_ids)
2727

2828
import pandas.core.algorithms as algos
2929
from pandas._libs import algos as _algos, reshape as _reshape
@@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
494494
return unstacker.get_result()
495495

496496

497-
def get_compressed_ids(labels, sizes):
498-
ids = get_group_index(labels, sizes, sort=True, xnull=False)
499-
return compress_group_index(ids, sort=True)
500-
501-
502497
def stack(frame, level=-1, dropna=True):
503498
"""
504499
Convert DataFrame to Series with multi-level Index. Columns become the

pandas/core/series.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -1756,17 +1756,31 @@ def _try_kind_sort(arr):
17561756
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
17571757
kind='quicksort', na_position='last', sort_remaining=True):
17581758

1759+
# TODO: this can be combined with DataFrame.sort_index impl as
1760+
# almost identical
17591761
inplace = validate_bool_kwarg(inplace, 'inplace')
17601762
axis = self._get_axis_number(axis)
17611763
index = self.index
1762-
if level is not None:
1764+
1765+
if level:
17631766
new_index, indexer = index.sortlevel(level, ascending=ascending,
17641767
sort_remaining=sort_remaining)
17651768
elif isinstance(index, MultiIndex):
17661769
from pandas.core.sorting import lexsort_indexer
1767-
indexer = lexsort_indexer(index.labels, orders=ascending)
1770+
labels = index._reconstruct_as_sorted()
1771+
indexer = lexsort_indexer(labels.labels, orders=ascending)
17681772
else:
17691773
from pandas.core.sorting import nargsort
1774+
1775+
# Check monotonic-ness before sorting an index
1776+
# GH11080
1777+
if ((ascending and index.is_monotonic_increasing) or
1778+
(not ascending and index.is_monotonic_decreasing)):
1779+
if inplace:
1780+
return
1781+
else:
1782+
return self.copy()
1783+
17701784
indexer = nargsort(index, kind=kind, ascending=ascending,
17711785
na_position=na_position)
17721786

pandas/core/sorting.py

+5
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def maybe_lift(lab, size): # promote nan values
9393
return loop(list(labels), list(shape))
9494

9595

96+
def get_compressed_ids(labels, sizes):
97+
ids = get_group_index(labels, sizes, sort=True, xnull=False)
98+
return compress_group_index(ids, sort=True)
99+
100+
96101
def is_int64_overflow_possible(shape):
97102
the_prod = long(1)
98103
for x in shape:

pandas/indexes/multi.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -1175,9 +1175,40 @@ def from_product(cls, iterables, sortorder=None, names=None):
11751175

11761176
labels, levels = _factorize_from_iterables(iterables)
11771177
labels = cartesian_product(labels)
1178+
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11781179

1179-
return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
1180-
names=names)
1180+
def _reconstruct_as_sorted(self):
1181+
"""
1182+
reconstruct the MultiIndex, such that we are
1183+
monotonically sorted; this will also ensure that
1184+
we are lexsorted
1185+
"""
1186+
if self.is_lexsorted() and self.is_monotonic:
1187+
return self
1188+
1189+
new_levels = []
1190+
new_labels = []
1191+
for lev, lab in zip(self.levels, self.labels):
1192+
1193+
if lev.is_monotonic:
1194+
new_levels.append(lev)
1195+
new_labels.append(lab)
1196+
continue
1197+
1198+
# indexer to reorder the levels
1199+
indexer = lev.argsort()
1200+
lev = lev.take(indexer)
1201+
1202+
# indexer to reorder the labels
1203+
ri = lib.get_reverse_indexer(indexer, len(indexer))
1204+
lab = algos.take_1d(ri, lab)
1205+
1206+
new_levels.append(lev)
1207+
new_labels.append(lab)
1208+
1209+
return MultiIndex(new_levels, new_labels,
1210+
names=self.names, sortorder=self.sortorder,
1211+
verify_integrity=False)
11811212

11821213
@property
11831214
def nlevels(self):

pandas/tests/indexes/test_multi.py

+45
Original file line numberDiff line numberDiff line change
@@ -2411,6 +2411,51 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414+
def test_reconstruct_as_sorted(self):
2415+
2416+
# starts off lexsorted & monotonic
2417+
mi = MultiIndex.from_arrays([
2418+
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2419+
])
2420+
assert mi.is_lexsorted()
2421+
assert mi.is_monotonic
2422+
2423+
recons = mi._reconstruct_as_sorted()
2424+
assert recons.is_lexsorted()
2425+
assert recons.is_monotonic
2426+
assert mi is recons
2427+
2428+
assert mi.equals(recons)
2429+
assert Index(mi.values).equals(Index(recons.values))
2430+
2431+
# cannot convert to lexsorted
2432+
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
2433+
('x', 'b'), ('y', 'a'), ('z', 'b')],
2434+
names=['one', 'two'])
2435+
assert not mi.is_lexsorted()
2436+
assert not mi.is_monotonic
2437+
2438+
recons = mi._reconstruct_as_sorted()
2439+
assert not recons.is_lexsorted()
2440+
assert not recons.is_monotonic
2441+
2442+
assert mi.equals(recons)
2443+
assert Index(mi.values).equals(Index(recons.values))
2444+
2445+
# cannot convert to lexsorted
2446+
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
2447+
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
2448+
names=['col1', 'col2'])
2449+
assert not mi.is_lexsorted()
2450+
assert not mi.is_monotonic
2451+
2452+
recons = mi._reconstruct_as_sorted()
2453+
assert not recons.is_lexsorted()
2454+
assert not recons.is_monotonic
2455+
2456+
assert mi.equals(recons)
2457+
assert Index(mi.values).equals(Index(recons.values))
2458+
24142459
def test_isin(self):
24152460
values = [('foo', 2), ('bar', 3), ('quux', 4)]
24162461

pandas/tests/series/test_analytics.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,7 @@ def test_unstack(self):
16321632
labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
16331633
expected = DataFrame({'bar': s.values},
16341634
index=exp_index).sort_index(level=0)
1635-
unstacked = s.unstack(0)
1635+
unstacked = s.unstack(0).sort_index()
16361636
assert_frame_equal(unstacked, expected)
16371637

16381638
# GH5873

pandas/tests/test_multilevel.py

+110
Original file line numberDiff line numberDiff line change
@@ -2449,6 +2449,30 @@ def test_getitem_slice_not_sorted(self):
24492449
expected = df.reindex(columns=df.columns[:3])
24502450
tm.assert_frame_equal(result, expected)
24512451

2452+
def test_frame_getitem_not_sorted2(self):
2453+
# 13431
2454+
df = DataFrame({'col1': ['b', 'd', 'b', 'a'],
2455+
'col2': [3, 1, 1, 2],
2456+
'data': ['one', 'two', 'three', 'four']})
2457+
2458+
df2 = df.set_index(['col1', 'col2'])
2459+
df2_original = df2.copy()
2460+
2461+
df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
2462+
df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True)
2463+
assert not df2.index.is_lexsorted()
2464+
assert not df2.index.is_monotonic
2465+
2466+
assert df2_original.index.equals(df2.index)
2467+
expected = df2.sort_index()
2468+
assert not expected.index.is_lexsorted()
2469+
assert expected.index.is_monotonic
2470+
2471+
result = df2.sort_index(level=0)
2472+
assert not result.index.is_lexsorted()
2473+
assert result.index.is_monotonic
2474+
tm.assert_frame_equal(result, expected)
2475+
24522476
def test_frame_getitem_not_sorted(self):
24532477
df = self.frame.T
24542478
df['foo', 'four'] = 'foo'
@@ -2485,3 +2509,89 @@ def test_series_getitem_not_sorted(self):
24852509
expected.index = expected.index.droplevel(0)
24862510
tm.assert_series_equal(result, expected)
24872511
tm.assert_series_equal(result2, expected)
2512+
2513+
def test_sort_index_and_reconstruction(self):
2514+
2515+
# 15622
2516+
# lexsortedness should be identical
2517+
# across MultiIndex construction methods
2518+
2519+
df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
2520+
expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
2521+
index=MultiIndex.from_tuples([(0.5, 'a'),
2522+
(0.5, 'b'),
2523+
(0.8, 'a'),
2524+
(0.8, 'b')]))
2525+
assert expected.index.is_lexsorted()
2526+
2527+
result = DataFrame(
2528+
[[1, 1], [2, 2], [1, 1], [2, 2]],
2529+
index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
2530+
result = result.sort_index()
2531+
assert result.index.is_lexsorted()
2532+
assert result.index.is_monotonic
2533+
2534+
tm.assert_frame_equal(result, expected)
2535+
2536+
result = DataFrame(
2537+
[[1, 1], [2, 2], [1, 1], [2, 2]],
2538+
index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
2539+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2540+
result = result.sort_index()
2541+
assert result.index.is_lexsorted()
2542+
2543+
tm.assert_frame_equal(result, expected)
2544+
2545+
concatted = pd.concat([df, df], keys=[0.8, 0.5])
2546+
result = concatted.sort_index()
2547+
2548+
# this will be monotonic, but not lexsorted!
2549+
assert not result.index.is_lexsorted()
2550+
assert result.index.is_monotonic
2551+
2552+
tm.assert_frame_equal(result, expected)
2553+
2554+
# 14015
2555+
df = DataFrame([[1, 2], [6, 7]],
2556+
columns=MultiIndex.from_tuples(
2557+
[(0, '20160811 12:00:00'),
2558+
(0, '20160809 12:00:00')],
2559+
names=['l1', 'Date']))
2560+
2561+
df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
2562+
level=1,
2563+
inplace=True)
2564+
assert not df.columns.is_lexsorted()
2565+
assert not df.columns.is_monotonic
2566+
result = df.sort_index(axis=1)
2567+
assert result.columns.is_lexsorted()
2568+
assert result.columns.is_monotonic
2569+
result = df.sort_index(axis=1, level=1)
2570+
assert result.columns.is_lexsorted()
2571+
assert result.columns.is_monotonic
2572+
2573+
def test_sort_index_reorder_on_ops(self):
2574+
# 15687
2575+
df = pd.DataFrame(
2576+
np.random.randn(8, 2),
2577+
index=MultiIndex.from_product(
2578+
[['a', 'b'],
2579+
['big', 'small'],
2580+
['red', 'blu']],
2581+
names=['letter', 'size', 'color']),
2582+
columns=['near', 'far'])
2583+
df = df.sort_index()
2584+
2585+
def my_func(group):
2586+
group.index = ['newz', 'newa']
2587+
return group
2588+
2589+
result = df.groupby(level=['letter', 'size']).apply(
2590+
my_func).sort_index()
2591+
expected = MultiIndex.from_product(
2592+
[['a', 'b'],
2593+
['big', 'small'],
2594+
['newa', 'newz']],
2595+
names=['letter', 'size', None])
2596+
2597+
tm.assert_index_equal(result.index, expected)

0 commit comments

Comments
 (0)