Skip to content

Commit 72bc7d0

Browse files
committed
BUG: construct MultiIndex identically from levels/labels when concatting
closes pandas-dev#15622 closes pandas-dev#15687 closes pandas-dev#14015 xref pandas-dev#13431
1 parent e7956c4 commit 72bc7d0

File tree

8 files changed

+161
-14
lines changed

8 files changed

+161
-14
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ Bug Fixes
818818
- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
819819
- Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`)
820820

821+
- Bug in ``DataFrame.sort_index()`` that would not sort a lexsorted but non-monotonic ``MultiIndex`` (:issue:`15622`, :issue:`15687`, :issue:`14015`)
821822

822823
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
823824

pandas/core/frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3389,8 +3389,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33893389

33903390
# make sure that the axis is lexsorted to start
33913391
# if not we need to reconstruct to get the correct indexer
3392-
if not labels.is_lexsorted():
3393-
labels = MultiIndex.from_tuples(labels.values)
3392+
labels = labels._reconstruct_as_sorted()
33943393

33953394
indexer = lexsort_indexer(labels.labels, orders=ascending,
33963395
na_position=na_position)
@@ -3410,8 +3409,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
34103409
indexer = nargsort(labels, kind=kind, ascending=ascending,
34113410
na_position=na_position)
34123411

3412+
baxis = self._get_block_manager_axis(axis)
34133413
new_data = self._data.take(indexer,
3414-
axis=self._get_block_manager_axis(axis),
3414+
axis=baxis,
34153415
convert=False, verify=False)
34163416

34173417
if inplace:

pandas/core/groupby.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -1807,6 +1807,13 @@ def get_group_levels(self):
18071807
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
18081808
}
18091809

1810+
def _is_builtin_func(self, arg):
1811+
"""
1812+
if we define a builtin function for this argument, return it,
1813+
otherwise return the arg
1814+
"""
1815+
return SelectionMixin._builtin_table.get(arg, arg)
1816+
18101817
def _get_cython_function(self, kind, how, values, is_numeric):
18111818

18121819
dtype_str = values.dtype.name
@@ -2032,7 +2039,7 @@ def _aggregate_series_fast(self, obj, func):
20322039
# avoids object / Series creation overhead
20332040
dummy = obj._get_values(slice(None, 0)).to_dense()
20342041
indexer = get_group_index_sorter(group_index, ngroups)
2035-
obj = obj.take(indexer, convert=False)
2042+
obj = obj.take(indexer, convert=False).to_dense()
20362043
group_index = algorithms.take_nd(
20372044
group_index, indexer, allow_fill=False)
20382045
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,

pandas/core/reshape.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from pandas.sparse.libsparse import IntIndex
2323

2424
from pandas.core.categorical import Categorical, _factorize_from_iterable
25-
from pandas.core.sorting import (get_group_index, compress_group_index,
26-
decons_obs_group_ids)
25+
from pandas.core.sorting import (get_group_index, get_compressed_ids,
26+
compress_group_index, decons_obs_group_ids)
2727

2828
import pandas.core.algorithms as algos
2929
from pandas._libs import algos as _algos, reshape as _reshape
@@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
494494
return unstacker.get_result()
495495

496496

497-
def get_compressed_ids(labels, sizes):
498-
ids = get_group_index(labels, sizes, sort=True, xnull=False)
499-
return compress_group_index(ids, sort=True)
500-
501-
502497
def stack(frame, level=-1, dropna=True):
503498
"""
504499
Convert DataFrame to Series with multi-level Index. Columns become the

pandas/core/sorting.py

+5
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ def maybe_lift(lab, size): # promote nan values
9393
return loop(list(labels), list(shape))
9494

9595

96+
def get_compressed_ids(labels, sizes):
97+
ids = get_group_index(labels, sizes, sort=True, xnull=False)
98+
return compress_group_index(ids, sort=True)
99+
100+
96101
def is_int64_overflow_possible(shape):
97102
the_prod = long(1)
98103
for x in shape:

pandas/indexes/multi.py

+29-2
Original file line numberDiff line numberDiff line change
@@ -1175,9 +1175,36 @@ def from_product(cls, iterables, sortorder=None, names=None):
11751175

11761176
labels, levels = _factorize_from_iterables(iterables)
11771177
labels = cartesian_product(labels)
1178+
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11781179

1179-
return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
1180-
names=names)
1180+
def _reconstruct_as_sorted(self):
1181+
"""
1182+
reconstruct the MultiIndex, such that we are
1183+
monotonically sorted; this will also ensure that
1184+
we are lexsorted
1185+
"""
1186+
if self.is_lexsorted() and self.is_monotonic:
1187+
return self
1188+
1189+
new_levels = []
1190+
new_labels = []
1191+
for lev, lab in zip(self.levels, self.labels):
1192+
1193+
if lev.is_monotonic:
1194+
new_levels.append(lev)
1195+
new_labels.append(lab)
1196+
continue
1197+
1198+
indexer = lev.argsort()
1199+
lev = lev.take(indexer)
1200+
lab = algos.take_1d(indexer, lab)
1201+
1202+
new_levels.append(lev)
1203+
new_labels.append(lab)
1204+
1205+
return MultiIndex(new_levels, new_labels,
1206+
names=self.names, sortorder=self.sortorder,
1207+
verify_integrity=False)
11811208

11821209
@property
11831210
def nlevels(self):

pandas/tests/test_multilevel.py

+111
Original file line numberDiff line numberDiff line change
@@ -2449,6 +2449,31 @@ def test_getitem_slice_not_sorted(self):
24492449
expected = df.reindex(columns=df.columns[:3])
24502450
tm.assert_frame_equal(result, expected)
24512451

2452+
@pytest.mark.xfail(reason="need axis reconstruction")
2453+
def test_frame_getitem_not_sorted2(self):
2454+
# 13431
2455+
df = DataFrame({'col1': ['b', 'd', 'b', 'a'],
2456+
'col2': [3, 1, 1, 2],
2457+
'data': ['one', 'two', 'three', 'four']})
2458+
2459+
df2 = df.set_index(['col1', 'col2'])
2460+
df2_original = df2.copy()
2461+
2462+
df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
2463+
df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True)
2464+
assert not df2.index.is_lexsorted()
2465+
assert not df2.index.is_monotonic
2466+
2467+
assert df2_original.index.equals(df2.index)
2468+
2469+
result = df2.sort_index()
2470+
assert result.index.is_lexsorted()
2471+
assert result.index.is_monotonic
2472+
2473+
result = df2.sort_index(level=0)
2474+
assert result.index.is_lexsorted()
2475+
assert result.index.is_monotonic
2476+
24522477
def test_frame_getitem_not_sorted(self):
24532478
df = self.frame.T
24542479
df['foo', 'four'] = 'foo'
@@ -2485,3 +2510,89 @@ def test_series_getitem_not_sorted(self):
24852510
expected.index = expected.index.droplevel(0)
24862511
tm.assert_series_equal(result, expected)
24872512
tm.assert_series_equal(result2, expected)
2513+
2514+
def test_sort_index_and_reconstruction(self):
2515+
2516+
# 15622
2517+
# lexsortedness should be identical
2518+
# across MultiIndex construction methods
2519+
2520+
df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
2521+
expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
2522+
index=MultiIndex.from_tuples([(0.5, 'a'),
2523+
(0.5, 'b'),
2524+
(0.8, 'a'),
2525+
(0.8, 'b')]))
2526+
assert expected.index.is_lexsorted()
2527+
2528+
result = DataFrame(
2529+
[[1, 1], [2, 2], [1, 1], [2, 2]],
2530+
index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
2531+
result = result.sort_index()
2532+
assert result.index.is_lexsorted()
2533+
assert result.index.is_monotonic
2534+
2535+
tm.assert_frame_equal(result, expected)
2536+
2537+
result = DataFrame(
2538+
[[1, 1], [2, 2], [1, 1], [2, 2]],
2539+
index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
2540+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2541+
result = result.sort_index()
2542+
assert result.index.is_lexsorted()
2543+
2544+
tm.assert_frame_equal(result, expected)
2545+
2546+
concatted = pd.concat([df, df], keys=[0.8, 0.5])
2547+
result = concatted.sort_index()
2548+
2549+
# this will be monotonic, but not lexsorted!
2550+
assert not result.index.is_lexsorted()
2551+
assert result.index.is_monotonic
2552+
2553+
tm.assert_frame_equal(result, expected)
2554+
2555+
# 14015
2556+
df = DataFrame([[1, 2], [6, 7]],
2557+
columns=MultiIndex.from_tuples(
2558+
[(0, '20160811 12:00:00'),
2559+
(0, '20160809 12:00:00')],
2560+
names=['l1', 'Date']))
2561+
2562+
df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
2563+
level=1,
2564+
inplace=True)
2565+
assert not df.columns.is_lexsorted()
2566+
assert not df.columns.is_monotonic
2567+
result = df.sort_index(axis=1)
2568+
assert result.columns.is_lexsorted()
2569+
assert result.columns.is_monotonic
2570+
result = df.sort_index(axis=1, level=1)
2571+
assert result.columns.is_lexsorted()
2572+
assert result.columns.is_monotonic
2573+
2574+
def test_sort_index_reorder_on_ops(self):
2575+
# 15687
2576+
df = pd.DataFrame(
2577+
np.random.randn(8, 2),
2578+
index=MultiIndex.from_product(
2579+
[['a', 'b'],
2580+
['big', 'small'],
2581+
['red', 'blu']],
2582+
names=['letter', 'size', 'color']),
2583+
columns=['near', 'far'])
2584+
df = df.sort_index()
2585+
2586+
def my_func(group):
2587+
group.index = ['newz', 'newa']
2588+
return group
2589+
2590+
result = df.groupby(level=['letter', 'size']).apply(
2591+
my_func).sort_index()
2592+
expected = MultiIndex.from_product(
2593+
[['a', 'b'],
2594+
['big', 'small'],
2595+
['newa', 'newz']],
2596+
names=['letter', 'size', None])
2597+
2598+
tm.assert_index_equal(result.index, expected)

pandas/tests/tools/test_pivot.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import numpy as np
44

5+
from collections import OrderedDict
56
import pandas as pd
67
from pandas import (DataFrame, Series, Index, MultiIndex,
78
Grouper, date_range, concat)
@@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self):
513514
self.assertTrue(pivoted.columns.is_monotonic)
514515

515516
def test_pivot_complex_aggfunc(self):
516-
f = {'D': ['std'], 'E': ['sum']}
517+
f = OrderedDict([('D', ['std']), ('E', ['sum'])])
517518
expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
518519
result = self.data.pivot_table(index='A', columns='B', aggfunc=f)
519520

0 commit comments

Comments
 (0)