Skip to content

Commit 8ec3c9b

Browse files
committed
reindex multi-index at level with reordered labels
1 parent 4ab5409 commit 8ec3c9b

File tree

4 files changed

+163
-16
lines changed

4 files changed

+163
-16
lines changed

doc/source/whatsnew/v0.15.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ Bug Fixes
119119
- Bug in ``BlockManager`` where setting values with different type would break block integrity (:issue:`8850`)
120120
- Bug in ``DatetimeIndex`` when using ``time`` object as key (:issue:`8667`)
121121
- Bug in ``merge`` where ``how='left'`` and ``sort=False`` would not preserve left frame order (:issue:`7331`)
122+
- Bug in ``MultiIndex.reindex`` where reindexing at level would not reorder labels (:issue:`4088`)
122123

123124
- Fix negative step support for label-based slices (:issue:`8753`)
124125

pandas/core/index.py

+81-16
Original file line numberDiff line numberDiff line change
@@ -1828,13 +1828,41 @@ def _join_non_unique(self, other, how='left', return_indexers=False):
18281828
else:
18291829
return join_index
18301830

1831-
def _join_level(self, other, level, how='left', return_indexers=False):
1831+
def _join_level(self, other, level, how='left',
1832+
return_indexers=False,
1833+
keep_order=True):
18321834
"""
18331835
The join method *only* affects the level of the resulting
18341836
MultiIndex. Otherwise it just exactly aligns the Index data to the
1835-
labels of the level in the MultiIndex. The order of the data indexed by
1836-
the MultiIndex will not be changed (currently)
1837-
"""
1837+
labels of the level in the MultiIndex. If `keep_order` == True, the
1838+
order of the data indexed by the MultiIndex will not be changed;
1839+
otherwise, it will tie out with `other`.
1840+
"""
1841+
from pandas.algos import groupsort_indexer
1842+
1843+
def _get_leaf_sorter(labels):
1844+
'''
1845+
returns sorter for the innermost level while preserving the
1846+
order of higher levels
1847+
'''
1848+
if labels[0].size == 0:
1849+
return np.empty(0, dtype='int64')
1850+
1851+
if len(labels) == 1:
1852+
lab = com._ensure_int64(labels[0])
1853+
sorter, _ = groupsort_indexer(lab, 1 + lab.max())
1854+
return sorter
1855+
1856+
# find indexers of beginning of each set of
1857+
# same-key labels w.r.t all but last level
1858+
tic = labels[0][:-1] != labels[0][1:]
1859+
for lab in labels[1:-1]:
1860+
tic |= lab[:-1] != lab[1:]
1861+
1862+
starts = np.hstack(([True], tic, [True])).nonzero()[0]
1863+
lab = com._ensure_int64(labels[-1])
1864+
return lib.get_level_sorter(lab, starts)
1865+
18381866
if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
18391867
raise TypeError('Join on level between two MultiIndex objects '
18401868
'is ambiguous')
@@ -1849,33 +1877,69 @@ def _join_level(self, other, level, how='left', return_indexers=False):
18491877
level = left._get_level_number(level)
18501878
old_level = left.levels[level]
18511879

1880+
if not right.is_unique:
1881+
raise NotImplementedError('Index._join_level on non-unique index '
1882+
'is not implemented')
1883+
18521884
new_level, left_lev_indexer, right_lev_indexer = \
18531885
old_level.join(right, how=how, return_indexers=True)
18541886

1855-
if left_lev_indexer is not None:
1887+
if left_lev_indexer is None:
1888+
if keep_order or len(left) == 0:
1889+
left_indexer = None
1890+
join_index = left
1891+
else: # sort the leaves
1892+
left_indexer = _get_leaf_sorter(left.labels[:level + 1])
1893+
join_index = left[left_indexer]
1894+
1895+
else:
18561896
left_lev_indexer = com._ensure_int64(left_lev_indexer)
18571897
rev_indexer = lib.get_reverse_indexer(left_lev_indexer,
18581898
len(old_level))
18591899

18601900
new_lev_labels = com.take_nd(rev_indexer, left.labels[level],
18611901
allow_fill=False)
1862-
omit_mask = new_lev_labels != -1
18631902

18641903
new_labels = list(left.labels)
18651904
new_labels[level] = new_lev_labels
18661905

1867-
if not omit_mask.all():
1868-
new_labels = [lab[omit_mask] for lab in new_labels]
1869-
18701906
new_levels = list(left.levels)
18711907
new_levels[level] = new_level
18721908

1873-
join_index = MultiIndex(levels=new_levels, labels=new_labels,
1874-
names=left.names, verify_integrity=False)
1875-
left_indexer = np.arange(len(left))[new_lev_labels != -1]
1876-
else:
1877-
join_index = left
1878-
left_indexer = None
1909+
if keep_order: # just drop missing values. o.w. keep order
1910+
left_indexer = np.arange(len(left))
1911+
mask = new_lev_labels != -1
1912+
if not mask.all():
1913+
new_labels = [lab[mask] for lab in new_labels]
1914+
left_indexer = left_indexer[mask]
1915+
1916+
else: # tie out the order with other
1917+
if level == 0: # outer most level, take the fast route
1918+
ngroups = 1 + new_lev_labels.max()
1919+
left_indexer, counts = groupsort_indexer(new_lev_labels,
1920+
ngroups)
1921+
# missing values are placed first; drop them!
1922+
left_indexer = left_indexer[counts[0]:]
1923+
new_labels = [lab[left_indexer] for lab in new_labels]
1924+
1925+
else: # sort the leaves
1926+
mask = new_lev_labels != -1
1927+
mask_all = mask.all()
1928+
if not mask_all:
1929+
new_labels = [lab[mask] for lab in new_labels]
1930+
1931+
left_indexer = _get_leaf_sorter(new_labels[:level + 1])
1932+
new_labels = [lab[left_indexer] for lab in new_labels]
1933+
1934+
# left_indexers are w.r.t masked frame.
1935+
# reverse to original frame!
1936+
if not mask_all:
1937+
left_indexer = mask.nonzero()[0][left_indexer]
1938+
1939+
join_index = MultiIndex(levels=new_levels,
1940+
labels=new_labels,
1941+
names=left.names,
1942+
verify_integrity=False)
18791943

18801944
if right_lev_indexer is not None:
18811945
right_indexer = com.take_nd(right_lev_indexer,
@@ -3925,7 +3989,8 @@ def reindex(self, target, method=None, level=None, limit=None):
39253989
else:
39263990
target = _ensure_index(target)
39273991
target, indexer, _ = self._join_level(target, level, how='right',
3928-
return_indexers=True)
3992+
return_indexers=True,
3993+
keep_order=False)
39293994
else:
39303995
if self.equals(target):
39313996
indexer = None

pandas/lib.pyx

+21
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,27 @@ def row_bool_subset_object(ndarray[object, ndim=2] values,
11381138

11391139
return out
11401140

1141+
@cython.boundscheck(False)
1142+
@cython.wraparound(False)
1143+
def get_level_sorter(ndarray[int64_t, ndim=1] label,
1144+
ndarray[int64_t, ndim=1] starts):
1145+
"""
1146+
argsort for a single level of a multi-index, keeping the order of higher
1147+
levels unchanged. `starts` points to starts of same-key indices w.r.t
1148+
leading levels; equivalent to:
1149+
np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort')
1150+
+ starts[i] for i in range(len(starts) - 1)])
1151+
"""
1152+
cdef:
1153+
int64_t l, r
1154+
Py_ssize_t i
1155+
ndarray[int64_t, ndim=1] out = np.empty(len(label), dtype=np.int64)
1156+
1157+
for i in range(len(starts) - 1):
1158+
l, r = starts[i], starts[i + 1]
1159+
out[l:r] = l + label[l:r].argsort(kind='mergesort')
1160+
1161+
return out
11411162

11421163
def group_count(ndarray[int64_t] values, Py_ssize_t size):
11431164
cdef:

pandas/tests/test_frame.py

+60
Original file line numberDiff line numberDiff line change
@@ -1897,6 +1897,66 @@ def test_reversed_reindex_ffill_raises(self):
18971897
self.assertRaises(ValueError, df.reindex, dr[::-1], method='ffill')
18981898
self.assertRaises(ValueError, df.reindex, dr[::-1], method='bfill')
18991899

1900+
def test_reindex_level(self):
1901+
from itertools import permutations
1902+
icol = ['jim', 'joe', 'jolie']
1903+
1904+
def verify_first_level(df, level, idx):
1905+
f = lambda val: np.nonzero(df[level] == val)[0]
1906+
i = np.concatenate(list(map(f, idx)))
1907+
left = df.set_index(icol).reindex(idx, level=level)
1908+
right = df.iloc[i].set_index(icol)
1909+
assert_frame_equal(left, right)
1910+
1911+
def verify(df, level, idx, indexer):
1912+
left = df.set_index(icol).reindex(idx, level=level)
1913+
right = df.iloc[indexer].set_index(icol)
1914+
assert_frame_equal(left, right)
1915+
1916+
df = pd.DataFrame({'jim':list('B' * 4 + 'A' * 2 + 'C' * 3),
1917+
'joe':list('abcdeabcd')[::-1],
1918+
'jolie':[10, 20, 30] * 3,
1919+
'joline': np.random.randint(0, 1000, 9)})
1920+
1921+
target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'], ['D', 'F'],
1922+
['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'],
1923+
['A', 'B'], ['B', 'A', 'C'], ['A', 'C', 'B']]
1924+
1925+
for idx in target:
1926+
verify_first_level(df, 'jim', idx)
1927+
1928+
verify(df, 'joe', list('abcde'), [3, 2, 1, 0, 5, 4, 8, 7, 6])
1929+
verify(df, 'joe', list('abcd'), [3, 2, 1, 0, 5, 8, 7, 6])
1930+
verify(df, 'joe', list('abc'), [3, 2, 1, 8, 7, 6])
1931+
verify(df, 'joe', list('eca'), [1, 3, 4, 6, 8])
1932+
verify(df, 'joe', list('edc'), [0, 1, 4, 5, 6])
1933+
verify(df, 'joe', list('eadbc'), [3, 0, 2, 1, 4, 5, 8, 7, 6])
1934+
verify(df, 'joe', list('edwq'), [0, 4, 5])
1935+
verify(df, 'joe', list('wq'), [])
1936+
1937+
df = DataFrame({'jim':['mid'] * 5 + ['btm'] * 8 + ['top'] * 7,
1938+
'joe':['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 +
1939+
['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 +
1940+
['3rd'] * 3 + ['2nd'] * 2,
1941+
'jolie':np.random.randint(0, 1000, 20),
1942+
'joline': np.random.randn(20).round(3) * 10})
1943+
1944+
for idx in permutations(df['jim'].unique()):
1945+
for i in range(3):
1946+
verify_first_level(df, 'jim', idx[:i+1])
1947+
1948+
i = [2,3,4,0,1,8,9,5,6,7,10,11,12,13,14,18,19,15,16,17]
1949+
verify(df, 'joe', ['1st', '2nd', '3rd'], i)
1950+
1951+
i = [0,1,2,3,4,10,11,12,5,6,7,8,9,15,16,17,18,19,13,14]
1952+
verify(df, 'joe', ['3rd', '2nd', '1st'], i)
1953+
1954+
i = [0,1,5,6,7,10,11,12,18,19,15,16,17]
1955+
verify(df, 'joe', ['2nd', '3rd'], i)
1956+
1957+
i = [0,1,2,3,4,10,11,12,8,9,15,16,17,13,14]
1958+
verify(df, 'joe', ['3rd', '1st'], i)
1959+
19001960
def test_getitem_ix_float_duplicates(self):
19011961
df = pd.DataFrame(np.random.randn(3, 3),
19021962
index=[0.1, 0.2, 0.2], columns=list('abc'))

0 commit comments

Comments
 (0)