Skip to content

Commit 7677df2

Browse files
committed
support for removing unused levels (internally)
xref pandas-dev#2770
1 parent 245dee7 commit 7677df2

File tree

3 files changed

+108
-16
lines changed

3 files changed

+108
-16
lines changed

pandas/indexes/multi.py

+48-5
Original file line numberDiff line numberDiff line change
@@ -1173,7 +1173,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
11731173
labels = cartesian_product(labels)
11741174
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11751175

1176-
def _reconstruct(self, sort=False):
1176+
def _reconstruct(self, sort=False, remove_unused=False):
11771177
"""
11781178
reconstruct the MultiIndex
11791179
@@ -1184,21 +1184,33 @@ def _reconstruct(self, sort=False):
11841184
----------
11851185
sort: boolean, default False
11861186
monotonically sort the levels
1187+
remove_unused: boolean, default False
1188+
remove unused levels
11871189
11881190
Returns
11891191
-------
11901192
MultiIndex
11911193
11921194
"""
1195+
1196+
if sort and remove_unused:
1197+
raise ValueError("only support one of sort / remove_unused")
1198+
1199+
if not (sort or remove_unused):
1200+
raise ValueError("must supply one of sort / remove_unsued")
1201+
1202+
levels = self.levels
1203+
labels = self.labels
1204+
11931205
new_levels = []
11941206
new_labels = []
11951207

11961208
if sort:
11971209

1198-
if self.is_monotonic:
1210+
if self.is_lexsorted() and self.is_monotonic:
11991211
return self
12001212

1201-
for lev, lab in zip(self.levels, self.labels):
1213+
for lev, lab in zip(levels, labels):
12021214

12031215
if lev.is_monotonic:
12041216
new_levels.append(lev)
@@ -1216,8 +1228,39 @@ def _reconstruct(self, sort=False):
12161228
new_levels.append(lev)
12171229
new_labels.append(lab)
12181230

1219-
else:
1220-
return self
1231+
elif remove_unused:
1232+
1233+
changed = np.zeros(self.nlevels, dtype=bool)
1234+
for i, (lev, lab) in enumerate(zip(levels, labels)):
1235+
1236+
uniques = np.sort(algos.unique(lab))
1237+
1238+
# nothing unused
1239+
if len(uniques) == len(lev):
1240+
new_levels.append(lev)
1241+
new_labels.append(lab)
1242+
changed[i] = True
1243+
continue
1244+
1245+
unused = list(reversed(sorted(set(
1246+
np.arange(len(lev))) - set(uniques))))
1247+
1248+
# new levels are simple
1249+
lev = lev.take(uniques)
1250+
1251+
# new labels, we remove the unused
1252+
# by decrementing the labels for that value
1253+
# prob a better way
1254+
for u in unused:
1255+
1256+
lab = np.where(lab > u, lab - 1, lab)
1257+
1258+
new_levels.append(lev)
1259+
new_labels.append(lab)
1260+
1261+
# nothing changed
1262+
if not changed.any():
1263+
return self
12211264

12221265
return MultiIndex(new_levels, new_labels,
12231266
names=self.names, sortorder=self.sortorder,

pandas/tests/indexes/test_multi.py

+41-8
Original file line numberDiff line numberDiff line change
@@ -2411,6 +2411,18 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414+
def test_reconstruct_api(self):
2415+
2416+
mi = MultiIndex.from_arrays([
2417+
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2418+
])
2419+
2420+
with pytest.raises(ValueError):
2421+
mi._reconstruct()
2422+
2423+
with pytest.raises(ValueError):
2424+
mi._reconstruct(sort=True, remove_unused=True)
2425+
24142426
def test_reconstruct_sort(self):
24152427

24162428
# starts off lexsorted & monotonic
@@ -2428,14 +2440,6 @@ def test_reconstruct_sort(self):
24282440
assert mi.equals(recons)
24292441
assert Index(mi.values).equals(Index(recons.values))
24302442

2431-
recons = mi._reconstruct(sort=False)
2432-
assert recons.is_lexsorted()
2433-
assert recons.is_monotonic
2434-
assert mi is recons
2435-
2436-
assert mi.equals(recons)
2437-
assert Index(mi.values).equals(Index(recons.values))
2438-
24392443
# cannot convert to lexsorted
24402444
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
24412445
('x', 'b'), ('y', 'a'), ('z', 'b')],
@@ -2464,6 +2468,35 @@ def test_reconstruct_sort(self):
24642468
assert mi.equals(recons)
24652469
assert Index(mi.values).equals(Index(recons.values))
24662470

2471+
def test_reconstruct_remove_unused(self):
2472+
# xref to GH 2770
2473+
df = DataFrame([['deleteMe', 1, 9],
2474+
['keepMe', 2, 9],
2475+
['keepMeToo', 3, 9]],
2476+
columns=['first', 'second', 'third'])
2477+
df2 = df.set_index(['first', 'second'], drop=False)
2478+
df2 = df2[df2['first'] != 'deleteMe']
2479+
2480+
# removed levels are there
2481+
expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'],
2482+
[1, 2, 3]],
2483+
labels=[[1, 2], [1, 2]],
2484+
names=['first', 'second'])
2485+
result = df2.index
2486+
tm.assert_index_equal(result, expected)
2487+
2488+
expected = MultiIndex(levels=[['keepMe', 'keepMeToo'],
2489+
[2, 3]],
2490+
labels=[[0, 1], [0, 1]],
2491+
names=['first', 'second'])
2492+
result = df2.index._reconstruct(remove_unused=True)
2493+
tm.assert_index_equal(result, expected)
2494+
2495+
# idempotent
2496+
result2 = result._reconstruct(remove_unused=True)
2497+
tm.assert_index_equal(result2, expected)
2498+
assert result2 is result
2499+
24672500
def test_isin(self):
24682501
values = [('foo', 2), ('bar', 3), ('quux', 4)]
24692502

pandas/tests/test_multilevel.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -2559,16 +2559,32 @@ def test_sort_index_and_reconstruction(self):
25592559
assert result.columns.is_lexsorted()
25602560
assert result.columns.is_monotonic
25612561

2562+
def test_sort_index_and_reconstruction_doc_example(self):
25622563
# doc example
25632564
df = DataFrame({'value': [1, 2, 3, 4]},
25642565
index=MultiIndex(
25652566
levels=[['a', 'b'], ['bb', 'aa']],
2566-
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
2567-
result = df.sort_index()
2567+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2568+
assert df.index.is_lexsorted()
2569+
assert not df.index.is_monotonic
2570+
2571+
# sort it
25682572
expected = DataFrame({'value': [2, 1, 4, 3]},
25692573
index=MultiIndex(
25702574
levels=[['a', 'b'], ['aa', 'bb']],
2571-
labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
2575+
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
2576+
result = df.sort_index()
2577+
assert not result.index.is_lexsorted()
2578+
assert result.index.is_monotonic
2579+
2580+
tm.assert_frame_equal(result, expected)
2581+
2582+
# reconstruct
2583+
result = df.sort_index().copy()
2584+
result.index = result.index._reconstruct(sort=True)
2585+
assert result.index.is_lexsorted()
2586+
assert result.index.is_monotonic
2587+
25722588
tm.assert_frame_equal(result, expected)
25732589

25742590
def test_sort_index_reorder_on_ops(self):

0 commit comments

Comments
 (0)