Skip to content

Commit 84bb2b9

Browse files
committed
support for removing unused levels (internally)
xref pandas-dev#2770
1 parent 1a9be09 commit 84bb2b9

File tree

2 files changed

+92
-5
lines changed

2 files changed

+92
-5
lines changed

pandas/indexes/multi.py

+50-4
Original file line numberDiff line numberDiff line change
@@ -1177,7 +1177,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
11771177
labels = cartesian_product(labels)
11781178
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11791179

1180-
def _reconstruct(self, sort=False):
1180+
def _reconstruct(self, sort=False, remove_unused=False):
11811181
"""
11821182
reconstruct the MultiIndex
11831183
@@ -1188,20 +1188,32 @@ def _reconstruct(self, sort=False):
11881188
----------
11891189
sort: boolean, default False
11901190
monotonically sort the levels
1191+
remove_unused: boolean, default False
1192+
remove unused levels
11911193
11921194
Returns
11931195
-------
11941196
MultiIndex
11951197
11961198
"""
1199+
1200+
if sort and remove_unused:
1201+
raise ValueError("only support one of sort / remove_unused")
1202+
1203+
if not (sort or remove_unused):
1204+
raise ValueError("must supply one of sort / remove_unsued")
1205+
1206+
levels = self.levels
1207+
labels = self.labels
1208+
new_levels = []
1209+
new_labels = []
1210+
11971211
if sort:
11981212

11991213
if self.is_monotonic:
12001214
return self
12011215

1202-
new_levels = []
1203-
new_labels = []
1204-
for lev, lab in zip(self.levels, self.labels):
1216+
for lev, lab in zip(levels, labels):
12051217

12061218
if lev.is_monotonic:
12071219
new_levels.append(lev)
@@ -1219,6 +1231,40 @@ def _reconstruct(self, sort=False):
12191231
new_levels.append(lev)
12201232
new_labels.append(lab)
12211233

1234+
elif remove_unused:
1235+
1236+
changed = np.zeros(self.nlevels, dtype=bool)
1237+
for i, (lev, lab) in enumerate(zip(levels, labels)):
1238+
1239+
uniques = np.sort(algos.unique(lab))
1240+
1241+
# nothing unused
1242+
if len(uniques) == len(lev):
1243+
new_levels.append(lev)
1244+
new_labels.append(lab)
1245+
changed[i] = True
1246+
continue
1247+
1248+
unused = list(reversed(sorted(set(
1249+
np.arange(len(lev))) - set(uniques))))
1250+
1251+
# new levels are simple
1252+
lev = lev.take(uniques)
1253+
1254+
# new labels, we remove the unused
1255+
# by decrementing the labels for that value
1256+
# prob a better way
1257+
for u in unused:
1258+
1259+
lab = np.where(lab > u, lab - 1, lab)
1260+
1261+
new_levels.append(lev)
1262+
new_labels.append(lab)
1263+
1264+
# nothing changed
1265+
if not changed.any():
1266+
return self
1267+
12221268
return MultiIndex(new_levels, new_labels,
12231269
names=self.names, sortorder=self.sortorder,
12241270
verify_integrity=False)

pandas/tests/indexes/test_multi.py

+42-1
Original file line numberDiff line numberDiff line change
@@ -2411,7 +2411,19 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414-
def test_reconstruct(self):
2414+
def test_reconstruct_api(self):
2415+
2416+
mi = MultiIndex.from_arrays([
2417+
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2418+
])
2419+
2420+
with pytest.raises(ValueError):
2421+
mi._reconstruct()
2422+
2423+
with pytest.raises(ValueError):
2424+
mi._reconstruct(sort=True, remove_unused=True)
2425+
2426+
def test_reconstruct_sort(self):
24152427

24162428
# starts off lexsorted & monotonic
24172429
mi = MultiIndex.from_arrays([
@@ -2456,6 +2468,35 @@ def test_reconstruct(self):
24562468
assert mi.equals(recons)
24572469
assert Index(mi.values).equals(Index(recons.values))
24582470

2471+
def test_reconstruct_remove_unused(self):
2472+
# xref to GH 2770
2473+
df = DataFrame([['deleteMe', 1, 9],
2474+
['keepMe', 2, 9],
2475+
['keepMeToo', 3, 9]],
2476+
columns=['first', 'second', 'third'])
2477+
df2 = df.set_index(['first', 'second'], drop=False)
2478+
df2 = df2[df2['first'] != 'deleteMe']
2479+
2480+
# removed levels are there
2481+
expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'],
2482+
[1, 2, 3]],
2483+
labels=[[1, 2], [1, 2]],
2484+
names=['first', 'second'])
2485+
result = df2.index
2486+
tm.assert_index_equal(result, expected)
2487+
2488+
expected = MultiIndex(levels=[['keepMe', 'keepMeToo'],
2489+
[2, 3]],
2490+
labels=[[0, 1], [0, 1]],
2491+
names=['first', 'second'])
2492+
result = df2.index._reconstruct(remove_unused=True)
2493+
tm.assert_index_equal(result, expected)
2494+
2495+
# idempotent
2496+
result2 = result._reconstruct(remove_unused=True)
2497+
tm.assert_index_equal(result2, expected)
2498+
assert result2 is result
2499+
24592500
def test_isin(self):
24602501
values = [('foo', 2), ('bar', 3), ('quux', 4)]
24612502

0 commit comments

Comments
 (0)