Skip to content

Commit f2ddc9c

Browse files
committed
replace _reconstruct with: sort_monotonic, and remove_unused_levels (public)
1 parent 3c4ca22 commit f2ddc9c

File tree

9 files changed

+96
-100
lines changed

9 files changed

+96
-100
lines changed

doc/source/advanced.rst

+23-18
Original file line numberDiff line numberDiff line change
@@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame:
175175
See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
176176
on a deeper level.
177177

178-
.. note::
178+
.. _advanced.shown_levels:
179+
180+
Defined Levels
181+
~~~~~~~~~~~~~~
182+
183+
The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
184+
if they are not actually used. When slicing an index, you may notice this.
185+
For example:
179186

180-
The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
181-
if the they are not actually used. When slicing an index, you may notice this.
182-
For example:
187+
.. ipython:: python
183188
184-
.. ipython:: python
189+
# original multi-index
190+
df.columns
185191
186-
# original multi-index
187-
df.columns
192+
# sliced
193+
df[['foo','qux']].columns
188194
189-
# sliced
190-
df[['foo','qux']].columns
195+
This is done to avoid a recomputation of the levels in order to make slicing
196+
highly performant. If you want to see only the levels that are actually used:
191197

192-
This is done to avoid a recomputation of the levels in order to make slicing
193-
highly performant. If you want to see the actual used levels.
198+
.. ipython:: python
194199
195-
.. ipython:: python
200+
df[['foo','qux']].columns.values
196201
197-
df[['foo','qux']].columns.values
202+
# for a specific level
203+
df[['foo','qux']].columns.get_level_values(0)
198204
199-
# for a specific level
200-
df[['foo','qux']].columns.get_level_values(0)
205+
To reconstruct the multiindex with only the used levels
201206

202-
To reconstruct the multiindex with only the used levels
207+
.. versionadded:: 0.20.0
203208

204-
.. ipython:: python
209+
.. ipython:: python
205210
206-
pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values)
211+
df[['foo','qux']].columns.remove_unused_levels()
207212
208213
Data alignment and using ``reindex``
209214
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,7 @@ MultiIndex Components
14321432
MultiIndex.droplevel
14331433
MultiIndex.swaplevel
14341434
MultiIndex.reorder_levels
1435+
MultiIndex.remove_unused_levels
14351436

14361437
.. _api.datetimeindex:
14371438

doc/source/whatsnew/v0.20.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ Other Enhancements
366366
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
367367
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
368368
- ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
369+
- A new method, ``MultiIndex.remove_unused_levels()``, has been added to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)
369370

370371

371372
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -778,6 +779,7 @@ New Behavior:
778779
df.sort_index().index.is_lexsorted()
779780
df.sort_index().index.is_monotonic
780781

782+
781783
.. _whatsnew_0200.api_breaking.groupby_describe:
782784

783785
Groupby Describe Formatting

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3349,7 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
33493349

33503350
# make sure that the axis is lexsorted to start
33513351
# if not we need to reconstruct to get the correct indexer
3352-
labels = labels._reconstruct(sort=True)
3352+
labels = labels.sort_monotonic()
33533353

33543354
indexer = lexsort_indexer(labels.labels, orders=ascending,
33553355
na_position=na_position)

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
17621762
sort_remaining=sort_remaining)
17631763
elif isinstance(index, MultiIndex):
17641764
from pandas.core.sorting import lexsort_indexer
1765-
labels = index._reconstruct(sort=True)
1765+
labels = index.sort_monotonic()
17661766
indexer = lexsort_indexer(labels.labels, orders=ascending)
17671767
else:
17681768
from pandas.core.sorting import nargsort

pandas/indexes/multi.py

+61-61
Original file line numberDiff line numberDiff line change
@@ -1173,98 +1173,98 @@ def from_product(cls, iterables, sortorder=None, names=None):
11731173
labels = cartesian_product(labels)
11741174
return MultiIndex(levels, labels, sortorder=sortorder, names=names)
11751175

1176-
def _reconstruct(self, sort=False, remove_unused=False):
1176+
def sort_monotonic(self):
11771177
"""
1178-
create a new MultiIndex from the current to provide either:
1179-
- monotonically sorted items IN the levels
1180-
- removing unused levels (meaning that they are not expressed
1181-
in the labels)
1178+
create a new MultiIndex from the current one with monotonically sorted
1179+
items IN the levels
11821180
11831181
The resulting MultiIndex will have the same outward
11841182
appearance, meaning the same .values and ordering. It will also
11851183
be .equals() to the original.
11861184
1187-
Parameters
1188-
----------
1189-
sort: boolean, default False
1190-
monotonically sort the levels
1191-
remove_unused: boolean, default False
1192-
remove unsued levels
1193-
11941185
Returns
11951186
-------
1196-
new MultiIndex
1187+
MultiIndex
11971188
11981189
"""
11991190

1200-
if sort and remove_unused:
1201-
raise ValueError("only support one of sort / remove_unused")
1202-
1203-
if not (sort or remove_unused):
1204-
raise ValueError("must supply one of sort / remove_unsued")
1205-
1206-
levels = self.levels
1207-
labels = self.labels
1191+
if self.is_lexsorted() and self.is_monotonic:
1192+
return self
12081193

12091194
new_levels = []
12101195
new_labels = []
12111196

1212-
if sort:
1213-
1214-
if self.is_lexsorted() and self.is_monotonic:
1215-
return self
1197+
for lev, lab in zip(self.levels, self.labels):
12161198

1217-
for lev, lab in zip(levels, labels):
1199+
if lev.is_monotonic:
1200+
new_levels.append(lev)
1201+
new_labels.append(lab)
1202+
continue
12181203

1219-
if lev.is_monotonic:
1220-
new_levels.append(lev)
1221-
new_labels.append(lab)
1222-
continue
1204+
# indexer to reorder the levels
1205+
indexer = lev.argsort()
1206+
lev = lev.take(indexer)
12231207

1224-
# indexer to reorder the levels
1225-
indexer = lev.argsort()
1226-
lev = lev.take(indexer)
1208+
# indexer to reorder the labels
1209+
ri = lib.get_reverse_indexer(indexer, len(indexer))
1210+
lab = algos.take_1d(ri, lab)
12271211

1228-
# indexer to reorder the labels
1229-
ri = lib.get_reverse_indexer(indexer, len(indexer))
1230-
lab = algos.take_1d(ri, lab)
1212+
new_levels.append(lev)
1213+
new_labels.append(lab)
12311214

1232-
new_levels.append(lev)
1233-
new_labels.append(lab)
1234-
1235-
elif remove_unused:
1215+
return MultiIndex(new_levels, new_labels,
1216+
names=self.names, sortorder=self.sortorder,
1217+
verify_integrity=False)
12361218

1237-
changed = np.zeros(self.nlevels, dtype=bool)
1238-
for i, (lev, lab) in enumerate(zip(levels, labels)):
1219+
def remove_unused_levels(self):
1220+
"""
1221+
create a new MultiIndex from the current that removes
1222+
unused levels, meaning that they are not expressed in the labels
12391223
1240-
uniques = np.sort(algos.unique(lab))
1224+
The resulting MultiIndex will have the same outward
1225+
appearance, meaning the same .values and ordering. It will also
1226+
be .equals() to the original.
12411227
1242-
# nothing unused
1243-
if len(uniques) == len(lev):
1244-
new_levels.append(lev)
1245-
new_labels.append(lab)
1246-
changed[i] = True
1247-
continue
1228+
Returns
1229+
-------
1230+
MultiIndex
12481231
1249-
unused = list(reversed(sorted(set(
1250-
np.arange(len(lev))) - set(uniques))))
1232+
"""
12511233

1252-
# new levels are simple
1253-
lev = lev.take(uniques)
1234+
new_levels = []
1235+
new_labels = []
12541236

1255-
# new labels, we remove the unsued
1256-
# by decrementing the labels for that value
1257-
# prob a better way
1258-
for u in unused:
1237+
changed = np.zeros(self.nlevels, dtype=bool)
1238+
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
12591239

1260-
lab = np.where(lab > u, lab - 1, lab)
1240+
uniques = np.sort(algos.unique(lab))
12611241

1242+
# nothing unused
1243+
if len(uniques) == len(lev):
12621244
new_levels.append(lev)
12631245
new_labels.append(lab)
1246+
changed[i] = True
1247+
continue
1248+
1249+
unused = list(reversed(sorted(set(
1250+
np.arange(len(lev))) - set(uniques))))
1251+
1252+
# new levels are simple
1253+
lev = lev.take(uniques)
12641254

1265-
# nothing changed
1266-
if not changed.any():
1267-
return self
1255+
# new labels, we remove the unused
1256+
# by decrementing the labels for that value
1257+
# prob a better way
1258+
for u in unused:
1259+
1260+
lab = np.where(lab > u, lab - 1, lab)
1261+
1262+
new_levels.append(lev)
1263+
new_labels.append(lab)
1264+
1265+
# nothing changed
1266+
if not changed.any():
1267+
return self
12681268

12691269
return MultiIndex(new_levels, new_labels,
12701270
names=self.names, sortorder=self.sortorder,

pandas/tests/indexes/test_multi.py

+5-17
Original file line numberDiff line numberDiff line change
@@ -2411,18 +2411,6 @@ def test_is_monotonic(self):
24112411

24122412
self.assertFalse(i.is_monotonic)
24132413

2414-
def test_reconstruct_api(self):
2415-
2416-
mi = MultiIndex.from_arrays([
2417-
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
2418-
])
2419-
2420-
with pytest.raises(ValueError):
2421-
mi._reconstruct()
2422-
2423-
with pytest.raises(ValueError):
2424-
mi._reconstruct(sort=True, remove_unused=True)
2425-
24262414
def test_reconstruct_sort(self):
24272415

24282416
# starts off lexsorted & monotonic
@@ -2432,7 +2420,7 @@ def test_reconstruct_sort(self):
24322420
assert mi.is_lexsorted()
24332421
assert mi.is_monotonic
24342422

2435-
recons = mi._reconstruct(sort=True)
2423+
recons = mi.sort_monotonic()
24362424
assert recons.is_lexsorted()
24372425
assert recons.is_monotonic
24382426
assert mi is recons
@@ -2447,7 +2435,7 @@ def test_reconstruct_sort(self):
24472435
assert not mi.is_lexsorted()
24482436
assert not mi.is_monotonic
24492437

2450-
recons = mi._reconstruct(sort=True)
2438+
recons = mi.sort_monotonic()
24512439
assert not recons.is_lexsorted()
24522440
assert not recons.is_monotonic
24532441

@@ -2461,7 +2449,7 @@ def test_reconstruct_sort(self):
24612449
assert not mi.is_lexsorted()
24622450
assert not mi.is_monotonic
24632451

2464-
recons = mi._reconstruct(sort=True)
2452+
recons = mi.sort_monotonic()
24652453
assert not recons.is_lexsorted()
24662454
assert not recons.is_monotonic
24672455

@@ -2489,11 +2477,11 @@ def test_reconstruct_remove_unused(self):
24892477
[2, 3]],
24902478
labels=[[0, 1], [0, 1]],
24912479
names=['first', 'second'])
2492-
result = df2.index._reconstruct(remove_unused=True)
2480+
result = df2.index.remove_unused_levels()
24932481
tm.assert_index_equal(result, expected)
24942482

24952483
# idempotent
2496-
result2 = result._reconstruct(remove_unused=True)
2484+
result2 = result.remove_unused_levels()
24972485
tm.assert_index_equal(result2, expected)
24982486
assert result2 is result
24992487

pandas/tests/test_multilevel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
25822582

25832583
# reconstruct
25842584
result = df.sort_index().copy()
2585-
result.index = result.index._reconstruct(sort=True)
2585+
result.index = result.index.sort_monotonic()
25862586
assert result.index.is_lexsorted()
25872587
assert result.index.is_monotonic
25882588

pandas/tests/tools/test_hashing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def test_multiindex_objects(self):
9191
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
9292
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
9393
names=['col1', 'col2'])
94-
recons = mi._reconstruct(sort=True)
94+
recons = mi.sort_monotonic()
9595

9696
# these are equal
9797
assert mi.equals(recons)

0 commit comments

Comments
 (0)