replace _reconstruct with: sort_monotonic, and remove_unused_levels (public)

jreback · jreback · commit f2ddc9c5d280 · 2017-04-06T20:17:26.000-04:00
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
@@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame:
 See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
 on a deeper level.
 
-.. note::
+.. _advanced.shown_levels:
+
+Defined Levels
+~~~~~~~~~~~~~~
+
+The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
+if they are not actually used. When slicing an index, you may notice this.
+For example:
 
-   The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
-   if the they are not actually used. When slicing an index, you may notice this.
-   For example:
+.. ipython:: python
 
-   .. ipython:: python
+   # original multi-index
+   df.columns
 
-      # original multi-index
-      df.columns
+   # sliced
+   df[['foo','qux']].columns
 
-      # sliced
-      df[['foo','qux']].columns
+This is done to avoid a recomputation of the levels in order to make slicing
+highly performant. If you want to see only the levels that are actually used:
 
-   This is done to avoid a recomputation of the levels in order to make slicing
-   highly performant. If you want to see the actual used levels.
+.. ipython:: python
 
-   .. ipython:: python
+   df[['foo','qux']].columns.values
 
-      df[['foo','qux']].columns.values
+   # for a specific level
+   df[['foo','qux']].columns.get_level_values(0)
 
-      # for a specific level
-      df[['foo','qux']].columns.get_level_values(0)
+To reconstruct the ``MultiIndex`` with only the used levels:
 
-   To reconstruct the multiindex with only the used levels
+.. versionadded:: 0.20.0
 
-   .. ipython:: python
+.. ipython:: python
 
-      pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values)
+   df[['foo','qux']].columns.remove_unused_levels()
 
 Data alignment and using ``reindex``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1432,6 +1432,7 @@ MultiIndex Components
    MultiIndex.droplevel
    MultiIndex.swaplevel
    MultiIndex.reorder_levels
+   MultiIndex.remove_unused_levels
 
 .. _api.datetimeindex:
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -366,6 +366,7 @@ Other Enhancements
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
 - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
 - ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
+- A new method, ``MultiIndex.remove_unused_levels()``, has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`)
 
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -778,6 +779,7 @@ New Behavior:
    df.sort_index().index.is_lexsorted()
    df.sort_index().index.is_monotonic
 
+
 .. _whatsnew_0200.api_breaking.groupby_describe:
 
 Groupby Describe Formatting
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3349,7 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
 
             # make sure that the axis is lexsorted to start
             # if not we need to reconstruct to get the correct indexer
-            labels = labels._reconstruct(sort=True)
+            labels = labels.sort_monotonic()
 
             indexer = lexsort_indexer(labels.labels, orders=ascending,
                                       na_position=na_position)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                                                  sort_remaining=sort_remaining)
         elif isinstance(index, MultiIndex):
             from pandas.core.sorting import lexsort_indexer
-            labels = index._reconstruct(sort=True)
+            labels = index.sort_monotonic()
             indexer = lexsort_indexer(labels.labels, orders=ascending)
         else:
             from pandas.core.sorting import nargsort
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
@@ -1173,98 +1173,98 @@ def from_product(cls, iterables, sortorder=None, names=None):
         labels = cartesian_product(labels)
         return MultiIndex(levels, labels, sortorder=sortorder, names=names)
 
-    def _reconstruct(self, sort=False, remove_unused=False):
+    def sort_monotonic(self):
         """
-        create a new MultiIndex from the current to provide either:
-          - monotonically sorted items IN the levels
-          - removing unused levels (meaning that they are not expressed
-            in the labels)
+        create a new MultiIndex from the current one, with the items IN
+        each level monotonically sorted
 
         The resulting MultiIndex will have the same outward
         appearance, meaning the same .values and ordering. It will also
         be .equals() to the original.
 
-        Parameters
-        ----------
-        sort: boolean, default False
-            monotonically sort the levels
-        remove_unused: boolean, default False
-            remove unsued levels
-
         Returns
         -------
-        new MultiIndex
+        MultiIndex
 
         """
 
-        if sort and remove_unused:
-            raise ValueError("only support one of sort / remove_unused")
-
-        if not (sort or remove_unused):
-            raise ValueError("must supply one of sort / remove_unsued")
-
-        levels = self.levels
-        labels = self.labels
+        if self.is_lexsorted() and self.is_monotonic:
+            return self
 
         new_levels = []
         new_labels = []
 
-        if sort:
-
-            if self.is_lexsorted() and self.is_monotonic:
-                return self
+        for lev, lab in zip(self.levels, self.labels):
 
-            for lev, lab in zip(levels, labels):
+            if lev.is_monotonic:
+                new_levels.append(lev)
+                new_labels.append(lab)
+                continue
 
-                if lev.is_monotonic:
-                    new_levels.append(lev)
-                    new_labels.append(lab)
-                    continue
+            # indexer to reorder the levels
+            indexer = lev.argsort()
+            lev = lev.take(indexer)
 
-                # indexer to reorder the levels
-                indexer = lev.argsort()
-                lev = lev.take(indexer)
+            # indexer to reorder the labels
+            ri = lib.get_reverse_indexer(indexer, len(indexer))
+            lab = algos.take_1d(ri, lab)
 
-                # indexer to reorder the labels
-                ri = lib.get_reverse_indexer(indexer, len(indexer))
-                lab = algos.take_1d(ri, lab)
+            new_levels.append(lev)
+            new_labels.append(lab)
 
-                new_levels.append(lev)
-                new_labels.append(lab)
-
-        elif remove_unused:
+        return MultiIndex(new_levels, new_labels,
+                          names=self.names, sortorder=self.sortorder,
+                          verify_integrity=False)
 
-            changed = np.zeros(self.nlevels, dtype=bool)
-            for i, (lev, lab) in enumerate(zip(levels, labels)):
+    def remove_unused_levels(self):
+        """
+        create a new MultiIndex from the current one that removes
+        unused levels, meaning that they are not expressed in the labels
 
-                uniques = np.sort(algos.unique(lab))
+        The resulting MultiIndex will have the same outward
+        appearance, meaning the same .values and ordering. It will also
+        be .equals() to the original.
 
-                # nothing unused
-                if len(uniques) == len(lev):
-                    new_levels.append(lev)
-                    new_labels.append(lab)
-                    changed[i] = True
-                    continue
+        Returns
+        -------
+        MultiIndex
 
-                unused = list(reversed(sorted(set(
-                    np.arange(len(lev))) - set(uniques))))
+        """
 
-                # new levels are simple
-                lev = lev.take(uniques)
+        new_levels = []
+        new_labels = []
 
-                # new labels, we remove the unsued
-                # by decrementing the labels for that value
-                # prob a better way
-                for u in unused:
+        changed = np.zeros(self.nlevels, dtype=bool)
+        for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):
 
-                    lab = np.where(lab > u, lab - 1, lab)
+            uniques = np.sort(algos.unique(lab))
 
+            # nothing unused
+            if len(uniques) == len(lev):
                 new_levels.append(lev)
                 new_labels.append(lab)
+                changed[i] = True
+                continue
+
+            unused = list(reversed(sorted(set(
+                np.arange(len(lev))) - set(uniques))))
+
+            # new levels are simple
+            lev = lev.take(uniques)
 
-            # nothing changed
-            if not changed.any():
-                return self
+            # new labels, we remove the unused
+            # by decrementing the labels for that value
+            # prob a better way
+            for u in unused:
+
+                lab = np.where(lab > u, lab - 1, lab)
+
+            new_levels.append(lev)
+            new_labels.append(lab)
+
+        # nothing changed
+        if not changed.any():
+            return self
 
         return MultiIndex(new_levels, new_labels,
                           names=self.names, sortorder=self.sortorder,
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -2411,18 +2411,6 @@ def test_is_monotonic(self):
 
         self.assertFalse(i.is_monotonic)
 
-    def test_reconstruct_api(self):
-
-        mi = MultiIndex.from_arrays([
-            ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
-        ])
-
-        with pytest.raises(ValueError):
-            mi._reconstruct()
-
-        with pytest.raises(ValueError):
-            mi._reconstruct(sort=True, remove_unused=True)
-
     def test_reconstruct_sort(self):
 
         # starts off lexsorted & monotonic
@@ -2432,7 +2420,7 @@ def test_reconstruct_sort(self):
         assert mi.is_lexsorted()
         assert mi.is_monotonic
 
-        recons = mi._reconstruct(sort=True)
+        recons = mi.sort_monotonic()
         assert recons.is_lexsorted()
         assert recons.is_monotonic
         assert mi is recons
@@ -2447,7 +2435,7 @@ def test_reconstruct_sort(self):
         assert not mi.is_lexsorted()
         assert not mi.is_monotonic
 
-        recons = mi._reconstruct(sort=True)
+        recons = mi.sort_monotonic()
         assert not recons.is_lexsorted()
         assert not recons.is_monotonic
 
@@ -2461,7 +2449,7 @@ def test_reconstruct_sort(self):
         assert not mi.is_lexsorted()
         assert not mi.is_monotonic
 
-        recons = mi._reconstruct(sort=True)
+        recons = mi.sort_monotonic()
         assert not recons.is_lexsorted()
         assert not recons.is_monotonic
 
@@ -2489,11 +2477,11 @@ def test_reconstruct_remove_unused(self):
                                       [2, 3]],
                               labels=[[0, 1], [0, 1]],
                               names=['first', 'second'])
-        result = df2.index._reconstruct(remove_unused=True)
+        result = df2.index.remove_unused_levels()
         tm.assert_index_equal(result, expected)
 
         # idempotent
-        result2 = result._reconstruct(remove_unused=True)
+        result2 = result.remove_unused_levels()
         tm.assert_index_equal(result2, expected)
         assert result2 is result
 
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
 
         # reconstruct
         result = df.sort_index().copy()
-        result.index = result.index._reconstruct(sort=True)
+        result.index = result.index.sort_monotonic()
         assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py
@@ -91,7 +91,7 @@ def test_multiindex_objects(self):
         mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                         labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                         names=['col1', 'col2'])
-        recons = mi._reconstruct(sort=True)
+        recons = mi.sort_monotonic()
 
         # these are equal
         assert mi.equals(recons)