pandas-dev · jreback · Dec 9, 2018 · Oct 13, 2018 · Oct 13, 2018 · Oct 13, 2018
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1702,6 +1702,7 @@ MultiIndex Constructors
    MultiIndex.from_arrays
    MultiIndex.from_tuples
    MultiIndex.from_product
+   MultiIndex.from_frame
 
 MultiIndex Attributes
 ~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -657,6 +657,10 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
 
     df.to_dict(orient='index')
 
+.. _whatsnew_0240.api_breaking.multiindex_to_frame_ordering:
+
+The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. (:issue:`22420`)
+
 .. _whatsnew_0240.api.datetimelike.normalize:
 
 Tick DateOffset Normalize Restrictions
@@ -1337,6 +1341,7 @@ MultiIndex
 
 - Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`)
 - :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a ``Series`` or ``DataFrame`` with a :class:`MultiIndex` index) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`)
+- :class:`MultiIndex` has gained the :meth:`MultiIndex.from_frame` method, which allows constructing a :class:`MultiIndex` object from a :class:`DataFrame` (:issue:`22420`)
 - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`)
 
 I/O

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1,6 +1,7 @@
 
 # pylint: disable=E1101,E1103,W0232
 import datetime
+from collections import OrderedDict
 from sys import getsizeof
 import warnings
 
@@ -19,6 +20,7 @@
     is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
     pandas_dtype)
 from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype
+from pandas.core.dtypes.generic import ABCDataFrame
 from pandas.core.dtypes.missing import array_equivalent, isna
 
 import pandas.core.algorithms as algos
@@ -180,6 +182,7 @@ class MultiIndex(Index):
     from_arrays
     from_tuples
     from_product
+    from_frame
     set_levels
     set_labels
     to_frame
@@ -1184,11 +1187,17 @@ def to_frame(self, index=True, name=None):
         else:
             idx_names = self.names
 
-        result = DataFrame({(name or level):
-                            self._get_level_values(level)
-                            for name, level in
-                            zip(idx_names, range(len(self.levels)))},
-                           copy=False)
+        # Guarantee resulting column order
+        result = DataFrame(
+            OrderedDict([
+                ((level if name is None else name),
+                 self._get_level_values(level))
+                for name, level in zip(idx_names, range(len(self.levels)))
+            ]),
+            copy=False
+        )
+
+
         if index:
             result.index = self
         return result
@@ -1317,6 +1326,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
         MultiIndex.from_product : Make a MultiIndex from cartesian product
                                   of iterables.
+        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
         """
         if not is_list_like(arrays):
             raise TypeError("Input must be a list / sequence of array-likes.")
@@ -1366,6 +1376,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None):
         MultiIndex.from_arrays : Convert list of arrays to MultiIndex
         MultiIndex.from_product : Make a MultiIndex from cartesian product
                                   of iterables
+        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
         """
         if not is_list_like(tuples):
             raise TypeError('Input must be a list / sequence of tuple-likes.')
@@ -1422,6 +1433,7 @@ def from_product(cls, iterables, sortorder=None, names=None):
         --------
         MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
         MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
         """
         from pandas.core.arrays.categorical import _factorize_from_iterables
         from pandas.core.reshape.util import cartesian_product
@@ -1435,6 +1447,89 @@ def from_product(cls, iterables, sortorder=None, names=None):
         labels = cartesian_product(labels)
         return MultiIndex(levels, labels, sortorder=sortorder, names=names)
 
+    @classmethod
+    def from_frame(cls, df, sortorder=None, names=None):
+        """
+        Make a MultiIndex from a DataFrame.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            DataFrame to be converted to MultiIndex.
+        sortorder : int or None
+            Level of sortedness (must be lexicographically sorted by that
+            level).
+        names : list-like, optional
+            If no names are provided, use the column names, or tuple of column
+            names if the columns are a MultiIndex. If a sequence, overwrite
+            names with the given sequence.
+
+        Returns
+        -------
+        MultiIndex
+            The MultiIndex representation of the given DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'],
+        ...                    [1, 'jolly'], [2, 'joy'], [2, 'joy']],
+        ...                   columns=['will_be', 'used'])
+        >>> df
+           will_be   used
+        0        0  happy
+        1        0  jolly
+        2        1  happy
+        3        1  jolly
+        4        2    joy
+        5        2    joy
+        >>> pd.MultiIndex.from_frame(df)
+        MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']],
+                   labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]],
+                   names=['will_be', 'used'])
+
+        >>> df = pd.DataFrame([['ahc', 'iam'], ['ahc', 'wim'], ['boh', 'amg'],
+        ...                    ['boh', 'iam'], ['oil', 'wim'], ['oil', 'amg']],
+        ...                   columns=['will_be', 'overridden'])
+        >>> df
+           will_be  overridden
+        0      ahc         iam
+        1      ahc         wim
+        2      boh         amg
+        3      boh         iam
+        4      oil         wim
+        5      oil         amg
+        >>> pd.MultiIndex.from_frame(df, names=['sure', 'will'])
+        MultiIndex(levels=[['ahc', 'boh', 'oil'], ['amg', 'iam', 'wim']],
+                   labels=[[0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]],
+                   names=['sure', 'will'])
+
+        See Also
+        --------
+        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
+        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+        MultiIndex.from_product : Make a MultiIndex from cartesian product
+                                  of iterables.
+        """
+        if not isinstance(df, ABCDataFrame):
+            raise TypeError("Input must be a DataFrame")
+
+        column_names, columns = lzip(*df.iteritems())
+
+        # Default to the column labels; otherwise validate the override
+        if names is None:
+            names = column_names
+        elif is_list_like(names):
+            if len(names) != len(df.columns):
+                raise ValueError("'names' should have same length as "
+                                 "number of columns in df.")
+        else:
+            raise TypeError("'names' must be a list / sequence of column "
+                            "names.")
+
+        return cls.from_arrays(columns, sortorder=sortorder, names=names)
+
     def _sort_levels_monotonic(self):
         """
         .. versionadded:: 0.20.0

diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import re
+from collections import OrderedDict
 
 import numpy as np
 import pytest
@@ -99,6 +100,9 @@ def test_copy_in_constructor():
     assert mi.levels[0][0] == val
 
 
+# ----------------------------------------------------------------------------
+# from_arrays
+# ----------------------------------------------------------------------------
 def test_from_arrays(idx):
     arrays = []
     for lev, lab in zip(idx.levels, idx.labels):
@@ -271,6 +275,9 @@ def test_from_arrays_different_lengths(idx1, idx2):
         MultiIndex.from_arrays([idx1, idx2])
 
 
+# ----------------------------------------------------------------------------
+# from_tuples
+# ----------------------------------------------------------------------------
 def test_from_tuples():
     msg = 'Cannot infer number of levels from empty list'
     with pytest.raises(TypeError, match=msg):
@@ -314,6 +321,28 @@ def test_from_tuples_index_values(idx):
     assert (result.values == idx.values).all()
 
 
+def test_tuples_with_name_string():
+    # GH 15110 and GH 14848: a string name with tuple data raises ValueError
+
+    li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)]
+    with pytest.raises(ValueError):
+        pd.Index(li, name='abc')
+    with pytest.raises(ValueError):
+        pd.Index(li, name='a')
+
+
+def test_from_tuples_with_tuple_label():
+    # GH 15457: level 'b' holds both a scalar label and a tuple label
+    expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]],
+                            columns=['a', 'b', 'c']).set_index(['a', 'b'])
+    idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b'))
+    result = pd.DataFrame([2, 3], columns=['c'], index=idx)
+    tm.assert_frame_equal(expected, result)
+
+
+# ----------------------------------------------------------------------------
+# from_product
+# ----------------------------------------------------------------------------
 def test_from_product_empty_zero_levels():
     # 0 levels
     msg = "Must pass non-zero number of levels/labels"
@@ -463,20 +492,71 @@ def test_create_index_existing_name(idx):
     tm.assert_index_equal(result, expected)
 
 
-def test_tuples_with_name_string():
-    # GH 15110 and GH 14848
-
-    li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)]
-    with pytest.raises(ValueError):
-        pd.Index(li, name='abc')
-    with pytest.raises(ValueError):
-        pd.Index(li, name='a')
-
-
-def test_from_tuples_with_tuple_label():
-    # GH 15457
-    expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]],
-                            columns=['a', 'b', 'c']).set_index(['a', 'b'])
-    idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b'))
-    result = pd.DataFrame([2, 3], columns=['c'], index=idx)
-    tm.assert_frame_equal(expected, result)
+# ----------------------------------------------------------------------------
+# from_frame
+# ----------------------------------------------------------------------------
+def test_from_frame():
+    # GH 22420: result must equal from_tuples built from the same rows
+    df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']],
+                      columns=['L1', 'L2'])
+    expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'),
+                                          ('b', 'a'), ('b', 'b')],
+                                         names=['L1', 'L2'])
+    result = pd.MultiIndex.from_frame(df)
+    tm.assert_index_equal(expected, result)
+
+
+@pytest.mark.parametrize('non_frame', [
+    pd.Series([1, 2, 3, 4]),
+    [1, 2, 3, 4],
+    [[1, 2], [3, 4], [5, 6]],
+    pd.Index([1, 2, 3, 4]),
+    np.array([[1, 2], [3, 4], [5, 6]]),
+    27
+])
+def test_from_frame_non_frame(non_frame):
+    # GH 22420: anything that is not a DataFrame must raise TypeError
+    with pytest.raises(TypeError, match='Input must be a DataFrame'):
+        pd.MultiIndex.from_frame(non_frame)
+
+
+def test_from_frame_dtype_fidelity():
+    # GH 22420: each level must keep the dtype of its source column
+    df = pd.DataFrame(OrderedDict([
+        ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')),
+        ('a', [1, 1, 1, 2, 2, 2]),
+        ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)),
+        ('c', ['x', 'x', 'y', 'z', 'x', 'y'])
+    ]))
+    original_dtypes = df.dtypes.to_dict()
+
+    expected_mi = pd.MultiIndex.from_arrays([
+        pd.date_range('19910905', periods=6, tz='US/Eastern'),
+        [1, 1, 1, 2, 2, 2],
+        pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True),
+        ['x', 'x', 'y', 'z', 'x', 'y']
+    ], names=['dates', 'a', 'b', 'c'])
+    mi = pd.MultiIndex.from_frame(df)
+    mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)}
+
+    tm.assert_index_equal(expected_mi, mi)
+    assert original_dtypes == mi_dtypes
+
+
+@pytest.mark.parametrize('names_in,names_out', [
+    (None, [('L1', 'x'), ('L2', 'y')]),
+    (['x', 'y'], ['x', 'y']),
+    ('bad_input', None),
+])
+def test_from_frame_names(names_in, names_out):
+    # GH 22420: names_out of None marks the case expected to raise TypeError
+    df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']],
+                      columns=pd.MultiIndex.from_tuples([('L1', 'x'),
+                                                         ('L2', 'y')]))
+    if names_out is None:
+        with pytest.raises(TypeError, match="'names' must be a list / "
+                                            "sequence of column names."):
+            pd.MultiIndex.from_frame(df, names=names_in)
+    else:
+        mi = pd.MultiIndex.from_frame(df, names=names_in)
+        assert mi.names == names_out
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 
+from collections import OrderedDict
+
 import pytest
 import numpy as np
 
@@ -83,6 +85,39 @@ def test_to_frame():
     tm.assert_frame_equal(result, expected)
 
 
+def test_to_frame_dtype_fidelity():
+    # GH 22420: each column must keep the dtype of its source level
+    mi = pd.MultiIndex.from_arrays([
+        pd.date_range('19910905', periods=6, tz='US/Eastern'),
+        [1, 1, 1, 2, 2, 2],
+        pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True),
+        ['x', 'x', 'y', 'z', 'x', 'y']
+    ], names=['dates', 'a', 'b', 'c'])
+    original_dtypes = {name: mi.levels[i].dtype
+                       for i, name in enumerate(mi.names)}
+
+    expected_df = pd.DataFrame(OrderedDict([
+        ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')),
+        ('a', [1, 1, 1, 2, 2, 2]),
+        ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)),
+        ('c', ['x', 'x', 'y', 'z', 'x', 'y'])
+    ]))
+    df = mi.to_frame(index=False)
+    df_dtypes = df.dtypes.to_dict()
+
+    tm.assert_frame_equal(df, expected_df)
+    assert original_dtypes == df_dtypes
+
+
+def test_to_frame_resulting_column_order():
+    # GH 22420: columns follow the names order, including the integer name
+    expected = ['z', 0, 'a']
+    mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'],
+                                   ['q', 'w', 'e']], names=expected)
+    result = mi.to_frame().columns.tolist()
+    assert result == expected
+
+
 def test_to_hierarchical():
     index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (
         2, 'two')])