diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 39082ef7a4c69..0cc2cea774bbd 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -62,8 +62,9 @@ The :class:`MultiIndex` object is the hierarchical analogue of the standard can think of ``MultiIndex`` as an array of tuples where each tuple is unique. A ``MultiIndex`` can be created from a list of arrays (using :meth:`MultiIndex.from_arrays`), an array of tuples (using -:meth:`MultiIndex.from_tuples`), or a crossed set of iterables (using -:meth:`MultiIndex.from_product`). The ``Index`` constructor will attempt to return +:meth:`MultiIndex.from_tuples`), a crossed set of iterables (using +:meth:`MultiIndex.from_product`), or a :class:`DataFrame` (using +:meth:`MultiIndex.from_frame`). The ``Index`` constructor will attempt to return a ``MultiIndex`` when it is passed a list of tuples. The following examples demonstrate different ways to initialize MultiIndexes. @@ -89,6 +90,19 @@ to use the :meth:`MultiIndex.from_product` method: iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] pd.MultiIndex.from_product(iterables, names=['first', 'second']) +You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using +the method :meth:`MultiIndex.from_frame`. This is a complementary method to +:meth:`MultiIndex.to_frame`. + +.. versionadded:: 0.24.0 + +.. 
ipython:: python + + df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], + ['foo', 'one'], ['foo', 'two']], + columns=['first', 'second']) + pd.MultiIndex.from_frame(df) + As a convenience, you can pass a list of arrays directly into ``Series`` or ``DataFrame`` to construct a ``MultiIndex`` automatically: diff --git a/doc/source/api.rst b/doc/source/api.rst index 1a23587d2ebb5..49c89a53e7b17 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1703,6 +1703,7 @@ MultiIndex Constructors MultiIndex.from_arrays MultiIndex.from_tuples MultiIndex.from_product + MultiIndex.from_frame MultiIndex Attributes ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0b2b526dfe9e7..ab4b7f3c41fed 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -378,6 +378,7 @@ Backwards incompatible API changes - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) +- The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) .. 
_whatsnew_0240.api_breaking.deps: @@ -1433,6 +1434,7 @@ MultiIndex - Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) - :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a ``Series`` or ``DataFrame`` with a :class:`MultiIndex` index) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- :class:`MultiIndex` has gained the :meth:`MultiIndex.from_frame` method, which allows constructing a :class:`MultiIndex` object from a :class:`DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) I/O diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c4ae7ef54bfce..be0856e1a825a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,4 +1,5 @@ # pylint: disable=E1101,E1103,W0232 +from collections import OrderedDict import datetime from sys import getsizeof import warnings @@ -18,6 +19,7 @@ is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, pandas_dtype) from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype +from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna import pandas.core.algorithms as algos @@ -125,25 +127,25 @@ class MultiIndex(Index): Parameters ---------- levels : sequence of arrays - The unique labels for each level + The unique labels for each level. codes : sequence of arrays - Integers for each level designating which label at each location + Integers for each level designating which label at each location. .. 
versionadded:: 0.24.0 labels : sequence of arrays - Integers for each level designating which label at each location + Integers for each level designating which label at each location. .. deprecated:: 0.24.0 Use ``codes`` instead sortorder : optional int Level of sortedness (must be lexicographically sorted by that - level) + level). names : optional sequence of objects - Names for each of the index levels. (name is accepted for compat) - copy : boolean, default False - Copy the meta-data - verify_integrity : boolean, default True - Check that the levels/codes are consistent and valid + Names for each of the index levels. (name is accepted for compat). + copy : bool, default False + Copy the meta-data. + verify_integrity : bool, default True + Check that the levels/codes are consistent and valid. Attributes ---------- @@ -158,6 +160,7 @@ class MultiIndex(Index): from_arrays from_tuples from_product + from_frame set_levels set_codes to_frame @@ -175,13 +178,9 @@ class MultiIndex(Index): MultiIndex.from_product : Create a MultiIndex from the cartesian product of iterables. MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. Index : The base pandas Index type. - Notes - ----- - See the `user guide - <http://pandas.pydata.org/pandas-docs/stable/advanced.html>`_ for more. - Examples --------- A new ``MultiIndex`` is typically constructed using one of the helper @@ -196,6 +195,11 @@ class MultiIndex(Index): See further examples for how to construct a MultiIndex in the doc strings of the mentioned helper methods. + + Notes + ----- + See the `user guide + <http://pandas.pydata.org/pandas-docs/stable/advanced.html>`_ for more. """ # initialize to zero-length tuples to make everything work @@ -288,7 +292,7 @@ def _verify_integrity(self, codes=None, levels=None): @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ - Convert arrays to MultiIndex + Convert arrays to MultiIndex. 
Parameters ---------- @@ -297,7 +301,9 @@ def from_arrays(cls, arrays, sortorder=None, names=None): len(arrays) is the number of levels. sortorder : int or None Level of sortedness (must be lexicographically sorted by that - level) + level). + names : list / sequence of str, optional + Names for the levels in the index. Returns ------- @@ -308,11 +314,15 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. Examples -------- >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) """ if not is_list_like(arrays): raise TypeError("Input must be a list / sequence of array-likes.") @@ -337,7 +347,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): @classmethod def from_tuples(cls, tuples, sortorder=None, names=None): """ - Convert list of tuples to MultiIndex + Convert list of tuples to MultiIndex. Parameters ---------- @@ -345,7 +355,9 @@ def from_tuples(cls, tuples, sortorder=None, names=None): Each tuple is the index of one row/column. sortorder : int or None Level of sortedness (must be lexicographically sorted by that - level) + level). + names : list / sequence of str, optional + Names for the levels in the index. Returns ------- @@ -353,15 +365,19 @@ def from_tuples(cls, tuples, sortorder=None, names=None): See Also -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. 
Examples -------- >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] + ... (2, u'red'), (2, u'blue')] >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) """ if not is_list_like(tuples): raise TypeError('Input must be a list / sequence of tuple-likes.') @@ -388,7 +404,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): @classmethod def from_product(cls, iterables, sortorder=None, names=None): """ - Make a MultiIndex from the cartesian product of multiple iterables + Make a MultiIndex from the cartesian product of multiple iterables. Parameters ---------- @@ -397,7 +413,7 @@ def from_product(cls, iterables, sortorder=None, names=None): sortorder : int or None Level of sortedness (must be lexicographically sorted by that level). - names : list / sequence of strings or None + names : list / sequence of str, optional Names for the levels in the index. Returns @@ -408,16 +424,17 @@ def from_product(cls, iterables, sortorder=None, names=None): -------- MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. Examples -------- >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] + >>> colors = ['green', 'purple'] >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + ... 
names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], ['green', 'purple']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) + names=['number', 'color']) """ from pandas.core.arrays.categorical import _factorize_from_iterables from pandas.core.reshape.util import cartesian_product @@ -431,6 +448,68 @@ def from_product(cls, iterables, sortorder=None, names=None): codes = cartesian_product(codes) return MultiIndex(levels, codes, sortorder=sortorder, names=names) + @classmethod + def from_frame(cls, df, sortorder=None, names=None): + """ + Make a MultiIndex from a DataFrame. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... 
columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + + >>> pd.MultiIndex.from_frame(df) + MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['state', 'observation']) + """ + if not isinstance(df, ABCDataFrame): + raise TypeError("Input must be a DataFrame") + + column_names, columns = lzip(*df.iteritems()) + names = column_names if names is None else names + return cls.from_arrays(columns, sortorder=sortorder, names=names) + # -------------------------------------------------------------------- @property @@ -1386,11 +1465,16 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - result = DataFrame({(name or level): - self._get_level_values(level) - for name, level in - zip(idx_names, range(len(self.levels)))}, - copy=False) + # Guarantee resulting column order + result = DataFrame( + OrderedDict([ + ((level if name is None else name), + self._get_level_values(level)) + for name, level in zip(idx_names, range(len(self.levels))) + ]), + copy=False + ) + if index: result.index = self return result diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index d80395e513497..e6678baf8a996 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from collections import OrderedDict import re import numpy as np @@ -108,6 +109,9 @@ def test_copy_in_constructor(): assert mi.levels[0][0] == val +# ---------------------------------------------------------------------------- +# from_arrays +# ---------------------------------------------------------------------------- def 
test_from_arrays(idx): arrays = [np.asarray(lev).take(level_codes) for lev, level_codes in zip(idx.levels, idx.codes)] @@ -278,6 +282,9 @@ def test_from_arrays_different_lengths(idx1, idx2): MultiIndex.from_arrays([idx1, idx2]) +# ---------------------------------------------------------------------------- +# from_tuples +# ---------------------------------------------------------------------------- def test_from_tuples(): msg = 'Cannot infer number of levels from empty list' with pytest.raises(TypeError, match=msg): @@ -321,6 +328,28 @@ def test_from_tuples_index_values(idx): assert (result.values == idx.values).all() +def test_tuples_with_name_string(): + # GH 15110 and GH 14848 + + li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] + with pytest.raises(ValueError): + pd.Index(li, name='abc') + with pytest.raises(ValueError): + pd.Index(li, name='a') + + +def test_from_tuples_with_tuple_label(): + # GH 15457 + expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], + columns=['a', 'b', 'c']).set_index(['a', 'b']) + idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) + result = pd.DataFrame([2, 3], columns=['c'], index=idx) + tm.assert_frame_equal(expected, result) + + +# ---------------------------------------------------------------------------- +# from_product +# ---------------------------------------------------------------------------- def test_from_product_empty_zero_levels(): # 0 levels msg = "Must pass non-zero number of levels/codes" @@ -470,20 +499,79 @@ def test_create_index_existing_name(idx): tm.assert_index_equal(result, expected) -def test_tuples_with_name_string(): - # GH 15110 and GH 14848 - - li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] - with pytest.raises(ValueError): - pd.Index(li, name='abc') - with pytest.raises(ValueError): - pd.Index(li, name='a') - - -def test_from_tuples_with_tuple_label(): - # GH 15457 - expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], - columns=['a', 'b', 'c']).set_index(['a', 'b']) - idx = 
pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) - result = pd.DataFrame([2, 3], columns=['c'], index=idx) - tm.assert_frame_equal(expected, result) +# ---------------------------------------------------------------------------- +# from_frame +# ---------------------------------------------------------------------------- +def test_from_frame(): + # GH 22420 + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=['L1', 'L2']) + expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), + ('b', 'a'), ('b', 'b')], + names=['L1', 'L2']) + result = pd.MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +@pytest.mark.parametrize('non_frame', [ + pd.Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + pd.Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 27 +]) +def test_from_frame_error(non_frame): + # GH 22420 + with pytest.raises(TypeError, match='Input must be a DataFrame'): + pd.MultiIndex.from_frame(non_frame) + + +def test_from_frame_dtype_fidelity(): + # GH 22420 + df = pd.DataFrame(OrderedDict([ + ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), + ('a', [1, 1, 1, 2, 2, 2]), + ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), + ('c', ['x', 'x', 'y', 'z', 'x', 'y']) + ])) + original_dtypes = df.dtypes.to_dict() + + expected_mi = pd.MultiIndex.from_arrays([ + pd.date_range('19910905', periods=6, tz='US/Eastern'), + [1, 1, 1, 2, 2, 2], + pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + ['x', 'x', 'y', 'z', 'x', 'y'] + ], names=['dates', 'a', 'b', 'c']) + mi = pd.MultiIndex.from_frame(df) + mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + tm.assert_index_equal(expected_mi, mi) + assert original_dtypes == mi_dtypes + + +@pytest.mark.parametrize('names_in,names_out', [ + (None, [('L1', 'x'), ('L2', 'y')]), + (['x', 'y'], ['x', 'y']), +]) +def test_from_frame_valid_names(names_in, names_out): + # GH 
22420 + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ('L2', 'y')])) + mi = pd.MultiIndex.from_frame(df, names=names_in) + assert mi.names == names_out + + +@pytest.mark.parametrize('names_in,names_out', [ + ('bad_input', ValueError("Names should be list-like for a MultiIndex")), + (['a', 'b', 'c'], ValueError("Length of names must match number of " + "levels in MultiIndex.")) +]) +def test_from_frame_invalid_names(names_in, names_out): + # GH 22420 + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ('L2', 'y')])) + with pytest.raises(type(names_out), match=names_out.args[0]): + pd.MultiIndex.from_frame(df, names=names_in) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index b72fadfeeab72..0c483873a335e 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from collections import OrderedDict + import pytest import numpy as np @@ -83,6 +85,39 @@ def test_to_frame(): tm.assert_frame_equal(result, expected) +def test_to_frame_dtype_fidelity(): + # GH 22420 + mi = pd.MultiIndex.from_arrays([ + pd.date_range('19910905', periods=6, tz='US/Eastern'), + [1, 1, 1, 2, 2, 2], + pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + ['x', 'x', 'y', 'z', 'x', 'y'] + ], names=['dates', 'a', 'b', 'c']) + original_dtypes = {name: mi.levels[i].dtype + for i, name in enumerate(mi.names)} + + expected_df = pd.DataFrame(OrderedDict([ + ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), + ('a', [1, 1, 1, 2, 2, 2]), + ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), + ('c', ['x', 'x', 'y', 'z', 'x', 'y']) + ])) + df = mi.to_frame(index=False) + df_dtypes = df.dtypes.to_dict() + + tm.assert_frame_equal(df, expected_df) + assert original_dtypes == 
df_dtypes + + +def test_to_frame_resulting_column_order(): + # GH 22420 + expected = ['z', 0, 'a'] + mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], + ['q', 'w', 'e']], names=expected) + result = mi.to_frame().columns.tolist() + assert result == expected + + def test_to_hierarchical(): index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')])