Skip to content

Commit 396b9db

Browse files
committed
ENH: add MultiIndex.to_dataframe
ENH: allow hashing of MultiIndex (closes #12397)
1 parent 9309eba commit 396b9db

File tree

6 files changed

+72
-12
lines changed

6 files changed

+72
-12
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1460,6 +1460,7 @@ MultiIndex Components
14601460
MultiIndex.set_levels
14611461
MultiIndex.set_labels
14621462
MultiIndex.to_hierarchical
1463+
MultiIndex.to_dataframe
14631464
MultiIndex.is_lexsorted
14641465
MultiIndex.droplevel
14651466
MultiIndex.swaplevel

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ Other enhancements
131131
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
132132
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
133133
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
134-
134+
- ``MultiIndex`` has gained a ``.to_dataframe()`` method to convert to a ``DataFrame`` (:issue:`12397`)
135135
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`)
136136
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
137137
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)

pandas/indexes/multi.py

+23
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,29 @@ def _to_safe_for_reshape(self):
827827
""" convert to object if we are a categorical """
828828
return self.set_levels([i._to_safe_for_reshape() for i in self.levels])
829829

830+
def to_dataframe(self, index=True):
    """
    Create a DataFrame whose columns are the levels of this MultiIndex.

    Each level becomes one column, in level order. A column is labelled
    with the level's name, or with the level's integer position when the
    level is unnamed (name is None).

    .. versionadded:: 0.20.0

    Parameters
    ----------
    index : boolean, default True
        If True, set this MultiIndex as the index of the returned
        DataFrame; otherwise the DataFrame keeps the default index
        created by the constructor.

    Returns
    -------
    DataFrame
    """

    # imported here rather than at module level, as in the original code
    from pandas import DataFrame

    # Key the data by level position rather than by name: positions are
    # unique and already in level order, so neither duplicate names nor
    # dict key ordering can drop or reorder columns.
    result = DataFrame({level: self.get_level_values(level)
                        for level in range(len(self.levels))})

    # Only a missing name (None) falls back to the position; falsy but
    # valid names such as '' or 0 are kept as given.
    result.columns = [level if name is None else name
                      for level, name in enumerate(self.names)]

    if index:
        result.index = self
    return result
852+
830853
def to_hierarchical(self, n_repeat, n_shuffle=1):
831854
"""
832855
Return a MultiIndex reshaped to conform to the

pandas/tests/indexes/test_multi.py

+34
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,40 @@ def test_format_sparse_config(self):
13481348

13491349
warnings.filters = warn_filters
13501350

1351+
def test_to_dataframe(self):
    """Check MultiIndex.to_dataframe with and without the index kept."""
    pairs = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]

    # unnamed levels: columns are labelled by level position
    mi = MultiIndex.from_tuples(pairs)
    expected = DataFrame(pairs)
    tm.assert_frame_equal(mi.to_dataframe(index=False), expected)

    expected.index = mi
    tm.assert_frame_equal(mi.to_dataframe(), expected)

    # named levels: columns are labelled by level name
    level_names = ['first', 'second']
    mi = MultiIndex.from_tuples(pairs, names=level_names)
    expected = DataFrame(pairs)
    expected.columns = level_names
    tm.assert_frame_equal(mi.to_dataframe(index=False), expected)

    expected.index = mi
    tm.assert_frame_equal(mi.to_dataframe(), expected)

    # product of an integer range and a datetime level
    dates = pd.date_range('20130101', periods=3)
    mi = MultiIndex.from_product([range(5), dates])
    expected = DataFrame({
        0: np.repeat(np.arange(5, dtype='int64'), 3),
        1: np.tile(dates, 5),
    })
    tm.assert_frame_equal(mi.to_dataframe(index=False), expected)

    # same product index, this time kept as the frame's index
    expected.index = mi
    tm.assert_frame_equal(mi.to_dataframe(), expected)
1384+
13511385
def test_to_hierarchical(self):
13521386
index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (
13531387
2, 'two')])

pandas/tools/hashing.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
import numpy as np
6-
from pandas import _hash, Series, factorize, Categorical, Index
6+
from pandas import _hash, Series, DataFrame, factorize, Categorical, Index, MultiIndex
77
from pandas.lib import is_bool_array
88
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
99
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
@@ -45,6 +45,12 @@ def adder(h, hashed_to_add):
4545
h = np.multiply(h, np.uint(3), h)
4646
return np.add(h, hashed_to_add, h)
4747

48+
if isinstance(obj, MultiIndex):
49+
# efficiently turn us into a DataFrame and hash
50+
return hash_pandas_object(obj.to_dataframe(index=False),
51+
index=False, encoding=encoding,
52+
hash_key=hash_key, categorize=categorize)
53+
4854
if isinstance(obj, ABCIndexClass):
4955
h = hash_array(obj.values, encoding, hash_key,
5056
categorize).astype('uint64')

pandas/tools/tests/test_hashing.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
import pandas as pd
33

4-
from pandas import DataFrame, Series, Index
4+
from pandas import DataFrame, Series, Index, MultiIndex
55
from pandas.tools.hashing import hash_array, hash_pandas_object
66
import pandas.util.testing as tm
77

@@ -72,7 +72,11 @@ def test_hash_pandas_object(self):
7272
tm.makeMixedDataFrame(),
7373
tm.makeTimeDataFrame(),
7474
tm.makeTimeSeries(),
75-
tm.makeTimedeltaIndex()]:
75+
tm.makeTimedeltaIndex(),
76+
MultiIndex.from_product(
77+
[range(5),
78+
['foo', 'bar', 'baz'],
79+
pd.date_range('20130101', periods=2)])]:
7680
self.check_equal(obj)
7781
self.check_not_equal_with_index(obj)
7882

@@ -140,14 +144,6 @@ def f():
140144
hash_pandas_object(obj)
141145
self.assertRaises(TypeError, f)
142146

143-
# MultiIndex are represented as tuples
144-
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
145-
[('a', 1), ('a', 2), ('b', 1)]))
146-
147-
def f():
148-
hash_pandas_object(obj)
149-
self.assertRaises(TypeError, f)
150-
151147
def test_alread_encoded(self):
152148
# if already encoded then ok
153149

0 commit comments

Comments
 (0)