-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: MultiIndex.from_frame #23141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: MultiIndex.from_frame #23141
Changes from 44 commits
79bdecb
fa82618
64b45d6
64c7bb1
3ee676c
fd266f5
4bc8f5b
9d92b70
45595ad
3530cd3
1c22791
cf78780
64c2750
ede030b
190c341
e0df632
78ff5c2
0252db9
d98c8a9
8a1906e
08c120f
8353c3f
9df3c11
6d4915e
b5df7b2
ab3259c
cf95261
63051d7
a75a4a5
8d23df9
c8d696d
7cf82d1
1a282e5
b3c6a90
c760359
bb69314
9e11180
96c6af3
a5236bf
c78f364
14bfea8
6960804
11c5947
904644a
30fe0df
ec60563
8fc6609
9b906c6
e416122
4ef9ec4
4240a1e
9159b2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||
# pylint: disable=E1101,E1103,W0232 | ||||||
import datetime | ||||||
from collections import OrderedDict | ||||||
from sys import getsizeof | ||||||
import warnings | ||||||
|
||||||
|
@@ -18,6 +19,7 @@ | |||||
is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, | ||||||
pandas_dtype) | ||||||
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype | ||||||
from pandas.core.dtypes.generic import ABCDataFrame | ||||||
from pandas.core.dtypes.missing import array_equivalent, isna | ||||||
|
||||||
import pandas.core.algorithms as algos | ||||||
|
@@ -179,6 +181,7 @@ class MultiIndex(Index): | |||||
from_arrays | ||||||
from_tuples | ||||||
from_product | ||||||
from_frame | ||||||
set_levels | ||||||
set_labels | ||||||
to_frame | ||||||
|
@@ -305,6 +308,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): | |||||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||||||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||||||
of iterables. | ||||||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||||||
""" | ||||||
if not is_list_like(arrays): | ||||||
raise TypeError("Input must be a list / sequence of array-likes.") | ||||||
|
@@ -354,6 +358,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): | |||||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex | ||||||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||||||
of iterables | ||||||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||||||
""" | ||||||
if not is_list_like(tuples): | ||||||
raise TypeError('Input must be a list / sequence of tuple-likes.') | ||||||
|
@@ -410,6 +415,7 @@ def from_product(cls, iterables, sortorder=None, names=None): | |||||
-------- | ||||||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex. | ||||||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||||||
MultiIndex.from_frame : Make a MultiIndex from a DataFrame. | ||||||
""" | ||||||
from pandas.core.arrays.categorical import _factorize_from_iterables | ||||||
from pandas.core.reshape.util import cartesian_product | ||||||
|
@@ -423,6 +429,78 @@ def from_product(cls, iterables, sortorder=None, names=None): | |||||
labels = cartesian_product(labels) | ||||||
return MultiIndex(levels, labels, sortorder=sortorder, names=names) | ||||||
|
||||||
@classmethod | ||||||
def from_frame(cls, df, sortorder=None, names=None): | ||||||
""" | ||||||
Make a MultiIndex from a DataFrame. | ||||||
|
||||||
.. versionadded:: 0.24.0 | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
df : pd.DataFrame | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||||||
DataFrame to be converted to MultiIndex. | ||||||
sortorder : int or None | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
And please explain what it means to not be provided ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||
Level of sortedness (must be lexicographically sorted by that | ||||||
level). | ||||||
names : list-like, optional | ||||||
If no names are provided, use the column names, or tuples of column | ||||||
names if the columns are a MultiIndex. If a sequence is given, it | ||||||
overrides the column names. | ||||||
|
||||||
Returns | ||||||
------- | ||||||
MultiIndex or Index | ||||||
The MultiIndex representation of the given DataFrame. | ||||||
datapythonista marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
Examples | ||||||
-------- | ||||||
>>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], | ||||||
... [1, 'jolly'], [2, 'joy'], [2, 'joy']], | ||||||
... columns=['will_be', 'used']) | ||||||
>>> df | ||||||
will_be used | ||||||
0 0 happy | ||||||
1 0 jolly | ||||||
2 1 happy | ||||||
3 1 jolly | ||||||
4 2 joy | ||||||
5 2 joy | ||||||
>>> pd.MultiIndex.from_frame(df) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a blank line between cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. |
||||||
MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], | ||||||
labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], | ||||||
names=['will_be', 'used']) | ||||||
|
||||||
>>> df = pd.DataFrame([['ahc', 'iam'], ['ahc', 'wim'], ['boh', 'amg'], | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a 1-line expln here (I think the first one is self-explanatorY) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. |
||||||
... ['boh', 'iam'], ['oil', 'wim'], ['oil', 'amg']], | ||||||
... columns=['will_be', 'overridden']) | ||||||
>>> df | ||||||
will_be overridden | ||||||
0 ahc iam | ||||||
1 ahc wim | ||||||
2 boh amg | ||||||
3 boh iam | ||||||
4 oil wim | ||||||
5 oil amg | ||||||
>>> pd.MultiIndex.from_frame(df, names=['sure', 'will']) | ||||||
MultiIndex(levels=[['ahc', 'boh', 'oil'], ['amg', 'iam', 'wim']], | ||||||
labels=[[0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]], | ||||||
names=['sure', 'will']) | ||||||
|
||||||
See Also | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This goes before the Examples section. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed this docstring and all the other constructor methods docstrings (since I had to modify them to update the See Alsos). I also fixed the pd.MultiIndex docstring to the best of my abilities (since I had to make some small modifications to that too). However, there were still some issues with the MultiIndex docstring:
Any ideas on how I should address these? Looks like labels is there to serve as a deprecation reminder. |
||||||
-------- | ||||||
MultiIndex.from_arrays : Convert list of arrays to MultiIndex. | ||||||
MultiIndex.from_tuples : Convert list of tuples to MultiIndex. | ||||||
MultiIndex.from_product : Make a MultiIndex from cartesian product | ||||||
of iterables. | ||||||
""" | ||||||
if not isinstance(df, ABCDataFrame): | ||||||
raise TypeError("Input must be a DataFrame") | ||||||
|
||||||
column_names, columns = lzip(*df.iteritems()) | ||||||
names = column_names if names is None else names | ||||||
return cls.from_arrays(columns, sortorder=sortorder, names=names) | ||||||
|
||||||
# -------------------------------------------------------------------- | ||||||
|
||||||
@property | ||||||
|
@@ -1358,11 +1436,17 @@ def to_frame(self, index=True, name=None): | |||||
else: | ||||||
idx_names = self.names | ||||||
|
||||||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
result = DataFrame({(name or level): | ||||||
self._get_level_values(level) | ||||||
for name, level in | ||||||
zip(idx_names, range(len(self.levels)))}, | ||||||
copy=False) | ||||||
# Guarantee resulting column order | ||||||
result = DataFrame( | ||||||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
OrderedDict([ | ||||||
((level if name is None else name), | ||||||
self._get_level_values(level)) | ||||||
for name, level in zip(idx_names, range(len(self.levels))) | ||||||
]), | ||||||
copy=False | ||||||
) | ||||||
|
||||||
|
||||||
if index: | ||||||
result.index = self | ||||||
return result | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import re | ||
from collections import OrderedDict | ||
|
||
import numpy as np | ||
import pytest | ||
|
@@ -99,6 +100,9 @@ def test_copy_in_constructor(): | |
assert mi.levels[0][0] == val | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_arrays | ||
# ---------------------------------------------------------------------------- | ||
def test_from_arrays(idx): | ||
arrays = [np.asarray(lev).take(lab) | ||
for lev, lab in zip(idx.levels, idx.labels)] | ||
|
@@ -269,6 +273,9 @@ def test_from_arrays_different_lengths(idx1, idx2): | |
MultiIndex.from_arrays([idx1, idx2]) | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_tuples | ||
# ---------------------------------------------------------------------------- | ||
def test_from_tuples(): | ||
msg = 'Cannot infer number of levels from empty list' | ||
with pytest.raises(TypeError, match=msg): | ||
|
@@ -312,6 +319,28 @@ def test_from_tuples_index_values(idx): | |
assert (result.values == idx.values).all() | ||
|
||
|
||
def test_tuples_with_name_string(): | ||
# GH 15110 and GH 14848 | ||
|
||
li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='abc') | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='a') | ||
|
||
|
||
def test_from_tuples_with_tuple_label(): | ||
# GH 15457 | ||
expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], | ||
columns=['a', 'b', 'c']).set_index(['a', 'b']) | ||
idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) | ||
result = pd.DataFrame([2, 3], columns=['c'], index=idx) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
# from_product | ||
# ---------------------------------------------------------------------------- | ||
def test_from_product_empty_zero_levels(): | ||
# 0 levels | ||
msg = "Must pass non-zero number of levels/labels" | ||
|
@@ -461,20 +490,72 @@ def test_create_index_existing_name(idx): | |
tm.assert_index_equal(result, expected) | ||
|
||
|
||
def test_tuples_with_name_string(): | ||
# GH 15110 and GH 14848 | ||
|
||
li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='abc') | ||
with pytest.raises(ValueError): | ||
pd.Index(li, name='a') | ||
|
||
|
||
def test_from_tuples_with_tuple_label(): | ||
# GH 15457 | ||
expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], | ||
columns=['a', 'b', 'c']).set_index(['a', 'b']) | ||
idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) | ||
result = pd.DataFrame([2, 3], columns=['c'], index=idx) | ||
tm.assert_frame_equal(expected, result) | ||
# ---------------------------------------------------------------------------- | ||
# from_frame | ||
# ---------------------------------------------------------------------------- | ||
def test_from_frame(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH 22420 | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=['L1', 'L2']) | ||
expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), | ||
('b', 'a'), ('b', 'b')], | ||
names=['L1', 'L2']) | ||
result = pd.MultiIndex.from_frame(df) | ||
tm.assert_index_equal(expected, result) | ||
|
||
|
||
@pytest.mark.parametrize('non_frame', [ | ||
pd.Series([1, 2, 3, 4]), | ||
[1, 2, 3, 4], | ||
[[1, 2], [3, 4], [5, 6]], | ||
pd.Index([1, 2, 3, 4]), | ||
np.array([[1, 2], [3, 4], [5, 6]]), | ||
27 | ||
]) | ||
def test_from_frame_non_frame(non_frame): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. rename to test_from_frame_error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. |
||
# GH 22420 | ||
with pytest.raises(TypeError, match='Input must be a DataFrame'): | ||
pd.MultiIndex.from_frame(non_frame) | ||
|
||
|
||
def test_from_frame_dtype_fidelity(): | ||
# GH 22420 | ||
df = pd.DataFrame(OrderedDict([ | ||
('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), | ||
('a', [1, 1, 1, 2, 2, 2]), | ||
('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), | ||
('c', ['x', 'x', 'y', 'z', 'x', 'y']) | ||
])) | ||
original_dtypes = df.dtypes.to_dict() | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
expected_mi = pd.MultiIndex.from_arrays([ | ||
pd.date_range('19910905', periods=6, tz='US/Eastern'), | ||
[1, 1, 1, 2, 2, 2], | ||
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
['x', 'x', 'y', 'z', 'x', 'y'] | ||
], names=['dates', 'a', 'b', 'c']) | ||
mi = pd.MultiIndex.from_frame(df) | ||
mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} | ||
|
||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tm.assert_index_equal(expected_mi, mi) | ||
assert original_dtypes == mi_dtypes | ||
|
||
|
||
@pytest.mark.parametrize('names_in,names_out', [ | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(None, [('L1', 'x'), ('L2', 'y')]), | ||
(['x', 'y'], ['x', 'y']), | ||
('bad_input', ValueError("Names should be list-like for a MultiIndex")), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't you split this to 2 tests, with 1 the working cases, and 1 the error cases, easier to read There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. |
||
(['a', 'b', 'c'], ValueError("Length of names must match number of " | ||
"levels in MultiIndex.")) | ||
]) | ||
def test_from_frame_names(names_in, names_out): | ||
# GH 22420 | ||
df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], | ||
columns=pd.MultiIndex.from_tuples([('L1', 'x'), | ||
('L2', 'y')])) | ||
if isinstance(names_out, Exception): | ||
with pytest.raises(type(names_out), match=names_out.args[0]): | ||
pd.MultiIndex.from_frame(df, names=names_in) | ||
else: | ||
mi = pd.MultiIndex.from_frame(df, names=names_in) | ||
assert mi.names == names_out |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from collections import OrderedDict | ||
|
||
import pytest | ||
import numpy as np | ||
|
||
|
@@ -83,6 +85,39 @@ def test_to_frame(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_to_frame_dtype_fidelity(): | ||
ms7463 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# GH 22420 | ||
mi = pd.MultiIndex.from_arrays([ | ||
pd.date_range('19910905', periods=6, tz='US/Eastern'), | ||
[1, 1, 1, 2, 2, 2], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this a repeated test of the above, if so, then not necessary here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was at the suggestion of @TomAugspurger |
||
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), | ||
['x', 'x', 'y', 'z', 'x', 'y'] | ||
], names=['dates', 'a', 'b', 'c']) | ||
original_dtypes = {name: mi.levels[i].dtype | ||
for i, name in enumerate(mi.names)} | ||
|
||
expected_df = pd.DataFrame(OrderedDict([ | ||
('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), | ||
('a', [1, 1, 1, 2, 2, 2]), | ||
('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), | ||
('c', ['x', 'x', 'y', 'z', 'x', 'y']) | ||
])) | ||
df = mi.to_frame(index=False) | ||
df_dtypes = df.dtypes.to_dict() | ||
|
||
tm.assert_frame_equal(df, expected_df) | ||
assert original_dtypes == df_dtypes | ||
|
||
|
||
def test_to_frame_resulting_column_order(): | ||
# GH 22420 | ||
expected = ['z', 0, 'a'] | ||
mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], | ||
['q', 'w', 'e']], names=expected) | ||
result = mi.to_frame().columns.tolist() | ||
assert result == expected | ||
|
||
|
||
def test_to_hierarchical(): | ||
index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( | ||
2, 'two')]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you use a
:meth:
ref for MultiIndex.to_frame()
and :attr:
for.names
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.