From 79bdecb789ecd4c0e006431659acedd2b12ff840 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 13 Oct 2018 18:11:31 -0400 Subject: [PATCH 01/41] ENH - add from_frame method and accompanying squeeze method to multiindex --- pandas/core/indexes/multi.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3cccb65503378..0d62f4c8db3fa 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1412,6 +1412,22 @@ def from_product(cls, iterables, sortorder=None, names=None): labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) + @classmethod + def from_frame(cls, df, squeeze=True): + """ + :param df + :param squeeze + Squeeze single level multiindex to be a regular index + """ + # just let column level names be the tuple of the meta df columns since they're not required to be strings + # columns = ['.'.join(col) for col in list(df)] + columns = list(df) + mi = cls.from_tuples(list(df.values), names=columns) + if squeeze: + return mi.squeeze() + else: + return mi + def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 @@ -1474,6 +1490,16 @@ def _sort_levels_monotonic(self): names=self.names, sortorder=self.sortorder, verify_integrity=False) + def squeeze(self): + """ + If multiindex is only composed of a single level, convert to a regular index. + Otherwise return a copy of the index. 
+ """ + if len(self.levels) == 1: + return self.levels[0][self.labels[0]] + else: + return self.copy() + def remove_unused_levels(self): """ Create a new MultiIndex from the current that removes From fa82618b66a094c7d230ee32e13e1818ef890d80 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 13 Oct 2018 18:13:19 -0400 Subject: [PATCH 02/41] ENH - guarentee that order of labels is preserved in multiindex to_frame method --- pandas/core/indexes/multi.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0d62f4c8db3fa..d5d989c034301 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1189,11 +1189,8 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - result = DataFrame({(name or level): - self._get_level_values(level) - for name, level in - zip(idx_names, range(len(self.levels)))}, - copy=False) + result = DataFrame(list(self), columns=[n or i for i, n in enumerate(idx_names)]) + if index: result.index = self return result From 64b45d6fb68903b06102c59799c520dab7ab5245 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 13 Oct 2018 18:23:22 -0400 Subject: [PATCH 03/41] CLN - adhere to PEP8 line length --- pandas/core/indexes/multi.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d5d989c034301..2107c30bbb8f1 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1189,7 +1189,10 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - result = DataFrame(list(self), columns=[n or i for i, n in enumerate(idx_names)]) + result = DataFrame( + list(self), + columns=[n or i for i, n in enumerate(idx_names)] + ) if index: result.index = self @@ -1416,8 +1419,8 @@ def from_frame(cls, df, squeeze=True): :param squeeze Squeeze single level multiindex to be a regular index """ - # just let 
column level names be the tuple of the meta df columns since they're not required to be strings - # columns = ['.'.join(col) for col in list(df)] + # just let column level names be the tuple of the meta df columns + # since they're not required to be strings columns = list(df) mi = cls.from_tuples(list(df.values), names=columns) if squeeze: @@ -1489,7 +1492,7 @@ def _sort_levels_monotonic(self): def squeeze(self): """ - If multiindex is only composed of a single level, convert to a regular index. + If multiindex is only composed of a single level, convert to an index. Otherwise return a copy of the index. """ if len(self.levels) == 1: From 64c7bb161eee724258111c0dd40573f62828a67a Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 13 Oct 2018 18:24:26 -0400 Subject: [PATCH 04/41] CLN - remove trailing whitespace --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2107c30bbb8f1..81fd25192de82 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1190,7 +1190,7 @@ def to_frame(self, index=True, name=None): idx_names = self.names result = DataFrame( - list(self), + list(self), columns=[n or i for i, n in enumerate(idx_names)] ) From 3ee676c4fb2ad6e7c57a34e41e57e4000a6389d5 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 15 Oct 2018 23:16:56 -0400 Subject: [PATCH 05/41] ENH - raise TypeError on inappropriate input --- pandas/core/indexes/multi.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 81fd25192de82..f57a165d583cc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1421,6 +1421,8 @@ def from_frame(cls, df, squeeze=True): """ # just let column level names be the tuple of the meta df columns # since they're not required to be strings + if not isinstance(df, pd.DataFrame): + raise TypeError("Input must be a DataFrame") 
columns = list(df) mi = cls.from_tuples(list(df.values), names=columns) if squeeze: From fd266f58ae59e3f073711f0cab4944ac7103519e Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 15 Oct 2018 23:17:41 -0400 Subject: [PATCH 06/41] TST - add tests for mi.from_frame and mi.squeeze --- .../tests/indexes/multi/test_constructor.py | 42 +++++++++++++++++++ pandas/tests/indexes/multi/test_conversion.py | 15 +++++++ 2 files changed, 57 insertions(+) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index ab2e4c1d863a7..6bd9695f7991d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -472,3 +472,45 @@ def test_from_tuples_with_tuple_label(): idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) result = pd.DataFrame([2, 3], columns=['c'], index=idx) tm.assert_frame_equal(expected, result) + + +def test_from_frame(): + expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('a', 'c'), + ('b', 'a'), ('b', 'b'), ('c', 'a'), + ('c', 'b')], + names=['L1', 'L2']) + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['a', 'c'], ['b', 'a'], + ['b', 'b'], ['c', 'a'], ['c', 'b']], + columns=['L1', 'L2']) + result = pd.MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +def test_from_frame(): + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=['L1', 'L2']) + expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), + ('b', 'a'), ('b', 'b')], + names=['L1', 'L2']) + result = pd.MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +def test_from_frame_with_squeeze(): + df = pd.DataFrame([['a',], ['a',], ['b',], ['b',]], columns=['L1',]) + expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') + result = pd.MultiIndex.from_frame(df) + tm.assert_index_equal(expected, result) + + +def test_from_frame_with_no_squeeze(): + df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], 
columns=['L1',]) + expected = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], + names=['L1',]) + result = pd.MultiIndex.from_frame(df, squeeze=False) + tm.assert_index_equal(expected, result) + + +def test_from_frame_non_frame(): + with pytest.raises(TypeError): + pd.MultiIndex.from_frame([1,2,3,4]) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 8c9566b7e651f..73165038073e3 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -169,3 +169,18 @@ def test_to_series_with_arguments(idx): assert s.values is not idx.values assert s.index is not idx assert s.name != idx.name + + +def test_squeeze(): + mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], + names=['L1',]) + expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') + result = mi.squeeze() + tm.assert_index_equal(expected, result) + + mi = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), + ('b', 'b')], + names=['L1', 'L2']) + expected = mi.copy() + result = mi.squeeze() + tm.assert_index_equal(expected, result) \ No newline at end of file From 4bc8f5b554bf778a9a0da60ee937ea940add6117 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 15 Oct 2018 23:23:11 -0400 Subject: [PATCH 07/41] CLN - pep8 adherence in tests --- pandas/tests/indexes/multi/test_constructor.py | 14 +++++++------- pandas/tests/indexes/multi/test_conversion.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 6bd9695f7991d..6515a43db8220 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -478,10 +478,10 @@ def test_from_frame(): expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('a', 'c'), ('b', 'a'), ('b', 'b'), ('c', 'a'), ('c', 'b')], - names=['L1', 'L2']) - df = pd.DataFrame([['a', 
'a'], ['a', 'b'], ['a', 'c'], ['b', 'a'], + names=['L1', 'L2']) + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['a', 'c'], ['b', 'a'], ['b', 'b'], ['c', 'a'], ['c', 'b']], - columns=['L1', 'L2']) + columns=['L1', 'L2']) result = pd.MultiIndex.from_frame(df) tm.assert_index_equal(expected, result) @@ -491,20 +491,20 @@ def test_from_frame(): columns=['L1', 'L2']) expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')], - names=['L1', 'L2']) + names=['L1', 'L2']) result = pd.MultiIndex.from_frame(df) tm.assert_index_equal(expected, result) def test_from_frame_with_squeeze(): - df = pd.DataFrame([['a',], ['a',], ['b',], ['b',]], columns=['L1',]) + df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') result = pd.MultiIndex.from_frame(df) tm.assert_index_equal(expected, result) def test_from_frame_with_no_squeeze(): - df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1',]) + df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) expected = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], names=['L1',]) result = pd.MultiIndex.from_frame(df, squeeze=False) @@ -513,4 +513,4 @@ def test_from_frame_with_no_squeeze(): def test_from_frame_non_frame(): with pytest.raises(TypeError): - pd.MultiIndex.from_frame([1,2,3,4]) + pd.MultiIndex.from_frame([1, 2, 3, 4]) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 73165038073e3..7696f89f73f0e 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -173,7 +173,7 @@ def test_to_series_with_arguments(idx): def test_squeeze(): mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], - names=['L1',]) + names=['L1']) expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') result = mi.squeeze() tm.assert_index_equal(expected, result) @@ -183,4 +183,4 @@ def test_squeeze(): names=['L1', 'L2']) 
expected = mi.copy() result = mi.squeeze() - tm.assert_index_equal(expected, result) \ No newline at end of file + tm.assert_index_equal(expected, result) From 9d92b70b4681399d6b71f6d18cf5740bf4f0ff0e Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 15 Oct 2018 23:25:17 -0400 Subject: [PATCH 08/41] CLN - last missed pep8 fix --- pandas/tests/indexes/multi/test_constructor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 6515a43db8220..abe412317d920 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -506,7 +506,7 @@ def test_from_frame_with_squeeze(): def test_from_frame_with_no_squeeze(): df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) expected = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], - names=['L1',]) + names=['L1']) result = pd.MultiIndex.from_frame(df, squeeze=False) tm.assert_index_equal(expected, result) From 45595ad82ec49bbab81903839f948a86e1f11c7a Mon Sep 17 00:00:00 2001 From: ArtinSarraf Date: Tue, 16 Oct 2018 09:03:33 -0400 Subject: [PATCH 09/41] BUG - remove pd.DataFrame in favor of local import --- pandas/core/indexes/multi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f57a165d583cc..d82c23a1e75e8 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1419,9 +1419,10 @@ def from_frame(cls, df, squeeze=True): :param squeeze Squeeze single level multiindex to be a regular index """ + from pandas import DataFrame # just let column level names be the tuple of the meta df columns # since they're not required to be strings - if not isinstance(df, pd.DataFrame): + if not isinstance(df, DataFrame): raise TypeError("Input must be a DataFrame") columns = list(df) mi = cls.from_tuples(list(df.values), names=columns) From 
3530cd3b97c51fd9190f8eb298c6d2b0f87dcd79 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 17 Oct 2018 20:22:44 -0400 Subject: [PATCH 10/41] DOC - add more detailed docstrings for from_frame and squeeze --- pandas/core/indexes/multi.py | 42 +++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d82c23a1e75e8..8e0d050ed061b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1415,9 +1415,29 @@ def from_product(cls, iterables, sortorder=None, names=None): @classmethod def from_frame(cls, df, squeeze=True): """ - :param df - :param squeeze - Squeeze single level multiindex to be a regular index + Make a MultiIndex from a dataframe + + Parameters + ---------- + df : pd.DataFrame + DataFrame to be converted to MultiIndex + squeeze : bool + If df is a single column, squeeze multiindex to be a regular index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> df = pd.DataFrame([[0, u'green'], [0, u'purple'], [1, u'green'], + [1, u'purple'], [2, u'green'], [2, u'purple']], + columns=[u'number', u'color']) + >>> pd.MultiIndex.from_frame(df) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + """ from pandas import DataFrame # just let column level names be the tuple of the meta df columns @@ -1495,8 +1515,20 @@ def _sort_levels_monotonic(self): def squeeze(self): """ - If multiindex is only composed of a single level, convert to an index. - Otherwise return a copy of the index. + Squeeze a single level multiindex to be a regular Index instane. If + the MultiIndex is more than a single level, return a copy of the + MultiIndex. 
+ + Returns + ------- + index : Index | MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_tuples([('a',), ('b',), ('c',)]) + >>> mi.squeeze() + Index(['a', 'b', 'c'], dtype='object') + """ if len(self.levels) == 1: return self.levels[0][self.labels[0]] From 1c227915634c0b6161c6dd1788b696ee3959f334 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 28 Oct 2018 13:17:01 -0400 Subject: [PATCH 11/41] DOC - update MultiIndex.from_frame and squeeze doctests to comply with the Pandas docstring guide --- pandas/core/indexes/multi.py | 46 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8e0d050ed061b..3068aa87d9887 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1294,6 +1294,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_tuples : Convert list of tuples to MultiIndex MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. """ if not is_list_like(arrays): raise TypeError("Input must be a list / sequence of array-likes.") @@ -1343,6 +1344,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): MultiIndex.from_arrays : Convert list of arrays to MultiIndex MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. """ if not is_list_like(tuples): raise TypeError('Input must be a list / sequence of tuple-likes.') @@ -1399,6 +1401,7 @@ def from_product(cls, iterables, sortorder=None, names=None): -------- MultiIndex.from_arrays : Convert list of arrays to MultiIndex MultiIndex.from_tuples : Convert list of tuples to MultiIndex + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. 
""" from pandas.core.arrays.categorical import _factorize_from_iterables from pandas.core.reshape.util import cartesian_product @@ -1415,29 +1418,45 @@ def from_product(cls, iterables, sortorder=None, names=None): @classmethod def from_frame(cls, df, squeeze=True): """ - Make a MultiIndex from a dataframe + Make a MultiIndex from a DataFrame. Parameters ---------- df : pd.DataFrame - DataFrame to be converted to MultiIndex - squeeze : bool + DataFrame to be converted to MultiIndex. + squeeze : bool, default True If df is a single column, squeeze multiindex to be a regular index. Returns ------- - index : MultiIndex + MultiIndex or Index + The MultiIndex representation of the given DataFrame. Returns an + Index if the DataFrame is single column and squeeze is True. Examples -------- - >>> df = pd.DataFrame([[0, u'green'], [0, u'purple'], [1, u'green'], - [1, u'purple'], [2, u'green'], [2, u'purple']], - columns=[u'number', u'color']) + >>> df = pd.DataFrame([[0, 'green'], [0, 'purple'], [1, 'green'], + ... [1, 'purple'], [2, 'green'], [2, 'purple']], + ... columns=['number', 'color']) + >>> df + number color + 0 0 green + 1 0 purple + 2 1 green + 3 1 purple + 4 2 green + 5 2 purple >>> pd.MultiIndex.from_frame(df) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + MultiIndex(levels=[[0, 1, 2], ['green', 'purple']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) + names=['number', 'color']) + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_tuples : Convert list of tuples to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables """ from pandas import DataFrame # just let column level names be the tuple of the meta df columns @@ -1515,20 +1534,19 @@ def _sort_levels_monotonic(self): def squeeze(self): """ - Squeeze a single level multiindex to be a regular Index instane. 
If - the MultiIndex is more than a single level, return a copy of the - MultiIndex. + Squeeze a single level MultiIndex to be a regular Index instane. Returns ------- - index : Index | MultiIndex + Index or MultiIndex + Returns Index equivalent of single level MultiIndex. Returns + copy of MultiIndex if multilevel. Examples -------- >>> mi = pd.MultiIndex.from_tuples([('a',), ('b',), ('c',)]) >>> mi.squeeze() Index(['a', 'b', 'c'], dtype='object') - """ if len(self.levels) == 1: return self.levels[0][self.labels[0]] From cf787806e0b8da07f652b3885e54bab781228ea0 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 28 Oct 2018 18:24:15 -0400 Subject: [PATCH 12/41] CLN - cleanup docstrings and source --- pandas/core/indexes/multi.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3068aa87d9887..02006cf17dee3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1425,7 +1425,7 @@ def from_frame(cls, df, squeeze=True): df : pd.DataFrame DataFrame to be converted to MultiIndex. squeeze : bool, default True - If df is a single column, squeeze multiindex to be a regular index. + If df is a single column, squeeze MultiIndex to be a regular Index. Returns ------- @@ -1465,10 +1465,7 @@ def from_frame(cls, df, squeeze=True): raise TypeError("Input must be a DataFrame") columns = list(df) mi = cls.from_tuples(list(df.values), names=columns) - if squeeze: - return mi.squeeze() - else: - return mi + return mi.squeeze() if squeeze else mi def _sort_levels_monotonic(self): """ @@ -1534,7 +1531,7 @@ def _sort_levels_monotonic(self): def squeeze(self): """ - Squeeze a single level MultiIndex to be a regular Index instane. + Squeeze a single level MultiIndex to be a regular Index instance. 
Returns ------- @@ -1545,6 +1542,9 @@ def squeeze(self): Examples -------- >>> mi = pd.MultiIndex.from_tuples([('a',), ('b',), ('c',)]) + >>> mi + MultiIndex(levels=[['a', 'b', 'c']], + labels=[[0, 1, 2]]) >>> mi.squeeze() Index(['a', 'b', 'c'], dtype='object') """ From 64c275060227f812f43f09b3e008db7e49de0087 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 28 Oct 2018 18:24:55 -0400 Subject: [PATCH 13/41] TST - reorganize some of the multiindex tests --- .../tests/indexes/multi/test_constructor.py | 45 +++++++++---------- pandas/tests/indexes/multi/test_conversion.py | 4 +- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index abe412317d920..1d54f6af75232 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -474,18 +474,6 @@ def test_from_tuples_with_tuple_label(): tm.assert_frame_equal(expected, result) -def test_from_frame(): - expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('a', 'c'), - ('b', 'a'), ('b', 'b'), ('c', 'a'), - ('c', 'b')], - names=['L1', 'L2']) - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['a', 'c'], ['b', 'a'], - ['b', 'b'], ['c', 'a'], ['c', 'b']], - columns=['L1', 'L2']) - result = pd.MultiIndex.from_frame(df) - tm.assert_index_equal(expected, result) - - def test_from_frame(): df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=['L1', 'L2']) @@ -496,21 +484,28 @@ def test_from_frame(): tm.assert_index_equal(expected, result) -def test_from_frame_with_squeeze(): - df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) - expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') - result = pd.MultiIndex.from_frame(df) - tm.assert_index_equal(expected, result) - - -def test_from_frame_with_no_squeeze(): - df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) - expected = pd.MultiIndex.from_tuples([('a',), ('a',), 
('b',), ('b',)], - names=['L1']) - result = pd.MultiIndex.from_frame(df, squeeze=False) +@pytest.mark.parametrize('squeeze,input_type,expected', [ + (True, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), + ('b', 'a'), ('b', 'b')], + names=['L1', 'L2'])), + (True, 'single', pd.Index(['a', 'a', 'b', 'b'], name='L1')), + (False, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), + ('b', 'a'), ('b', 'b')], + names=['L1', 'L2'])), + (False, 'single', pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], + names=['L1'])) +]) +def test_from_frame_squeeze(squeeze, input_type, expected): + if input_type == 'multi': + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=['L1', 'L2']) + elif input_type == 'single': + df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) + + result = pd.MultiIndex.from_frame(df, squeeze=squeeze) tm.assert_index_equal(expected, result) def test_from_frame_non_frame(): - with pytest.raises(TypeError): + with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): pd.MultiIndex.from_frame([1, 2, 3, 4]) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 7696f89f73f0e..4d53b475dd8ad 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -171,13 +171,15 @@ def test_to_series_with_arguments(idx): assert s.name != idx.name -def test_squeeze(): +def test_squeeze_single_level(): mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], names=['L1']) expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') result = mi.squeeze() tm.assert_index_equal(expected, result) + +def test_squeeze_multi_level(): mi = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')], names=['L1', 'L2']) From ede030bd2892e002e832242c8b7f26738c5bce10 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 28 Oct 2018 18:30:55 -0400 Subject: [PATCH 14/41] CLN - adhere to 
pep8 line length --- pandas/tests/indexes/multi/test_constructor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 1d54f6af75232..b4ad4affa396d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -492,7 +492,8 @@ def test_from_frame(): (False, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')], names=['L1', 'L2'])), - (False, 'single', pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], + (False, 'single', pd.MultiIndex.from_tuples([('a',), ('a',), + ('b',), ('b',)], names=['L1'])) ]) def test_from_frame_squeeze(squeeze, input_type, expected): From 190c341c067a9ee85367263dbdb4c5b640e2550f Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 09:36:55 -0400 Subject: [PATCH 15/41] BUG - ensure dtypes are preserved in from_frame and to_frame --- pandas/core/indexes/multi.py | 64 +++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 02006cf17dee3..f5e1d12e54010 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2,6 +2,7 @@ # pylint: disable=E1101,E1103,W0232 import datetime import warnings +from collections import OrderedDict from sys import getsizeof import numpy as np @@ -1190,10 +1191,14 @@ def to_frame(self, index=True, name=None): idx_names = self.names result = DataFrame( - list(self), - columns=[n or i for i, n in enumerate(idx_names)] + OrderedDict([ + ((name or level), self._get_level_values(level)) + for name, level in zip(idx_names, range(len(self.levels))) + ]), + copy=False ) + if index: result.index = self return result @@ -1416,7 +1421,7 @@ def from_product(cls, iterables, sortorder=None, names=None): return MultiIndex(levels, labels, sortorder=sortorder, names=names) 
@classmethod - def from_frame(cls, df, squeeze=True): + def from_frame(cls, df, squeeze=True, names=None): """ Make a MultiIndex from a DataFrame. @@ -1426,6 +1431,11 @@ def from_frame(cls, df, squeeze=True): DataFrame to be converted to MultiIndex. squeeze : bool, default True If df is a single column, squeeze MultiIndex to be a regular Index. + names : list / sequence / callable, optonal + If no names provided, use column names, or tuple of column names if + the columns is a MultiIndex. If sequence, overwrite names with the + given sequence. If callable, pass each column name or tuples of + names to the callable. Returns ------- @@ -1435,21 +1445,21 @@ def from_frame(cls, df, squeeze=True): Examples -------- - >>> df = pd.DataFrame([[0, 'green'], [0, 'purple'], [1, 'green'], - ... [1, 'purple'], [2, 'green'], [2, 'purple']], - ... columns=['number', 'color']) + >>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], + ... [1, 'jolly'], [2, 'joy'], [2, 'joy']], + ... columns=['number', 'mood']) >>> df - number color - 0 0 green - 1 0 purple - 2 1 green - 3 1 purple - 4 2 green - 5 2 purple + number mood + 0 0 happy + 1 0 jolly + 2 1 happy + 3 1 jolly + 4 2 joy + 5 2 joy >>> pd.MultiIndex.from_frame(df) - MultiIndex(levels=[[0, 1, 2], ['green', 'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], + names=['number', 'mood']) See Also -------- @@ -1459,12 +1469,26 @@ def from_frame(cls, df, squeeze=True): of iterables """ from pandas import DataFrame - # just let column level names be the tuple of the meta df columns - # since they're not required to be strings if not isinstance(df, DataFrame): raise TypeError("Input must be a DataFrame") - columns = list(df) - mi = cls.from_tuples(list(df.values), names=columns) + + # Get MultiIndex names + if names is None: + names = list(df) + else: + if callable(names): + 
names = [names(x) for x in list(df)] + else: + if not is_list_like(names): + raise TypeError("'names' must be a list / sequence " + "of column names, or a callable.") + + if len(names) != len(list(df)): + raise ValueError("'names' should have same length as " + "number of columns in df.") + + # This way will preserve dtype of columns + mi = cls.from_arrays([df[x] for x in df], names=names) return mi.squeeze() if squeeze else mi def _sort_levels_monotonic(self): From e0df6321a187ee55a17df25c4308a9a724eee990 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 09:37:40 -0400 Subject: [PATCH 16/41] TST - add tests for ensuring dtype fidelity and custom names for from_frame --- .../tests/indexes/multi/test_constructor.py | 36 +++++++++++++++++++ pandas/tests/indexes/multi/test_conversion.py | 13 +++++++ 2 files changed, 49 insertions(+) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b4ad4affa396d..46d6261ec48d1 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -510,3 +510,39 @@ def test_from_frame_squeeze(squeeze, input_type, expected): def test_from_frame_non_frame(): with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): pd.MultiIndex.from_frame([1, 2, 3, 4]) + + +def test_from_frame_dtype_fidelity(): + df = pd.DataFrame({ + 'dates': pd.date_range('19910905', periods=6), + 'a': [1,1,1,2,2,2], + 'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + 'c': ['x', 'x', 'y', 'z', 'x', 'y'] + }) + original_dtypes = df.dtypes.to_dict() + mi = pd.MultiIndex.from_frame(df) + mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + assert original_dtypes == mi_dtypes + + +def test_from_frame_names_as_list(): + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=['L1', 'L2']) + mi = pd.MultiIndex.from_frame(df, names=['a', 'b']) + assert mi.names == ['a', 'b'] + 
+ +def test_from_frame_names_as_callable(): + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ('L2', 'y')])) + mi = pd.MultiIndex.from_frame(df, names=lambda x: '_'.join(x)) + assert mi.names == ['L1_x', 'L2_y'] + + +def test_from_frame_names_bad_input(): + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=['L1', 'L2']) + with tm.assert_raises_regex(TypeError, "names' must be a list / sequence " + "of column names, or a callable."): + pd.MultiIndex.from_frame(df, names='bad') diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 4d53b475dd8ad..8f10b328ff490 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -82,6 +82,19 @@ def test_to_frame(): tm.assert_frame_equal(result, expected) +def test_to_frame_dtype_fidelity(): + mi = pd.MultiIndex.from_arrays([ + pd.date_range('19910905', periods=6), + [1,1,1,2,2,2], + pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + ['x', 'x', 'y', 'z', 'x', 'y'] + ], names=['dates', 'a', 'b', 'c']) + original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + df = mi.to_frame() + df_dtypes = df.dtypes.to_dict() + assert original_dtypes == df_dtypes + + def test_to_hierarchical(): index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')]) From 78ff5c20abfdad0957ddf393aef579472e48ddf2 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 09:57:46 -0400 Subject: [PATCH 17/41] CLN - pep8 adherence --- pandas/tests/indexes/multi/test_constructor.py | 2 +- pandas/tests/indexes/multi/test_conversion.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 46d6261ec48d1..bfbcb6100b722 100644 --- 
a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -515,7 +515,7 @@ def test_from_frame_non_frame(): def test_from_frame_dtype_fidelity(): df = pd.DataFrame({ 'dates': pd.date_range('19910905', periods=6), - 'a': [1,1,1,2,2,2], + 'a': [1, 1, 1, 2, 2, 2], 'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), 'c': ['x', 'x', 'y', 'z', 'x', 'y'] }) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 8f10b328ff490..9582f819d12eb 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -85,11 +85,12 @@ def test_to_frame(): def test_to_frame_dtype_fidelity(): mi = pd.MultiIndex.from_arrays([ pd.date_range('19910905', periods=6), - [1,1,1,2,2,2], + [1, 1, 1, 2, 2, 2], pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), ['x', 'x', 'y', 'z', 'x', 'y'] ], names=['dates', 'a', 'b', 'c']) - original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + original_dtypes = {name: mi.levels[i].dtype + for i, name in enumerate(mi.names)} df = mi.to_frame() df_dtypes = df.dtypes.to_dict() assert original_dtypes == df_dtypes From 0252db9ea290d0e3290a63357042760741aa7e9c Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 14:16:57 -0400 Subject: [PATCH 18/41] DOC - add examples and change order of kwargs for from_frame --- pandas/core/indexes/multi.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f5e1d12e54010..e8538f323de2b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1421,7 +1421,7 @@ def from_product(cls, iterables, sortorder=None, names=None): return MultiIndex(levels, labels, sortorder=sortorder, names=names) @classmethod - def from_frame(cls, df, squeeze=True, names=None): + def from_frame(cls, df, 
names=None, squeeze=True): """ Make a MultiIndex from a DataFrame. @@ -1429,13 +1429,13 @@ def from_frame(cls, df, squeeze=True, names=None): ---------- df : pd.DataFrame DataFrame to be converted to MultiIndex. - squeeze : bool, default True - If df is a single column, squeeze MultiIndex to be a regular Index. - names : list / sequence / callable, optonal + names : list-like / callable, optonal If no names provided, use column names, or tuple of column names if the columns is a MultiIndex. If sequence, overwrite names with the given sequence. If callable, pass each column name or tuples of names to the callable. + squeeze : bool, default True + If df is a single column, squeeze MultiIndex to be a regular Index. Returns ------- @@ -1460,6 +1460,30 @@ def from_frame(cls, df, squeeze=True, names=None): MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], names=['number', 'mood']) + >>> df = pd.DataFrame([[0], [1]], columns=['number']) + >>> df + number + 0 0 + 1 1 + >>> pd.MultiIndex.from_frame(df) + Int64Index([0, 1], dtype='int64', name='number') + >>> pd.MultiIndex.from_frame(df, squeeze=False) + MultiIndex(levels=[[0, 1]], + labels=[[0, 1]], + names=['number']) + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd'], ['e', 'f']], + ... columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ... 
('L2', 'y')])) + >>> df + L1 L2 + x y + 0 a b + 1 c d + 2 e f + >>> pd.MultiIndex.from_frame(df, names=lambda x: '_'.join(x)) + MultiIndex(levels=[['a', 'c', 'e'], ['b', 'd', 'f']], + labels=[[0, 1, 2], [0, 1, 2]], + names=['L1_x', 'L2_y']) See Also -------- From d98c8a914b23e66bc7050536ef15e5e191dda0ee Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 14:17:47 -0400 Subject: [PATCH 19/41] TST - parameterize tests --- .../tests/indexes/multi/test_constructor.py | 45 +++++++++++-------- pandas/tests/indexes/multi/test_conversion.py | 15 +++++-- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index bfbcb6100b722..b3746a4611171 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -514,35 +514,42 @@ def test_from_frame_non_frame(): def test_from_frame_dtype_fidelity(): df = pd.DataFrame({ - 'dates': pd.date_range('19910905', periods=6), + 'dates': pd.date_range('19910905', periods=6, tz='US/Eastern'), 'a': [1, 1, 1, 2, 2, 2], 'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), 'c': ['x', 'x', 'y', 'z', 'x', 'y'] }) original_dtypes = df.dtypes.to_dict() + + expected_mi= pd.MultiIndex.from_arrays([ + pd.date_range('19910905', periods=6, tz='US/Eastern'), + [1, 1, 1, 2, 2, 2], + pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + ['x', 'x', 'y', 'z', 'x', 'y'] + ], names=['dates', 'a', 'b', 'c']) mi = pd.MultiIndex.from_frame(df) mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} + + tm.assert_index_equal(expected_mi, mi) assert original_dtypes == mi_dtypes -def test_from_frame_names_as_list(): - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=['L1', 'L2']) - mi = pd.MultiIndex.from_frame(df, names=['a', 'b']) - assert mi.names == ['a', 'b'] - - -def test_from_frame_names_as_callable(): 
+@pytest.mark.parametrize('names_in,names_out', [ + (None, [('L1', 'x'), ('L2', 'y')]), + (['x', 'y'], ['x', 'y']), + (lambda x: '_'.join(x), ['L1_x', 'L2_y']), + ('bad_input', None), +]) +def test_from_frame_names(names_in, names_out): df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), + columns=pd.MultiIndex.from_tuples([('L1', 'x'), ('L2', 'y')])) - mi = pd.MultiIndex.from_frame(df, names=lambda x: '_'.join(x)) - assert mi.names == ['L1_x', 'L2_y'] - + if names_out is None: + with tm.assert_raises_regex(TypeError, "'names' must be a list / " + "sequence of column names, " + "or a callable."): + pd.MultiIndex.from_frame(df, names=names_in) + else: + mi = pd.MultiIndex.from_frame(df, names=names_in) + assert mi.names == names_out -def test_from_frame_names_bad_input(): - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=['L1', 'L2']) - with tm.assert_raises_regex(TypeError, "names' must be a list / sequence " - "of column names, or a callable."): - pd.MultiIndex.from_frame(df, names='bad') diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 9582f819d12eb..ab0643ce92303 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -84,15 +84,24 @@ def test_to_frame(): def test_to_frame_dtype_fidelity(): mi = pd.MultiIndex.from_arrays([ - pd.date_range('19910905', periods=6), + pd.date_range('19910905', periods=6, tz='US/Eastern'), [1, 1, 1, 2, 2, 2], pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), ['x', 'x', 'y', 'z', 'x', 'y'] ], names=['dates', 'a', 'b', 'c']) - original_dtypes = {name: mi.levels[i].dtype + original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} - df = mi.to_frame() + + expected_df = pd.DataFrame({ + 'dates': pd.date_range('19910905', periods=6, tz='US/Eastern'), + 'a': [1, 1, 1, 2, 2, 2], + 'b': 
pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), + 'c': ['x', 'x', 'y', 'z', 'x', 'y'] + }) + df = mi.to_frame(index=False) df_dtypes = df.dtypes.to_dict() + + tm.assert_frame_equal(df, expected_df) assert original_dtypes == df_dtypes From 8a1906e5871dd9668569dcce1bbef585caca1e97 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 3 Nov 2018 14:27:22 -0400 Subject: [PATCH 20/41] CLN - pep8 adherence --- pandas/tests/indexes/multi/test_constructor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b3746a4611171..176396259dfe7 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -521,7 +521,7 @@ def test_from_frame_dtype_fidelity(): }) original_dtypes = df.dtypes.to_dict() - expected_mi= pd.MultiIndex.from_arrays([ + expected_mi = pd.MultiIndex.from_arrays([ pd.date_range('19910905', periods=6, tz='US/Eastern'), [1, 1, 1, 2, 2, 2], pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), @@ -529,7 +529,7 @@ def test_from_frame_dtype_fidelity(): ], names=['dates', 'a', 'b', 'c']) mi = pd.MultiIndex.from_frame(df) mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} - + tm.assert_index_equal(expected_mi, mi) assert original_dtypes == mi_dtypes @@ -542,7 +542,7 @@ def test_from_frame_dtype_fidelity(): ]) def test_from_frame_names(names_in, names_out): df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), + columns=pd.MultiIndex.from_tuples([('L1', 'x'), ('L2', 'y')])) if names_out is None: with tm.assert_raises_regex(TypeError, "'names' must be a list / " @@ -552,4 +552,3 @@ def test_from_frame_names(names_in, names_out): else: mi = pd.MultiIndex.from_frame(df, names=names_in) assert mi.names == names_out - From 08c120fa1dc8908a34b5d5864abec2dadd97c7b1 Mon Sep 17 00:00:00 2001 
From: Artin Sarraf Date: Sat, 3 Nov 2018 14:28:15 -0400 Subject: [PATCH 21/41] CLN - pep8 adherence --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e8538f323de2b..29f6527b33983 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1472,7 +1472,7 @@ def from_frame(cls, df, names=None, squeeze=True): labels=[[0, 1]], names=['number']) >>> df = pd.DataFrame([['a', 'b'], ['c', 'd'], ['e', 'f']], - ... columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ... columns=pd.MultiIndex.from_tuples([('L1', 'x'), ... ('L2', 'y')])) >>> df L1 L2 From 8353c3f3a31bf55bb8ab1251747ef6b58a3e4fde Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 4 Nov 2018 12:08:17 -0500 Subject: [PATCH 22/41] DOC/CLN - add versionadded tags, add to whatsnew page, and clean up if-else logic branches to be simpler to follow --- doc/source/whatsnew/v0.24.0.txt | 3 +++ pandas/core/indexes/multi.py | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 700bf4ddc3a37..aada0ca142f3d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -892,6 +892,9 @@ MultiIndex - Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) - :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- :meth:`MultiIndex.from_frame` added. Allows constructing a `MultiIndex` object from a `DataFrame` (:issue:`22420`) +- :meth:`MultiIndex.squeeze` added. 
Allows a `MultiIndex` with only a single level to be converted to an `Index` object (:issue:`22420`) +- :meth:`MultiIndex.to_frame` will now guarantee the preservation of the sort order of the level names in the resulting `DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) I/O diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 29f6527b33983..e818a9019c89f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1421,14 +1421,19 @@ def from_product(cls, iterables, sortorder=None, names=None): return MultiIndex(levels, labels, sortorder=sortorder, names=names) @classmethod - def from_frame(cls, df, names=None, squeeze=True): + def from_frame(cls, df, sortorder=None, names=None, squeeze=True): """ Make a MultiIndex from a DataFrame. + .. versionadded:: 0.24.0 + Parameters ---------- df : pd.DataFrame DataFrame to be converted to MultiIndex. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list-like / callable, optonal If no names provided, use column names, or tuple of column names if the columns is a MultiIndex. 
If sequence, overwrite names with the @@ -1499,20 +1504,19 @@ def from_frame(cls, df, names=None, squeeze=True): # Get MultiIndex names if names is None: names = list(df) - else: - if callable(names): - names = [names(x) for x in list(df)] - else: - if not is_list_like(names): - raise TypeError("'names' must be a list / sequence " - "of column names, or a callable.") - - if len(names) != len(list(df)): + elif callable(names): + names = [names(x) for x in list(df)] + elif is_list_like(names): + if len(names) != len(list(df)): raise ValueError("'names' should have same length as " "number of columns in df.") + # else: use the passed in sequence + else: + raise TypeError("'names' must be a list / sequence of column " + "names, or a callable.") # This way will preserve dtype of columns - mi = cls.from_arrays([df[x] for x in df], names=names) + mi = cls.from_arrays([df[x] for x in df], sortorder=sortorder, names=names) return mi.squeeze() if squeeze else mi def _sort_levels_monotonic(self): @@ -1581,6 +1585,8 @@ def squeeze(self): """ Squeeze a single level MultiIndex to be a regular Index instance. + .. 
versionadded:: 0.24.0 + Returns ------- Index or MultiIndex From 9df3c11e64c5ec2cdb688cb9762a0f8683ccbd09 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sat, 10 Nov 2018 10:32:52 -0500 Subject: [PATCH 23/41] CLN - squeeze -> _squeeze --- pandas/core/indexes/multi.py | 10 ++++++---- pandas/tests/indexes/multi/test_conversion.py | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e818a9019c89f..27d53bcf17b3c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1516,8 +1516,10 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): "names, or a callable.") # This way will preserve dtype of columns - mi = cls.from_arrays([df[x] for x in df], sortorder=sortorder, names=names) - return mi.squeeze() if squeeze else mi + mi = cls.from_arrays([df[x] for x in df], + sortorder=sortorder, + names=names) + return mi._squeeze() if squeeze else mi def _sort_levels_monotonic(self): """ @@ -1581,7 +1583,7 @@ def _sort_levels_monotonic(self): names=self.names, sortorder=self.sortorder, verify_integrity=False) - def squeeze(self): + def _squeeze(self): """ Squeeze a single level MultiIndex to be a regular Index instance. 
@@ -1599,7 +1601,7 @@ def squeeze(self): >>> mi MultiIndex(levels=[['a', 'b', 'c']], labels=[[0, 1, 2]]) - >>> mi.squeeze() + >>> mi._squeeze() Index(['a', 'b', 'c'], dtype='object') """ if len(self.levels) == 1: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index ab0643ce92303..5c9350a291ab9 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -198,7 +198,7 @@ def test_squeeze_single_level(): mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], names=['L1']) expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') - result = mi.squeeze() + result = mi._squeeze() tm.assert_index_equal(expected, result) @@ -207,5 +207,5 @@ def test_squeeze_multi_level(): ('b', 'b')], names=['L1', 'L2']) expected = mi.copy() - result = mi.squeeze() + result = mi._squeeze() tm.assert_index_equal(expected, result) From 6d4915e21be8e844498a3e5d24f0c21c4f7e1454 Mon Sep 17 00:00:00 2001 From: ArtinSarraf Date: Sat, 10 Nov 2018 18:53:56 -0500 Subject: [PATCH 24/41] DOC - squeeze -> _squeeze in whatsnew --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index aada0ca142f3d..f62379a722c17 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -893,7 +893,7 @@ MultiIndex - Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) - :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) - :meth:`MultiIndex.from_frame` added. 
Allows constructing a `MultiIndex` object from a `DataFrame` (:issue:`22420`) -- :meth:`MultiIndex.squeeze` added. Allows a `MultiIndex` with only a single level to be converted to an `Index` object (:issue:`22420`) +- :meth:`MultiIndex._squeeze` added. Allows a `MultiIndex` with only a single level to be converted to an `Index` object (:issue:`22420`) - :meth:`MultiIndex.to_frame` will now guarantee the preservation of the sort order of the level names in the resulting `DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) From b5df7b26fdfe8b16dd03bc0b50e53321393a8f35 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 18:55:03 -0500 Subject: [PATCH 25/41] BUG - allow repeat column names in from_frame, and falsey column names in to_frame --- pandas/core/indexes/multi.py | 74 +++++++++--------------------------- 1 file changed, 19 insertions(+), 55 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 27d53bcf17b3c..db9bd744e3abe 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -192,6 +192,7 @@ class MultiIndex(Index): from_arrays from_tuples from_product + from_frame set_levels set_labels to_frame @@ -1190,9 +1191,10 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names + # Guarantee resulting column order result = DataFrame( OrderedDict([ - ((name or level), self._get_level_values(level)) + ((level if name is None else name), self._get_level_values(level)) for name, level in zip(idx_names, range(len(self.levels))) ]), copy=False @@ -1434,11 +1436,10 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): sortorder : int or None Level of sortedness (must be lexicographically sorted by that level). 
- names : list-like / callable, optonal - If no names provided, use column names, or tuple of column names if - the columns is a MultiIndex. If sequence, overwrite names with the - given sequence. If callable, pass each column name or tuples of - names to the callable. + names : list-like, optonal + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. squeeze : bool, default True If df is a single column, squeeze MultiIndex to be a regular Index. @@ -1465,6 +1466,7 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], names=['number', 'mood']) + >>> df = pd.DataFrame([[0], [1]], columns=['number']) >>> df number @@ -1476,19 +1478,6 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): MultiIndex(levels=[[0, 1]], labels=[[0, 1]], names=['number']) - >>> df = pd.DataFrame([['a', 'b'], ['c', 'd'], ['e', 'f']], - ... columns=pd.MultiIndex.from_tuples([('L1', 'x'), - ... 
('L2', 'y')])) - >>> df - L1 L2 - x y - 0 a b - 1 c d - 2 e f - >>> pd.MultiIndex.from_frame(df, names=lambda x: '_'.join(x)) - MultiIndex(levels=[['a', 'c', 'e'], ['b', 'd', 'f']], - labels=[[0, 1, 2], [0, 1, 2]], - names=['L1_x', 'L2_y']) See Also -------- @@ -1497,29 +1486,30 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - from pandas import DataFrame - if not isinstance(df, DataFrame): + from pandas.core.dtypes.generic import ABCDataFrame + if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") # Get MultiIndex names if names is None: - names = list(df) - elif callable(names): - names = [names(x) for x in list(df)] + names = df.columns elif is_list_like(names): - if len(names) != len(list(df)): + if len(names) != len(df.columns): raise ValueError("'names' should have same length as " "number of columns in df.") - # else: use the passed in sequence else: raise TypeError("'names' must be a list / sequence of column " - "names, or a callable.") + "names.") # This way will preserve dtype of columns - mi = cls.from_arrays([df[x] for x in df], + mi = cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], sortorder=sortorder, names=names) - return mi._squeeze() if squeeze else mi + + if squeeze and len(mi.levels) == 1: + return mi.get_level_values(0) + else: + return mi def _sort_levels_monotonic(self): """ @@ -1583,32 +1573,6 @@ def _sort_levels_monotonic(self): names=self.names, sortorder=self.sortorder, verify_integrity=False) - def _squeeze(self): - """ - Squeeze a single level MultiIndex to be a regular Index instance. - - .. versionadded:: 0.24.0 - - Returns - ------- - Index or MultiIndex - Returns Index equivalent of single level MultiIndex. Returns - copy of MultiIndex if multilevel. 
- - Examples - -------- - >>> mi = pd.MultiIndex.from_tuples([('a',), ('b',), ('c',)]) - >>> mi - MultiIndex(levels=[['a', 'b', 'c']], - labels=[[0, 1, 2]]) - >>> mi._squeeze() - Index(['a', 'b', 'c'], dtype='object') - """ - if len(self.levels) == 1: - return self.levels[0][self.labels[0]] - else: - return self.copy() - def remove_unused_levels(self): """ Create a new MultiIndex from the current that removes From ab3259c5ce113188be6cf94e0d83d5b493ab5a91 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 18:55:41 -0500 Subject: [PATCH 26/41] DOC - whatsnew formatting --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.24.0.txt | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index f57531fffaaaa..5879829f5a70f 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1686,6 +1686,7 @@ MultiIndex Constructors MultiIndex.from_arrays MultiIndex.from_tuples MultiIndex.from_product + MultiIndex.from_frame MultiIndex Attributes ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index aada0ca142f3d..848838f9d4a50 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -427,6 +427,18 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with df.to_dict(orient='index') +.. _whatsnew_0240.api_breaking.multiindex_to_frame_ordering + +``MultiIndex.to_frame()`` Resulting Column Sort Order Guaranteed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. +(:issue:`22420`) + +Previous Behavior: + +The column order would be determined by dictionary ordering, which before Python 3.7, could not be guaranteed. + .. 
_whatsnew_0240.api.datetimelike.normalize: Tick DateOffset Normalize Restrictions @@ -892,9 +904,7 @@ MultiIndex - Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) - :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- :meth:`MultiIndex.from_frame` added. Allows constructing a `MultiIndex` object from a `DataFrame` (:issue:`22420`) -- :meth:`MultiIndex.squeeze` added. Allows a `MultiIndex` with only a single level to be converted to an `Index` object (:issue:`22420`) -- :meth:`MultiIndex.to_frame` will now guarantee the preservation of the sort order of the level names in the resulting `DataFrame` (:issue:`22420`) +- :cls:`MultiIndex` has gained the :meth:`MultiIndex.from_frame`, it allows constructing a :cls:`MultiIndex` object from a :cls:`DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. 
when some labels are tuples (:issue:`15457`) I/O From cf95261c890615e475cd25825d59a2accb6f064c Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 18:56:37 -0500 Subject: [PATCH 27/41] TST - reorganize and add tests for more incompatible from_frame types --- .../tests/indexes/multi/test_constructor.py | 67 +++++++++++++------ pandas/tests/indexes/multi/test_conversion.py | 10 +++ 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 176396259dfe7..33f369fddccb0 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -91,6 +91,9 @@ def test_copy_in_constructor(): assert mi.levels[0][0] == val +# ---------------------------------------------------------------------------- +# from_arrays +# ---------------------------------------------------------------------------- def test_from_arrays(idx): arrays = [] for lev, lab in zip(idx.levels, idx.labels): @@ -263,6 +266,9 @@ def test_from_arrays_different_lengths(idx1, idx2): MultiIndex.from_arrays, [idx1, idx2]) +# ---------------------------------------------------------------------------- +# from_tuples +# ---------------------------------------------------------------------------- def test_from_tuples(): tm.assert_raises_regex(TypeError, 'Cannot infer number of levels ' 'from empty list', @@ -306,6 +312,28 @@ def test_from_tuples_index_values(idx): assert (result.values == idx.values).all() +def test_tuples_with_name_string(): + # GH 15110 and GH 14848 + + li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] + with pytest.raises(ValueError): + pd.Index(li, name='abc') + with pytest.raises(ValueError): + pd.Index(li, name='a') + + +def test_from_tuples_with_tuple_label(): + # GH 15457 + expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], + columns=['a', 'b', 'c']).set_index(['a', 'b']) + idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], 
names=('a', 'b')) + result = pd.DataFrame([2, 3], columns=['c'], index=idx) + tm.assert_frame_equal(expected, result) + + +# ---------------------------------------------------------------------------- +# from_product +# ---------------------------------------------------------------------------- def test_from_product_empty_zero_levels(): # 0 levels with tm.assert_raises_regex( @@ -455,26 +483,11 @@ def test_create_index_existing_name(idx): tm.assert_index_equal(result, expected) -def test_tuples_with_name_string(): - # GH 15110 and GH 14848 - - li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] - with pytest.raises(ValueError): - pd.Index(li, name='abc') - with pytest.raises(ValueError): - pd.Index(li, name='a') - - -def test_from_tuples_with_tuple_label(): - # GH 15457 - expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], - columns=['a', 'b', 'c']).set_index(['a', 'b']) - idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) - result = pd.DataFrame([2, 3], columns=['c'], index=idx) - tm.assert_frame_equal(expected, result) - - +# ---------------------------------------------------------------------------- +# from_frame +# ---------------------------------------------------------------------------- def test_from_frame(): + # GH 22420 df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=['L1', 'L2']) expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), @@ -497,6 +510,7 @@ def test_from_frame(): names=['L1'])) ]) def test_from_frame_squeeze(squeeze, input_type, expected): + # GH 22420 if input_type == 'multi': df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=['L1', 'L2']) @@ -507,12 +521,22 @@ def test_from_frame_squeeze(squeeze, input_type, expected): tm.assert_index_equal(expected, result) -def test_from_frame_non_frame(): +@pytest.mark.parametrize('non_frame', [ + pd.Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + pd.Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 
27 +]) +def test_from_frame_non_frame(non_frame): + # GH 22420 with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): - pd.MultiIndex.from_frame([1, 2, 3, 4]) + pd.MultiIndex.from_frame(non_frame) def test_from_frame_dtype_fidelity(): + # GH 22420 df = pd.DataFrame({ 'dates': pd.date_range('19910905', periods=6, tz='US/Eastern'), 'a': [1, 1, 1, 2, 2, 2], @@ -541,6 +565,7 @@ def test_from_frame_dtype_fidelity(): ('bad_input', None), ]) def test_from_frame_names(names_in, names_out): + # GH 22420 df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=pd.MultiIndex.from_tuples([('L1', 'x'), ('L2', 'y')])) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 5c9350a291ab9..663e696091534 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -83,6 +83,7 @@ def test_to_frame(): def test_to_frame_dtype_fidelity(): + # GH 22420 mi = pd.MultiIndex.from_arrays([ pd.date_range('19910905', periods=6, tz='US/Eastern'), [1, 1, 1, 2, 2, 2], @@ -105,6 +106,15 @@ def test_to_frame_dtype_fidelity(): assert original_dtypes == df_dtypes +def test_to_frame_resulting_column_order(): + # GH 22420 + expected = ['z', 0, 'a'] + mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], + ['q', 'w', 'e']], names=expected) + result = mi.to_frame().columns.tolist() + assert result == expected + + def test_to_hierarchical(): index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')]) From a75a4a54f11e889c9d9de1b93036c186a7a5cd76 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 19:00:08 -0500 Subject: [PATCH 28/41] CLN - remove squeeze tests --- pandas/tests/indexes/multi/test_conversion.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 663e696091534..fdf21caa10879 100644 --- 
a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -202,20 +202,3 @@ def test_to_series_with_arguments(idx): assert s.values is not idx.values assert s.index is not idx assert s.name != idx.name - - -def test_squeeze_single_level(): - mi = pd.MultiIndex.from_tuples([('a',), ('a',), ('b',), ('b',)], - names=['L1']) - expected = pd.Index(['a', 'a', 'b', 'b'], name='L1') - result = mi._squeeze() - tm.assert_index_equal(expected, result) - - -def test_squeeze_multi_level(): - mi = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), ('b', 'a'), - ('b', 'b')], - names=['L1', 'L2']) - expected = mi.copy() - result = mi._squeeze() - tm.assert_index_equal(expected, result) From 8d23df9dc24304422927a1371575d961dd65475f Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 19:04:09 -0500 Subject: [PATCH 29/41] CLN - remove squeeze parameter from from_frame --- pandas/core/indexes/multi.py | 30 ++++--------------- .../tests/indexes/multi/test_constructor.py | 24 --------------- 2 files changed, 5 insertions(+), 49 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index db9bd744e3abe..c607930c45f84 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1423,7 +1423,7 @@ def from_product(cls, iterables, sortorder=None, names=None): return MultiIndex(levels, labels, sortorder=sortorder, names=names) @classmethod - def from_frame(cls, df, sortorder=None, names=None, squeeze=True): + def from_frame(cls, df, sortorder=None, names=None): """ Make a MultiIndex from a DataFrame. @@ -1440,14 +1440,11 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite names with the given sequence. - squeeze : bool, default True - If df is a single column, squeeze MultiIndex to be a regular Index. 
Returns ------- MultiIndex or Index - The MultiIndex representation of the given DataFrame. Returns an - Index if the DataFrame is single column and squeeze is True. + The MultiIndex representation of the given DataFrame. Examples -------- @@ -1467,18 +1464,6 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], names=['number', 'mood']) - >>> df = pd.DataFrame([[0], [1]], columns=['number']) - >>> df - number - 0 0 - 1 1 - >>> pd.MultiIndex.from_frame(df) - Int64Index([0, 1], dtype='int64', name='number') - >>> pd.MultiIndex.from_frame(df, squeeze=False) - MultiIndex(levels=[[0, 1]], - labels=[[0, 1]], - names=['number']) - See Also -------- MultiIndex.from_arrays : Convert list of arrays to MultiIndex @@ -1502,14 +1487,9 @@ def from_frame(cls, df, sortorder=None, names=None, squeeze=True): "names.") # This way will preserve dtype of columns - mi = cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], - sortorder=sortorder, - names=names) - - if squeeze and len(mi.levels) == 1: - return mi.get_level_values(0) - else: - return mi + return cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], + sortorder=sortorder, + names=names) def _sort_levels_monotonic(self): """ diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 33f369fddccb0..d9ecd0e1bf0c2 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -497,30 +497,6 @@ def test_from_frame(): tm.assert_index_equal(expected, result) -@pytest.mark.parametrize('squeeze,input_type,expected', [ - (True, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), - ('b', 'a'), ('b', 'b')], - names=['L1', 'L2'])), - (True, 'single', pd.Index(['a', 'a', 'b', 'b'], name='L1')), - (False, 'multi', pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), - ('b', 'a'), ('b', 'b')], - names=['L1', 'L2'])), - (False, 'single', 
pd.MultiIndex.from_tuples([('a',), ('a',), - ('b',), ('b',)], - names=['L1'])) -]) -def test_from_frame_squeeze(squeeze, input_type, expected): - # GH 22420 - if input_type == 'multi': - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=['L1', 'L2']) - elif input_type == 'single': - df = pd.DataFrame([['a'], ['a'], ['b'], ['b']], columns=['L1']) - - result = pd.MultiIndex.from_frame(df, squeeze=squeeze) - tm.assert_index_equal(expected, result) - - @pytest.mark.parametrize('non_frame', [ pd.Series([1, 2, 3, 4]), [1, 2, 3, 4], From 7cf82d128032127f53516afc611e9a93ca2681f9 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 11 Nov 2018 19:36:09 -0500 Subject: [PATCH 30/41] TST - remove callable name option --- pandas/tests/indexes/multi/test_constructor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 4914b0ff2e863..06a70e629bd0f 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -540,7 +540,6 @@ def test_from_frame_dtype_fidelity(): @pytest.mark.parametrize('names_in,names_out', [ (None, [('L1', 'x'), ('L2', 'y')]), (['x', 'y'], ['x', 'y']), - (lambda x: '_'.join(x), ['L1_x', 'L2_y']), ('bad_input', None), ]) def test_from_frame_names(names_in, names_out): @@ -550,8 +549,7 @@ def test_from_frame_names(names_in, names_out): ('L2', 'y')])) if names_out is None: with tm.assert_raises_regex(TypeError, "'names' must be a list / " - "sequence of column names, " - "or a callable."): + "sequence of column names."): pd.MultiIndex.from_frame(df, names=names_in) else: mi = pd.MultiIndex.from_frame(df, names=names_in) From 1a282e56b3aa102eb579ed660f5e5cd93cfc65ad Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 13 Nov 2018 22:16:22 -0500 Subject: [PATCH 31/41] ENH - from_data initial commit --- pandas/core/indexes/multi.py | 28 
++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8af5795b129dd..3f3f3a909414c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1496,6 +1496,34 @@ def from_frame(cls, df, sortorder=None, names=None): sortorder=sortorder, names=names) + @classmethod + def from_data(cls, data, orient='columns', sortorder=None, names=None): + from pandas import DataFrame + + is_df = isinstance(data, DataFrame) + try: + df = DataFrame(data) + except ValueError: + raise TypeError("'from_data' input must be valid DataFrame input.") + if orient == 'rows': + df = df.T + + if not is_df: + df.columns = [None for _ in range(len(df.columns))] + + if names is None: + pass + elif is_list_like(names): + if len(names) != len(df.columns): + raise ValueError("'names' should have same length as " + "number of columns in df.") + else: + raise TypeError("'names' must be a list / sequence of column " + "names.") + return cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], + sortorder=sortorder, + names=names) + def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 From b3c6a9022b58e20bc9456ddb175ff9d5ef6d120d Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 18 Nov 2018 23:23:57 -0500 Subject: [PATCH 32/41] DOC - reduce whatsnew entry for to_frame --- doc/source/whatsnew/v0.24.0.txt | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 4439954876bf8..e3787af72125b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -593,15 +593,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with .. 
_whatsnew_0240.api_breaking.multiindex_to_frame_ordering -``MultiIndex.to_frame()`` Resulting Column Sort Order Guaranteed -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. -(:issue:`22420`) - -Previous Behavior: - -The column order would be determined by dictionary ordering, which before Python 3.7, could not be guaranteed. +The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. (:issue:`22420`) .. _whatsnew_0240.api.datetimelike.normalize: From c760359038aa88096d75f427b47636e288df1ffe Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 18 Nov 2018 23:24:32 -0500 Subject: [PATCH 33/41] CLN/DOC - add examples to from_frame docstring and make code more readable --- pandas/core/indexes/multi.py | 82 +++++++++++++++--------------------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3f3f3a909414c..5dd0dbe14d484 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -26,6 +26,7 @@ is_list_like, pandas_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import isna, array_equivalent from pandas.errors import PerformanceWarning, UnsortedIndexError @@ -1199,7 +1200,8 @@ def to_frame(self, index=True, name=None): # Guarantee resulting column order result = DataFrame( OrderedDict([ - ((level if name is None else name), self._get_level_values(level)) + ((level if name is None else name), + self._get_level_values(level)) for name, level in zip(idx_names, range(len(self.levels))) ]), copy=False @@ -1455,34 +1457,51 @@ def from_frame(cls, df, sortorder=None, names=None): -------- >>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], ... 
[1, 'jolly'], [2, 'joy'], [2, 'joy']], - ... columns=['number', 'mood']) + ... columns=['will_be', 'used']) >>> df - number mood - 0 0 happy - 1 0 jolly - 2 1 happy - 3 1 jolly - 4 2 joy - 5 2 joy + will_be used + 0 0 happy + 1 0 jolly + 2 1 happy + 3 1 jolly + 4 2 joy + 5 2 joy >>> pd.MultiIndex.from_frame(df) MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], - names=['number', 'mood']) + names=['will_be', 'used']) + + >>> df = pd.DataFrame([['ahc', 'iam'], ['ahc', 'wim'], ['boh', 'amg'], + ... ['boh', 'iam'], ['oil', 'wim'], ['oil', 'amg']], + ... columns=['will_be', 'overriden']) + >>> df + will_be overriden + 0 ahc iam + 1 ahc wim + 2 boh amg + 3 boh iam + 4 oil wim + 5 oil amg + >>> pd.MultiIndex.from_frame(df, names=['sure', 'will']) + MultiIndex(levels=[['ahc', 'boh', 'oil'], ['amg', 'iam', 'wim']], + labels=[[0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]], + names=['sure', 'will']) See Also -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_tuples : Convert list of tuples to MultiIndex + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables + of iterables. 
""" - from pandas.core.dtypes.generic import ABCDataFrame if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") + column_names, columns = lzip(*df.iteritems()) + # Get MultiIndex names if names is None: - names = df.columns + names = column_names elif is_list_like(names): if len(names) != len(df.columns): raise ValueError("'names' should have same length as " @@ -1491,38 +1510,7 @@ def from_frame(cls, df, sortorder=None, names=None): raise TypeError("'names' must be a list / sequence of column " "names.") - # This way will preserve dtype of columns - return cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], - sortorder=sortorder, - names=names) - - @classmethod - def from_data(cls, data, orient='columns', sortorder=None, names=None): - from pandas import DataFrame - - is_df = isinstance(data, DataFrame) - try: - df = DataFrame(data) - except ValueError: - raise TypeError("'from_data' input must be valid DataFrame input.") - if orient == 'rows': - df = df.T - - if not is_df: - df.columns = [None for _ in range(len(df.columns))] - - if names is None: - pass - elif is_list_like(names): - if len(names) != len(df.columns): - raise ValueError("'names' should have same length as " - "number of columns in df.") - else: - raise TypeError("'names' must be a list / sequence of column " - "names.") - return cls.from_arrays([df.iloc[:, x] for x in range(len(df.columns))], - sortorder=sortorder, - names=names) + return cls.from_arrays(columns, sortorder=sortorder, names=names) def _sort_levels_monotonic(self): """ From 9e11180ae61679c2c4dff7ef8114a9d8830ea62c Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Mon, 19 Nov 2018 19:31:58 -0500 Subject: [PATCH 34/41] TST - use OrderedDict for dataframe construction --- pandas/tests/indexes/multi/test_constructor.py | 13 +++++++------ pandas/tests/indexes/multi/test_conversion.py | 14 ++++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git 
a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index f490e374866f1..b79295ac8cd96 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +from collections import OrderedDict import numpy as np import pytest @@ -521,12 +522,12 @@ def test_from_frame_non_frame(non_frame): def test_from_frame_dtype_fidelity(): # GH 22420 - df = pd.DataFrame({ - 'dates': pd.date_range('19910905', periods=6, tz='US/Eastern'), - 'a': [1, 1, 1, 2, 2, 2], - 'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - 'c': ['x', 'x', 'y', 'z', 'x', 'y'] - }) + df = pd.DataFrame(OrderedDict([ + ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), + ('a', [1, 1, 1, 2, 2, 2]), + ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), + ('c', ['x', 'x', 'y', 'z', 'x', 'y']) + ])) original_dtypes = df.dtypes.to_dict() expected_mi = pd.MultiIndex.from_arrays([ diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index f9052be983686..0c1dc52cc2b09 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from collections import OrderedDict + import pytest import numpy as np @@ -94,12 +96,12 @@ def test_to_frame_dtype_fidelity(): original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} - expected_df = pd.DataFrame({ - 'dates': pd.date_range('19910905', periods=6, tz='US/Eastern'), - 'a': [1, 1, 1, 2, 2, 2], - 'b': pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - 'c': ['x', 'x', 'y', 'z', 'x', 'y'] - }) + expected_df = pd.DataFrame(OrderedDict([ + ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), + ('a', [1, 1, 1, 2, 2, 2]), + ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), + ('c', ['x', 
'x', 'y', 'z', 'x', 'y']) + ])) df = mi.to_frame(index=False) df_dtypes = df.dtypes.to_dict() From a5236bf8231eb81c79b20a05c37454f53006aaf2 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Fri, 30 Nov 2018 21:37:27 -0500 Subject: [PATCH 35/41] CLN - clean up code and use pytest.raises --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/indexes/multi.py | 13 +------------ pandas/tests/indexes/multi/test_constructor.py | 11 ++++++----- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 90d8dcf36d14a..a805d1b196f4d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -657,7 +657,7 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with df.to_dict(orient='index') -.. _whatsnew_0240.api_breaking.multiindex_to_frame_ordering +.. _whatsnew_0240.api_breaking.multiindex_to_frame_ordering: The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. 
(:issue:`22420`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b4403e1300809..d43ecf40f1983 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1516,18 +1516,7 @@ def from_frame(cls, df, sortorder=None, names=None): raise TypeError("Input must be a DataFrame") column_names, columns = lzip(*df.iteritems()) - - # Get MultiIndex names - if names is None: - names = column_names - elif is_list_like(names): - if len(names) != len(df.columns): - raise ValueError("'names' should have same length as " - "number of columns in df.") - else: - raise TypeError("'names' must be a list / sequence of column " - "names.") - + names = column_names if names is None else names return cls.from_arrays(columns, sortorder=sortorder, names=names) def _sort_levels_monotonic(self): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b79295ac8cd96..94cbe114a6163 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -516,7 +516,7 @@ def test_from_frame(): ]) def test_from_frame_non_frame(non_frame): # GH 22420 - with tm.assert_raises_regex(TypeError, 'Input must be a DataFrame'): + with pytest.raises(TypeError, match='Input must be a DataFrame'): pd.MultiIndex.from_frame(non_frame) @@ -546,16 +546,17 @@ def test_from_frame_dtype_fidelity(): @pytest.mark.parametrize('names_in,names_out', [ (None, [('L1', 'x'), ('L2', 'y')]), (['x', 'y'], ['x', 'y']), - ('bad_input', None), + ('bad_input', ValueError("Names should be list-like for a MultiIndex")), + (['a', 'b', 'c'], ValueError("Length of names must match number of " + "levels in MultiIndex.")) ]) def test_from_frame_names(names_in, names_out): # GH 22420 df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=pd.MultiIndex.from_tuples([('L1', 'x'), ('L2', 'y')])) - if names_out is None: - with tm.assert_raises_regex(TypeError, "'names' must 
be a list / " - "sequence of column names."): + if isinstance(names_out, Exception): + with pytest.raises(type(names_out), match=names_out.args[0]): pd.MultiIndex.from_frame(df, names=names_in) else: mi = pd.MultiIndex.from_frame(df, names=names_in) From 14bfea8bc410aa894a253af4a3b2d70513bbd5fb Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Sun, 2 Dec 2018 13:44:23 -0500 Subject: [PATCH 36/41] DOC - move to_frame breaking changes to backwards incompatible section of whatsnew --- doc/source/whatsnew/v0.24.0.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1f0ffafc6be71..23b3596b7cd08 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -376,6 +376,7 @@ Backwards incompatible API changes - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. (:issue:`22420`) .. _whatsnew_0240.api_breaking.deps: @@ -729,10 +730,6 @@ Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with df.to_dict(orient='index') -.. _whatsnew_0240.api_breaking.multiindex_to_frame_ordering: - -The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. (:issue:`22420`) - .. 
_whatsnew_0240.api.datetimelike.normalize: Tick DateOffset Normalize Restrictions From 30fe0df9367e8a4fc5b0fafa1c73123886c0dcea Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Tue, 4 Dec 2018 21:20:53 -0500 Subject: [PATCH 37/41] DOC - add advanced.rst section --- doc/source/advanced.rst | 16 ++++++++++++-- .../tests/indexes/multi/test_constructor.py | 21 ++++++++++++------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 24a1ac7be7d1d..b9d7b5c3ab711 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -57,8 +57,9 @@ The :class:`MultiIndex` object is the hierarchical analogue of the standard can think of ``MultiIndex`` as an array of tuples where each tuple is unique. A ``MultiIndex`` can be created from a list of arrays (using :meth:`MultiIndex.from_arrays`), an array of tuples (using -:meth:`MultiIndex.from_tuples`), or a crossed set of iterables (using -:meth:`MultiIndex.from_product`). The ``Index`` constructor will attempt to return +:meth:`MultiIndex.from_tuples`), a crossed set of iterables (using +:meth:`MultiIndex.from_product`), or a :class:`DataFrame` (using +:meth:`MultiIndex.from_frame`). The ``Index`` constructor will attempt to return a ``MultiIndex`` when it is passed a list of tuples. The following examples demonstrate different ways to initialize MultiIndexes. @@ -84,6 +85,17 @@ to use the :meth:`MultiIndex.from_product` method: iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] pd.MultiIndex.from_product(iterables, names=['first', 'second']) +You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using +the method :meth:`MultiIndex.from_frame`. This is a complementary method to +:meth:`MultiIndex.to_frame`. + +.. 
ipython:: python + + df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], + ['foo', 'one'], ['foo', 'two']], + columns=['first', 'second']) + pd.MultiIndex.from_frame(df) + As a convenience, you can pass a list of arrays directly into ``Series`` or ``DataFrame`` to construct a ``MultiIndex`` automatically: diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b69ea9df86c56..9eb4d433ba145 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -544,18 +544,25 @@ def test_from_frame_dtype_fidelity(): @pytest.mark.parametrize('names_in,names_out', [ (None, [('L1', 'x'), ('L2', 'y')]), (['x', 'y'], ['x', 'y']), +]) +def test_from_frame_valid_names(names_in, names_out): + # GH 22420 + df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], + columns=pd.MultiIndex.from_tuples([('L1', 'x'), + ('L2', 'y')])) + mi = pd.MultiIndex.from_frame(df, names=names_in) + assert mi.names == names_out + + +@pytest.mark.parametrize('names_in,names_out', [ ('bad_input', ValueError("Names should be list-like for a MultiIndex")), (['a', 'b', 'c'], ValueError("Length of names must match number of " "levels in MultiIndex.")) ]) -def test_from_frame_names(names_in, names_out): +def test_from_frame_invalid_names(names_in, names_out): # GH 22420 df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], columns=pd.MultiIndex.from_tuples([('L1', 'x'), ('L2', 'y')])) - if isinstance(names_out, Exception): - with pytest.raises(type(names_out), match=names_out.args[0]): - pd.MultiIndex.from_frame(df, names=names_in) - else: - mi = pd.MultiIndex.from_frame(df, names=names_in) - assert mi.names == names_out + with pytest.raises(type(names_out), match=names_out.args[0]): + pd.MultiIndex.from_frame(df, names=names_in) From 9b906c68beee509764ed2a54b4872588b4a4ba26 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 5 Dec 2018 22:08:35 -0500 Subject: [PATCH 
38/41] DOC/CLN - cleanup documentation --- doc/source/advanced.rst | 2 + doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/indexes/multi.py | 46 ++++++++----------- .../tests/indexes/multi/test_constructor.py | 2 +- 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index ea9fcf3e694e4..0cc2cea774bbd 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -94,6 +94,8 @@ You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using the method :meth:`MultiIndex.from_frame`. This is a complementary method to :meth:`MultiIndex.to_frame`. +.. versionadded:: 0.24.0 + .. ipython:: python df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d25a8324bd92a..421b511aa783e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -378,7 +378,7 @@ Backwards incompatible API changes - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) -- The column order of the resultant ``DataFrame`` from ``MultiIndex.to_frame()`` is now guaranteed to match the ``MultiIndex.names`` order. (:issue:`22420`) +- The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) .. 
_whatsnew_0240.api_breaking.deps: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae69aab193423..9f978019cb537 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -463,37 +463,29 @@ def from_frame(cls, df, sortorder=None, names=None): Examples -------- - >>> df = pd.DataFrame([[0, 'happy'], [0, 'jolly'], [1, 'happy'], - ... [1, 'jolly'], [2, 'joy'], [2, 'joy']], - ... columns=['will_be', 'used']) + >>> df = pd.DataFrame([['ahc', 'happy'], ['ahc', 'jolly'], + ... ['boh', 'happy'], ['boh', 'jolly'], + ... ['oil', 'joy'], ['oil', 'joy']], + ... columns=['a', 'b']) >>> df - will_be used - 0 0 happy - 1 0 jolly - 2 1 happy - 3 1 jolly - 4 2 joy - 5 2 joy + a b + 0 ahc happy + 1 ahc jolly + 2 boh happy + 3 boh jolly + 4 oil joy + 5 oil joy + >>> pd.MultiIndex.from_frame(df) - MultiIndex(levels=[[0, 1, 2], ['happy', 'jolly', 'joy']], + MultiIndex(levels=[['ahc', 'boh', 'oil'], ['happy', 'jolly', 'joy']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], - names=['will_be', 'used']) + names=['a', 'b']) - >>> df = pd.DataFrame([['ahc', 'iam'], ['ahc', 'wim'], ['boh', 'amg'], - ... ['boh', 'iam'], ['oil', 'wim'], ['oil', 'amg']], - ... 
columns=['will_be', 'overriden']) - >>> df - will_be overriden - 0 ahc iam - 1 ahc wim - 2 boh amg - 3 boh iam - 4 oil wim - 5 oil amg - >>> pd.MultiIndex.from_frame(df, names=['sure', 'will']) - MultiIndex(levels=[['ahc', 'boh', 'oil'], ['amg', 'iam', 'wim']], - labels=[[0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]], - names=['sure', 'will']) + # Use explicit names, instead of column names + >>> pd.MultiIndex.from_frame(df, names=['X', 'Y']) + MultiIndex(levels=[['ahc', 'boh', 'oil'], ['happy', 'jolly', 'joy']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], + names=['X', 'Y']) See Also -------- diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index b1cbd996cbde8..562159dbd0f1a 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -521,7 +521,7 @@ def test_from_frame(): np.array([[1, 2], [3, 4], [5, 6]]), 27 ]) -def test_from_frame_non_frame(non_frame): +def test_from_frame_error(non_frame): # GH 22420 with pytest.raises(TypeError, match='Input must be a DataFrame'): pd.MultiIndex.from_frame(non_frame) From e41612273a9bc0b4e32ae3d8dec5861e4a8c4b39 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Wed, 5 Dec 2018 22:36:42 -0500 Subject: [PATCH 39/41] CLN - fix linting error according to pandas-dev.pandas test --- pandas/core/indexes/multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9f978019cb537..4b0a14430cd29 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1466,7 +1466,6 @@ def to_frame(self, index=True, name=None): copy=False ) - if index: result.index = self return result From 4ef9ec40bae8f0a48752f193cb21f4aeb1cc6e65 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Thu, 6 Dec 2018 20:58:53 -0500 Subject: [PATCH 40/41] DOC - fix docstrings --- pandas/core/indexes/multi.py | 195 ++++++++++++++++++----------------- 1 file changed, 102 
insertions(+), 93 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4b0a14430cd29..893d936771fa8 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -127,53 +127,25 @@ class MultiIndex(Index): Parameters ---------- levels : sequence of arrays - The unique labels for each level + The unique labels for each level. codes : sequence of arrays - Integers for each level designating which label at each location + Integers for each level designating which label at each location. .. versionadded:: 0.24.0 labels : sequence of arrays - Integers for each level designating which label at each location + Integers for each level designating which label at each location. .. deprecated:: 0.24.0 Use ``codes`` instead sortorder : optional int Level of sortedness (must be lexicographically sorted by that - level) + level). names : optional sequence of objects - Names for each of the index levels. (name is accepted for compat) - copy : boolean, default False - Copy the meta-data - verify_integrity : boolean, default True - Check that the levels/codes are consistent and valid - - Examples - --------- - A new ``MultiIndex`` is typically constructed using one of the helper - methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` - and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): - - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex(levels=[[1, 2], ['blue', 'red']], - labels=[[0, 0, 1, 1], [1, 0, 1, 0]], - names=['number', 'color']) - - See further examples for how to construct a MultiIndex in the doc strings - of the mentioned helper methods. - - Notes - ----- - See the `user guide - <http://pandas.pydata.org/pandas-docs/stable/advanced.html>`_ for more. - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_product : Create a MultiIndex from the cartesian product - of iterables. 
- MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. - Index : The base pandas Index type. + Names for each of the index levels. (name is accepted for compat). + copy : bool, default False + Copy the meta-data. + verify_integrity : bool, default True + Check that the levels/codes are consistent and valid. Attributes ---------- @@ -199,6 +171,35 @@ class MultiIndex(Index): swaplevel reorder_levels remove_unused_levels + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_product : Create a MultiIndex from the cartesian product + of iterables. + MultiIndex.from_tuples : Convert list of tuples to a MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + Index : The base pandas Index type. + + Examples + --------- + A new ``MultiIndex`` is typically constructed using one of the helper + methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product` + and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``): + + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) + + See further examples for how to construct a MultiIndex in the doc strings + of the mentioned helper methods. + + Notes + ----- + See the `user guide + <http://pandas.pydata.org/pandas-docs/stable/advanced.html>`_ for more. """ # initialize to zero-length tuples to make everything work @@ -291,7 +292,7 @@ def _verify_integrity(self, codes=None, levels=None): @classmethod def from_arrays(cls, arrays, sortorder=None, names=None): """ - Convert arrays to MultiIndex + Convert arrays to MultiIndex. Parameters ---------- @@ -300,23 +301,28 @@ def from_arrays(cls, arrays, sortorder=None, names=None): len(arrays) is the number of levels. sortorder : int or None Level of sortedness (must be lexicographically sorted by that - level) + level). 
+ names : list / sequence of str, optional + Names for the levels in the index. Returns ------- index : MultiIndex - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - See Also -------- MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) """ if not is_list_like(arrays): raise TypeError("Input must be a list / sequence of array-likes.") @@ -341,7 +347,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): @classmethod def from_tuples(cls, tuples, sortorder=None, names=None): """ - Convert list of tuples to MultiIndex + Convert list of tuples to MultiIndex. Parameters ---------- @@ -349,24 +355,29 @@ def from_tuples(cls, tuples, sortorder=None, names=None): Each tuple is the index of one row/column. sortorder : int or None Level of sortedness (must be lexicographically sorted by that - level) + level). + names : list / sequence of str, optional + Names for the levels in the index. Returns ------- index : MultiIndex - Examples - -------- - >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - See Also -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables + of iterables. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + ... 
(2, u'red'), (2, u'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['number', 'color']) """ if not is_list_like(tuples): raise TypeError('Input must be a list / sequence of tuple-likes.') @@ -393,7 +404,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): @classmethod def from_product(cls, iterables, sortorder=None, names=None): """ - Make a MultiIndex from the cartesian product of multiple iterables + Make a MultiIndex from the cartesian product of multiple iterables. Parameters ---------- @@ -402,28 +413,28 @@ def from_product(cls, iterables, sortorder=None, names=None): sortorder : int or None Level of sortedness (must be lexicographically sorted by that level). - names : list / sequence of strings or None + names : list / sequence of str, optional Names for the levels in the index. Returns ------- index : MultiIndex - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) - See Also -------- MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + ... 
names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], ['green', 'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=['number', 'color']) """ from pandas.core.arrays.categorical import _factorize_from_iterables from pandas.core.reshape.util import cartesian_product @@ -446,53 +457,51 @@ def from_frame(cls, df, sortorder=None, names=None): Parameters ---------- - df : pd.DataFrame + df : DataFrame DataFrame to be converted to MultiIndex. - sortorder : int or None + sortorder : int, optional Level of sortedness (must be lexicographically sorted by that level). - names : list-like, optonal + names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite names with the given sequence. Returns ------- - MultiIndex or Index + MultiIndex The MultiIndex representation of the given DataFrame. + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + Examples -------- - >>> df = pd.DataFrame([['ahc', 'happy'], ['ahc', 'jolly'], - ... ['boh', 'happy'], ['boh', 'jolly'], - ... ['oil', 'joy'], ['oil', 'joy']], + >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], ... 
columns=['a', 'b']) >>> df - a b - 0 ahc happy - 1 ahc jolly - 2 boh happy - 3 boh jolly - 4 oil joy - 5 oil joy + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip >>> pd.MultiIndex.from_frame(df) - MultiIndex(levels=[['ahc', 'boh', 'oil'], ['happy', 'jolly', 'joy']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], + MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], names=['a', 'b']) - # Use explicit names, instead of column names - >>> pd.MultiIndex.from_frame(df, names=['X', 'Y']) - MultiIndex(levels=[['ahc', 'boh', 'oil'], ['happy', 'jolly', 'joy']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 2, 2]], - names=['X', 'Y']) + Using explicit names, instead of the column names - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. + >>> pd.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex(levels=[['HI', 'NJ'], ['Precip', 'Temp']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]], + names=['state', 'observation']) """ if not isinstance(df, ABCDataFrame): raise TypeError("Input must be a DataFrame") From 4240a1eebaf91653d4951703d17970f9ece2a543 Mon Sep 17 00:00:00 2001 From: Artin Sarraf Date: Thu, 6 Dec 2018 21:10:53 -0500 Subject: [PATCH 41/41] CLN - fix import order with isort --- pandas/core/indexes/multi.py | 2 +- pandas/tests/indexes/multi/test_constructor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 893d936771fa8..c7f14634757f7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,6 @@ # pylint: disable=E1101,E1103,W0232 -import datetime from collections import OrderedDict +import datetime from sys import getsizeof import warnings diff --git a/pandas/tests/indexes/multi/test_constructor.py 
b/pandas/tests/indexes/multi/test_constructor.py index 562159dbd0f1a..e6678baf8a996 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -import re from collections import OrderedDict +import re import numpy as np import pytest