From 3194d97ea48c6b10689c07e94630e80d40c53f37 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 11 Jan 2022 18:48:05 +0000 Subject: [PATCH 1/8] Add allow_duplicates to to_frame --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/indexes/multi.py | 28 +++++++++++++++---- pandas/tests/indexes/multi/test_conversion.py | 22 +++++++++++++++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5dd6fa0d4f72d..0ff215ec68a6e 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -226,6 +226,7 @@ Other enhancements - :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). - Add support for `Zstandard `_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`) - :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`) +- :meth:`MultiIndex.to_frame` now supports the argument ``allow_duplicates`` and raises on duplicate labels if it is missing or False (:issue:`45245`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 676fa4962cd56..b0cdfa9e7e901 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1710,7 +1710,12 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame: + def to_frame( + self, + index: bool = True, + name=lib.no_default, + allow_duplicates: bool = False, + ) -> DataFrame: """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1725,6 +1730,11 @@ def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame: name : list / sequence of str, optional The passed names should substitute index level names. + allow_duplicates : bool, optional default False + Allow duplicate column labels to be created. + + .. versionadded:: 1.4.0 + Returns ------- DataFrame : a DataFrame containing the original MultiIndex data. @@ -1774,14 +1784,20 @@ def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame: else: idx_names = self.names + idx_names = [ + level if name is None else name for level, name in enumerate(idx_names) + ] + + if not allow_duplicates and len(set(idx_names)) != len(idx_names): + raise ValueError( + "Cannot create duplicate column labels if allow_duplicates is False" + ) + # Guarantee resulting column order - PY36+ dict maintains insertion order result = DataFrame( - { - (level if lvlname is None else lvlname): self._get_level_values(level) - for lvlname, level in zip(idx_names, range(len(self.levels))) - }, + {level: self._get_level_values(level) for level in range(len(self.levels))}, copy=False, - ) + ).set_axis(idx_names, axis=1) if index: result.index = self diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 072055e4824a7..3c2ca045d6f99 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -126,6 +126,28 @@ def test_to_frame_resulting_column_order(): assert result == expected +def test_to_frame_duplicate_labels(): + # GH 45245 + data = [(1, 2), (3, 4)] + names = ["a", "a"] + index = MultiIndex.from_tuples(data, names=names) + with pytest.raises(ValueError, match="Cannot create duplicate column labels"): + index.to_frame() + + result = index.to_frame(allow_duplicates=True) + expected = DataFrame(data, index=index, columns=names) + tm.assert_frame_equal(result, expected) + + names = [None, 0] + index = MultiIndex.from_tuples(data, names=names) + with pytest.raises(ValueError, match="Cannot create duplicate column labels"): + index.to_frame() + + result = index.to_frame(allow_duplicates=True) + expected = DataFrame(data, index=index, columns=[0, 0]) + tm.assert_frame_equal(result, expected) + + def test_to_flat_index(idx): expected = pd.Index( ( From a3359b9e3a481934acee13a62d382c449ffba30c Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sun, 16 Jan 2022 19:14:29 +0000 Subject: [PATCH 2/8] Review changes --- doc/source/whatsnew/v1.4.0.rst | 2 +- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/indexes/multi.py | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0ff215ec68a6e..5c7a9e35aea63 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -226,7 +226,7 @@ Other enhancements - :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`). - Add support for `Zstandard `_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`) - :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`) -- :meth:`MultiIndex.to_frame` now supports the argument ``allow_duplicates`` and raises on duplicate labels if it is missing or False (:issue:`45245`) + .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0173807cb9bd0..89ac122861366 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -30,7 +30,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`MultiIndex.to_frame` now supports the argument ``allow_duplicates`` and raises on duplicate labels if it is missing or False (:issue:`45245`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b0cdfa9e7e901..e434305c5bf1d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1733,7 +1733,7 @@ def to_frame( allow_duplicates : bool, optional default False Allow duplicate column labels to be created. - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 Returns ------- @@ -1797,7 +1797,8 @@ def to_frame( result = DataFrame( {level: self._get_level_values(level) for level in range(len(self.levels))}, copy=False, - ).set_axis(idx_names, axis=1) + ) + result.columns = idx_names if index: result.index = self From 0444265794a29676578f91b32678b306a43b72b4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 17 Jan 2022 14:21:30 +0000 Subject: [PATCH 3/8] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f341ac7bfa7e6..485ad4cedcacb 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -227,7 +227,6 @@ Other enhancements - Add support for `Zstandard `_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`) - :meth:`DataFrame.to_sql` now returns an ``int`` of the number of written rows (:issue:`23998`) - .. --------------------------------------------------------------------------- .. _whatsnew_140.notable_bug_fixes: From 31110eb8ebd3b9b89257ddd5a21741e946fac3e8 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 17 Jan 2022 18:45:01 +0000 Subject: [PATCH 4/8] factor out _make_labels --- pandas/core/indexes/base.py | 12 +++++++++++- pandas/core/indexes/multi.py | 6 +----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb262a9ba416a..91712754528fa 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1353,6 +1353,16 @@ def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]: attrs.append(("length", len(self))) return attrs + @final + def _make_labels(self) -> Hashable | Sequence[Hashable]: + """ + Return a name or list of names with None replaced by the level number. + """ + if self._is_multi: + return[level if name is None else name for level, name in enumerate(self.names)] + else: + return 0 if self.name is None else self.name + @final def _mpl_repr(self) -> np.ndarray: # how to represent ourselves to matplotlib @@ -1628,7 +1638,7 @@ def to_frame( from pandas import DataFrame if name is lib.no_default: - name = self.name or 0 + name = self._make_labels() result = DataFrame({name: self._values.copy()}) if index: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e434305c5bf1d..0daab29a5b31a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1782,11 +1782,7 @@ def to_frame( ) idx_names = name else: - idx_names = self.names - - idx_names = [ - level if name is None else name for level, name in enumerate(idx_names) - ] + idx_names = self._make_labels() if not allow_duplicates and len(set(idx_names)) != len(idx_names): raise ValueError( From 4c0b994712f26c18578825d05823d9d8c657e8a6 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Mon, 17 Jan 2022 18:46:28 +0000 Subject: [PATCH 5/8] Update base.py --- pandas/core/indexes/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 91712754528fa..af6381f9b453b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1359,7 +1359,9 @@ def _make_labels(self) -> Hashable | Sequence[Hashable]: Return a name or list of names with None replaced by the level number. """ if self._is_multi: - return[level if name is None else name for level, name in enumerate(self.names)] + return [ + level if name is None else name for level, name in enumerate(self.names) + ] else: return 0 if self.name is None else self.name From e69fc478ecfa6486521d1af03344fbe60608da88 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Wed, 19 Jan 2022 11:12:48 +0000 Subject: [PATCH 6/8] Trigger CI From 7f8fd325c9aea419987b24daf9ccf214e9cb2985 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 20 Jan 2022 08:27:38 +0000 Subject: [PATCH 7/8] _get_level_names --- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/multi.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index af6381f9b453b..3c7260c116fcc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1354,7 +1354,7 @@ def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]: return attrs @final - def _make_labels(self) -> Hashable | Sequence[Hashable]: + def _get_level_names(self) -> Hashable | Sequence[Hashable]: """ Return a name or list of names with None replaced by the level number. """ @@ -1640,7 +1640,7 @@ def to_frame( from pandas import DataFrame if name is lib.no_default: - name = self._make_labels() + name = self._get_level_names() result = DataFrame({name: self._values.copy()}) if index: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0daab29a5b31a..9da513c60c997 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1782,7 +1782,7 @@ def to_frame( ) idx_names = name else: - idx_names = self._make_labels() + idx_names = self._get_level_names() if not allow_duplicates and len(set(idx_names)) != len(idx_names): raise ValueError( From 29ac6b6c7676296a3d464bc1388a6b8e15fc8d45 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 20 Jan 2022 11:34:44 +0000 Subject: [PATCH 8/8] Trigger CI