Skip to content

ENH: Add allow_duplicates to MultiIndex.to_frame #45318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ enhancement2

Other enhancements
^^^^^^^^^^^^^^^^^^
- :meth:`MultiIndex.to_frame` now supports the argument ``allow_duplicates`` and raises a ``ValueError`` on duplicate column labels if it is missing or ``False`` (:issue:`45245`)
- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,18 @@ def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]:
attrs.append(("length", len(self)))
return attrs

@final
def _get_level_names(self) -> Hashable | Sequence[Hashable]:
"""
Return a name or list of names with None replaced by the level number.
"""
if self._is_multi:
return [
level if name is None else name for level, name in enumerate(self.names)
]
else:
return 0 if self.name is None else self.name

@final
def _mpl_repr(self) -> np.ndarray:
# how to represent ourselves to matplotlib
Expand Down Expand Up @@ -1629,7 +1641,7 @@ def to_frame(
from pandas import DataFrame

if name is lib.no_default:
name = self.name or 0
name = self._get_level_names()
result = DataFrame({name: self._values.copy()})

if index:
Expand Down
25 changes: 19 additions & 6 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,12 @@ def unique(self, level=None):
level = self._get_level_number(level)
return self._get_level_values(level=level, unique=True)

def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame:
def to_frame(
self,
index: bool = True,
name=lib.no_default,
allow_duplicates: bool = False,
) -> DataFrame:
"""
Create a DataFrame with the levels of the MultiIndex as columns.

Expand All @@ -1725,6 +1730,11 @@ def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame:
name : list / sequence of str, optional
The passed names should substitute index level names.

allow_duplicates : bool, optional default False
Allow duplicate column labels to be created.

.. versionadded:: 1.5.0

Returns
-------
DataFrame : a DataFrame containing the original MultiIndex data.
Expand Down Expand Up @@ -1772,16 +1782,19 @@ def to_frame(self, index: bool = True, name=lib.no_default) -> DataFrame:
)
idx_names = name
else:
idx_names = self.names
idx_names = self._get_level_names()

if not allow_duplicates and len(set(idx_names)) != len(idx_names):
raise ValueError(
"Cannot create duplicate column labels if allow_duplicates is False"
)

# Guarantee resulting column order - PY36+ dict maintains insertion order
result = DataFrame(
{
(level if lvlname is None else lvlname): self._get_level_values(level)
for lvlname, level in zip(idx_names, range(len(self.levels)))
},
{level: self._get_level_values(level) for level in range(len(self.levels))},
copy=False,
)
result.columns = idx_names

if index:
result.index = self
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/indexes/multi/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,28 @@ def test_to_frame_resulting_column_order():
assert result == expected


def test_to_frame_duplicate_labels():
    # GH 45245
    rows = [(1, 2), (3, 4)]
    msg = "Cannot create duplicate column labels"

    # Case 1: both levels carry the same explicit name.
    dup_names = ["a", "a"]
    mi = MultiIndex.from_tuples(rows, names=dup_names)
    with pytest.raises(ValueError, match=msg):
        mi.to_frame()

    result = mi.to_frame(allow_duplicates=True)
    expected = DataFrame(rows, index=mi, columns=dup_names)
    tm.assert_frame_equal(result, expected)

    # Case 2: the unnamed first level falls back to its position (0),
    # which collides with the second level's explicit name 0.
    mi = MultiIndex.from_tuples(rows, names=[None, 0])
    with pytest.raises(ValueError, match=msg):
        mi.to_frame()

    result = mi.to_frame(allow_duplicates=True)
    expected = DataFrame(rows, index=mi, columns=[0, 0])
    tm.assert_frame_equal(result, expected)


def test_to_flat_index(idx):
expected = pd.Index(
(
Expand Down