Skip to content

CLN: Remove mangle_dupe_cols argument #48037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ Other API changes
- Operations with :class:`Timestamp` or :class:`Timedelta` that would previously raise ``OverflowError`` instead raise ``OutOfBoundsDatetime`` or ``OutOfBoundsTimedelta`` where appropriate (:issue:`47268`)
- When :func:`read_sas` previously returned ``None``, it now returns an empty :class:`DataFrame` (:issue:`47410`)
- :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`)
- Removed the ``mangle_dupe_cols`` argument from :func:`read_csv`, :func:`read_table`, :func:`read_fwf` and :func:`read_excel`. The argument was never fully implemented and only supported the value ``True`` (:issue:`47718`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.deprecations:
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/parsers.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class TextReader:
skiprows=...,
skipfooter: int = ..., # int64_t
verbose: bool = ...,
mangle_dupe_cols: bool = ...,
float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
skip_blank_lines: bool = ...,
encoding_errors: bytes | str = ...,
Expand Down
7 changes: 2 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ cdef class TextReader:
object handle
object orig_header
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
bint mangle_dupe_cols, allow_leading_cols
bint allow_leading_cols
uint64_t parser_start # this is modified after __init__
list clocks
const char *encoding_errors
Expand Down Expand Up @@ -367,7 +367,6 @@ cdef class TextReader:
skiprows=None,
skipfooter=0, # int64_t
bint verbose=False,
bint mangle_dupe_cols=True,
float_precision=None,
bint skip_blank_lines=True,
encoding_errors=b"strict"):
Expand All @@ -383,8 +382,6 @@ cdef class TextReader:
self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize

self.mangle_dupe_cols = mangle_dupe_cols

# For timekeeping
self.clocks = []

Expand Down Expand Up @@ -672,7 +669,7 @@ cdef class TextReader:

this_header.append(name)

if not self.has_mi_columns and self.mangle_dupe_cols:
if not self.has_mi_columns:
# Ensure that regular columns are used before unnamed ones
# to keep given names and mangle unnamed columns
col_loop_order = [i for i in range(len(this_header))
Expand Down
12 changes: 0 additions & 12 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,6 @@
.. deprecated:: 1.3.0
convert_float will be removed in a future version

mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
{storage_options}

.. versionadded:: 1.2.0
Expand Down Expand Up @@ -386,7 +382,6 @@ def read_excel(
comment: str | None = ...,
skipfooter: int = ...,
convert_float: bool | None = ...,
mangle_dupe_cols: bool = ...,
storage_options: StorageOptions = ...,
) -> DataFrame:
...
Expand Down Expand Up @@ -425,7 +420,6 @@ def read_excel(
comment: str | None = ...,
skipfooter: int = ...,
convert_float: bool | None = ...,
mangle_dupe_cols: bool = ...,
storage_options: StorageOptions = ...,
) -> dict[IntStrT, DataFrame]:
...
Expand Down Expand Up @@ -465,7 +459,6 @@ def read_excel(
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
storage_options: StorageOptions = None,
) -> DataFrame | dict[IntStrT, DataFrame]:

Expand Down Expand Up @@ -504,7 +497,6 @@ def read_excel(
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
)
finally:
# make sure to close opened file handles
Expand Down Expand Up @@ -709,7 +701,6 @@ def parse(
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
**kwds,
):

Expand Down Expand Up @@ -877,7 +868,6 @@ def parse(
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)

Expand Down Expand Up @@ -1686,7 +1676,6 @@ def parse(
comment: str | None = None,
skipfooter: int = 0,
convert_float: bool | None = None,
mangle_dupe_cols: bool = True,
**kwds,
) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
"""
Expand Down Expand Up @@ -1719,7 +1708,6 @@ def parse(
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)

Expand Down
37 changes: 17 additions & 20 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def __init__(self, kwds) -> None:

self.true_values = kwds.get("true_values")
self.false_values = kwds.get("false_values")
self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
self.cache_dates = kwds.pop("cache_dates", True)

Expand Down Expand Up @@ -325,33 +324,32 @@ def extract(r):
return names, index_names, col_names, passed_names

@final
def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# see gh-7160 and gh-9424: this helps to provide
# immediate alleviation of the duplicate names
# issue and appears to be satisfactory to users,
# but ultimately, not needing to butcher the names
# would be nice!
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
cur_count = counts[col]
for i, col in enumerate(names):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1
while cur_count > 0:
counts[col] = cur_count + 1

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]
if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]

names[i] = col
counts[col] = cur_count + 1
names[i] = col
counts[col] = cur_count + 1

return names

Expand Down Expand Up @@ -1135,7 +1133,6 @@ def converter(*date_cols):
"encoding": None,
"squeeze": None,
"compression": None,
"mangle_dupe_cols": True,
"infer_datetime_format": False,
"skip_blank_lines": True,
"encoding_errors": "strict",
Expand Down
6 changes: 3 additions & 3 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def read(
except StopIteration:
if self._first_chunk:
self._first_chunk = False
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)
index, columns, col_dict = self._get_empty_meta(
names,
self.index_col,
Expand Down Expand Up @@ -295,7 +295,7 @@ def read(
if self.usecols is not None:
names = self._filter_usecols(names)

names = self._maybe_dedup_names(names)
names = self._dedup_names(names)

# rename dict keys
data_tups = sorted(data.items())
Expand All @@ -317,7 +317,7 @@ def read(
# assert for mypy, orig_names is List or None, None would error in list(...)
assert self.orig_names is not None
names = list(self.orig_names)
names = self._maybe_dedup_names(names)
names = self._dedup_names(names)

if self.usecols is not None:
names = self._filter_usecols(names)
Expand Down
6 changes: 3 additions & 3 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def read(
columns: Sequence[Hashable] = list(self.orig_names)
if not len(content): # pragma: no cover
# DataFrame with the right metadata, even though it's length 0
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)
# error: Cannot determine type of 'index_col'
index, columns, col_dict = self._get_empty_meta(
names,
Expand Down Expand Up @@ -295,7 +295,7 @@ def _exclude_implicit_index(
self,
alldata: list[np.ndarray],
) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)

offset = 0
if self._implicit_index:
Expand Down Expand Up @@ -426,7 +426,7 @@ def _infer_columns(
else:
this_columns.append(c)

if not have_mi_columns and self.mangle_dupe_cols:
if not have_mi_columns:
counts: DefaultDict = defaultdict(int)
# Ensure that regular columns are used before unnamed ones
# to keep given names and mangle unnamed columns
Expand Down
17 changes: 0 additions & 17 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,6 @@

.. deprecated:: 1.4.0
Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Expand Down Expand Up @@ -618,7 +614,6 @@ def read_csv(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -678,7 +673,6 @@ def read_csv(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -738,7 +732,6 @@ def read_csv(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -798,7 +791,6 @@ def read_csv(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -867,7 +859,6 @@ def read_csv(
usecols=None,
squeeze: bool | None = None,
prefix: str | lib.NoDefault = lib.no_default,
mangle_dupe_cols: bool = True,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
Expand Down Expand Up @@ -956,7 +947,6 @@ def read_table(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1016,7 +1006,6 @@ def read_table(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1076,7 +1065,6 @@ def read_table(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1136,7 +1124,6 @@ def read_table(
usecols=...,
squeeze: bool | None = ...,
prefix: str | lib.NoDefault = ...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1205,7 +1192,6 @@ def read_table(
usecols=None,
squeeze: bool | None = None,
prefix: str | lib.NoDefault = lib.no_default,
mangle_dupe_cols: bool = True,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
Expand Down Expand Up @@ -1468,9 +1454,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
f"The {repr(argname)} option is not supported with the "
f"'pyarrow' engine"
)
elif argname == "mangle_dupe_cols" and value is False:
# GH12935
raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
else:
options[argname] = value

Expand Down
10 changes: 0 additions & 10 deletions pandas/tests/io/excel/test_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -975,12 +975,6 @@ def test_duplicated_columns(self, path):
result = pd.read_excel(path, sheet_name="test1", index_col=0)
tm.assert_frame_equal(result, expected)

# Explicitly, we pass in the parameter.
result = pd.read_excel(
path, sheet_name="test1", index_col=0, mangle_dupe_cols=True
)
tm.assert_frame_equal(result, expected)

# see gh-11007, gh-10970
df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"])
df.to_excel(path, "test1")
Expand All @@ -998,10 +992,6 @@ def test_duplicated_columns(self, path):
expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
tm.assert_frame_equal(result, expected)

msg = "Setting mangle_dupe_cols=False is not supported yet"
with pytest.raises(ValueError, match=msg):
pd.read_excel(path, sheet_name="test1", header=None, mangle_dupe_cols=False)

def test_swapped_columns(self, path):
# Test for issue #5427.
write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]})
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,11 @@


@skip_pyarrow
@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
def test_basic(all_parsers, kwargs):
# TODO: add test for condition "mangle_dupe_cols=False"
# once it is actually supported (gh-12935)
def test_basic(all_parsers):
parser = all_parsers

data = "a,a,b,b,b\n1,2,3,4,5"
result = parser.read_csv(StringIO(data), sep=",", **kwargs)
result = parser.read_csv(StringIO(data), sep=",")

expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
tm.assert_frame_equal(result, expected)
Expand Down
Loading