pandas-dev · datapythonista · Aug 11, 2022 · Aug 11, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -548,6 +548,7 @@ Other API changes
 - Operations with :class:`Timestamp` or :class:`Timedelta` that would previously raise ``OverflowError`` instead raise ``OutOfBoundsDatetime`` or ``OutOfBoundsTimedelta`` where appropriate (:issue:`47268`)
 - When :func:`read_sas` previously returned ``None``, it now returns an empty :class:`DataFrame` (:issue:`47410`)
 - :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`)
+- Removed the ``mangle_dupe_cols`` argument from :func:`read_csv`, :func:`read_table`, :func:`read_fwf` and :func:`read_excel`. The argument was never fully implemented and only supported the value ``True`` (:issue:`47718`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.deprecations:

diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi
@@ -58,7 +58,6 @@ class TextReader:
         skiprows=...,
         skipfooter: int = ...,  # int64_t
         verbose: bool = ...,
-        mangle_dupe_cols: bool = ...,
         float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
         skip_blank_lines: bool = ...,
         encoding_errors: bytes | str = ...,

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -312,7 +312,7 @@ cdef class TextReader:
         object handle
         object orig_header
         bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
-        bint mangle_dupe_cols, allow_leading_cols
+        bint allow_leading_cols
         uint64_t parser_start  # this is modified after __init__
         list clocks
         const char *encoding_errors
@@ -367,7 +367,6 @@ cdef class TextReader:
                   skiprows=None,
                   skipfooter=0,         # int64_t
                   bint verbose=False,
-                  bint mangle_dupe_cols=True,
                   float_precision=None,
                   bint skip_blank_lines=True,
                   encoding_errors=b"strict"):
@@ -383,8 +382,6 @@ cdef class TextReader:
         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize
 
-        self.mangle_dupe_cols = mangle_dupe_cols
-
         # For timekeeping
         self.clocks = []
 
@@ -672,7 +669,7 @@ cdef class TextReader:
 
                     this_header.append(name)
 
-                if not self.has_mi_columns and self.mangle_dupe_cols:
+                if not self.has_mi_columns:
                     # Ensure that regular columns are used before unnamed ones
                     # to keep given names and mangle unnamed columns
                     col_loop_order = [i for i in range(len(this_header))

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -275,10 +275,6 @@
     .. deprecated:: 1.3.0
         convert_float will be removed in a future version
 
-mangle_dupe_cols : bool, default True
-    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
-    'X'...'X'. Passing in False will cause data to be overwritten if there
-    are duplicate names in the columns.
 {storage_options}
 
     .. versionadded:: 1.2.0
@@ -386,7 +382,6 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     convert_float: bool | None = ...,
-    mangle_dupe_cols: bool = ...,
     storage_options: StorageOptions = ...,
 ) -> DataFrame:
     ...
@@ -425,7 +420,6 @@ def read_excel(
     comment: str | None = ...,
     skipfooter: int = ...,
     convert_float: bool | None = ...,
-    mangle_dupe_cols: bool = ...,
     storage_options: StorageOptions = ...,
 ) -> dict[IntStrT, DataFrame]:
     ...
@@ -465,7 +459,6 @@ def read_excel(
     comment: str | None = None,
     skipfooter: int = 0,
     convert_float: bool | None = None,
-    mangle_dupe_cols: bool = True,
     storage_options: StorageOptions = None,
 ) -> DataFrame | dict[IntStrT, DataFrame]:
 
@@ -504,7 +497,6 @@ def read_excel(
             comment=comment,
             skipfooter=skipfooter,
             convert_float=convert_float,
-            mangle_dupe_cols=mangle_dupe_cols,
         )
     finally:
         # make sure to close opened file handles
@@ -709,7 +701,6 @@ def parse(
         comment: str | None = None,
         skipfooter: int = 0,
         convert_float: bool | None = None,
-        mangle_dupe_cols: bool = True,
         **kwds,
     ):
 
@@ -877,7 +868,6 @@ def parse(
                     comment=comment,
                     skipfooter=skipfooter,
                     usecols=usecols,
-                    mangle_dupe_cols=mangle_dupe_cols,
                     **kwds,
                 )
 
@@ -1686,7 +1676,6 @@ def parse(
         comment: str | None = None,
         skipfooter: int = 0,
         convert_float: bool | None = None,
-        mangle_dupe_cols: bool = True,
         **kwds,
     ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
         """
@@ -1719,7 +1708,6 @@ def parse(
             comment=comment,
             skipfooter=skipfooter,
             convert_float=convert_float,
-            mangle_dupe_cols=mangle_dupe_cols,
             **kwds,
         )
 

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -112,7 +112,6 @@ def __init__(self, kwds) -> None:
 
         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
-        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
         self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
         self.cache_dates = kwds.pop("cache_dates", True)
 
@@ -325,33 +324,32 @@ def extract(r):
         return names, index_names, col_names, passed_names
 
     @final
-    def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
+    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
         # see gh-7160 and gh-9424: this helps to provide
         # immediate alleviation of the duplicate names
         # issue and appears to be satisfactory to users,
         # but ultimately, not needing to butcher the names
         # would be nice!
-        if self.mangle_dupe_cols:
-            names = list(names)  # so we can index
-            counts: DefaultDict[Hashable, int] = defaultdict(int)
-            is_potential_mi = _is_potential_multi_index(names, self.index_col)
+        names = list(names)  # so we can index
+        counts: DefaultDict[Hashable, int] = defaultdict(int)
+        is_potential_mi = _is_potential_multi_index(names, self.index_col)
 
-            for i, col in enumerate(names):
-                cur_count = counts[col]
+        for i, col in enumerate(names):
+            cur_count = counts[col]
 
-                while cur_count > 0:
-                    counts[col] = cur_count + 1
+            while cur_count > 0:
+                counts[col] = cur_count + 1
 
-                    if is_potential_mi:
-                        # for mypy
-                        assert isinstance(col, tuple)
-                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                    else:
-                        col = f"{col}.{cur_count}"
-                    cur_count = counts[col]
+                if is_potential_mi:
+                    # for mypy
+                    assert isinstance(col, tuple)
+                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+                else:
+                    col = f"{col}.{cur_count}"
+                cur_count = counts[col]
 
-                names[i] = col
-                counts[col] = cur_count + 1
+            names[i] = col
+            counts[col] = cur_count + 1
 
         return names
 
@@ -1135,7 +1133,6 @@ def converter(*date_cols):
     "encoding": None,
     "squeeze": None,
     "compression": None,
-    "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -248,7 +248,7 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._maybe_dedup_names(self.orig_names)
+                names = self._dedup_names(self.orig_names)
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,
@@ -295,7 +295,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)
 
-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)
 
             # rename dict keys
             data_tups = sorted(data.items())
@@ -317,7 +317,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)
 
             if self.usecols is not None:
                 names = self._filter_usecols(names)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -261,7 +261,7 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._maybe_dedup_names(self.orig_names)
+            names = self._dedup_names(self.orig_names)
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
@@ -295,7 +295,7 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._maybe_dedup_names(self.orig_names)
+        names = self._dedup_names(self.orig_names)
 
         offset = 0
         if self._implicit_index:
@@ -426,7 +426,7 @@ def _infer_columns(
                     else:
                         this_columns.append(c)
 
-                if not have_mi_columns and self.mangle_dupe_cols:
+                if not have_mi_columns:
                     counts: DefaultDict = defaultdict(int)
                     # Ensure that regular columns are used before unnamed ones
                     # to keep given names and mangle unnamed columns

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -158,10 +158,6 @@
 
     .. deprecated:: 1.4.0
        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
-mangle_dupe_cols : bool, default True
-    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
-    'X'...'X'. Passing in False will cause data to be overwritten if there
-    are duplicate names in the columns.
 dtype : Type name or dict of column -> type, optional
     Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
     'c': 'Int64'}}
@@ -618,7 +614,6 @@ def read_csv(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -678,7 +673,6 @@ def read_csv(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -738,7 +732,6 @@ def read_csv(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -798,7 +791,6 @@ def read_csv(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -867,7 +859,6 @@ def read_csv(
     usecols=None,
     squeeze: bool | None = None,
     prefix: str | lib.NoDefault = lib.no_default,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,
@@ -956,7 +947,6 @@ def read_table(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -1016,7 +1006,6 @@ def read_table(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -1076,7 +1065,6 @@ def read_table(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -1136,7 +1124,6 @@ def read_table(
     usecols=...,
     squeeze: bool | None = ...,
     prefix: str | lib.NoDefault = ...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,
@@ -1205,7 +1192,6 @@ def read_table(
     usecols=None,
     squeeze: bool | None = None,
     prefix: str | lib.NoDefault = lib.no_default,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,
@@ -1468,9 +1454,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
                     f"The {repr(argname)} option is not supported with the "
                     f"'pyarrow' engine"
                 )
-            elif argname == "mangle_dupe_cols" and value is False:
-                # GH12935
-                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
             else:
                 options[argname] = value
 

diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
@@ -975,12 +975,6 @@ def test_duplicated_columns(self, path):
         result = pd.read_excel(path, sheet_name="test1", index_col=0)
         tm.assert_frame_equal(result, expected)
 
-        # Explicitly, we pass in the parameter.
-        result = pd.read_excel(
-            path, sheet_name="test1", index_col=0, mangle_dupe_cols=True
-        )
-        tm.assert_frame_equal(result, expected)
-
         # see gh-11007, gh-10970
         df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"])
         df.to_excel(path, "test1")
@@ -998,10 +992,6 @@ def test_duplicated_columns(self, path):
         expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
         tm.assert_frame_equal(result, expected)
 
-        msg = "Setting mangle_dupe_cols=False is not supported yet"
-        with pytest.raises(ValueError, match=msg):
-            pd.read_excel(path, sheet_name="test1", header=None, mangle_dupe_cols=False)
-
     def test_swapped_columns(self, path):
         # Test for issue #5427.
         write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]})

diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -14,14 +14,11 @@
 
 
 @skip_pyarrow
-@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
-def test_basic(all_parsers, kwargs):
-    # TODO: add test for condition "mangle_dupe_cols=False"
-    # once it is actually supported (gh-12935)
+def test_basic(all_parsers):
     parser = all_parsers
 
     data = "a,a,b,b,b\n1,2,3,4,5"
-    result = parser.read_csv(StringIO(data), sep=",", **kwargs)
+    result = parser.read_csv(StringIO(data), sep=",")
 
     expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
     tm.assert_frame_equal(result, expected)