
Commit 4d9145f

DEP: Enforce deprecation of mangle_dupe_cols (#49977)
1 parent 284758d commit 4d9145f

File tree

10 files changed: +33 -99 lines

doc/source/user_guide/io.rst
+1 -15

@@ -155,15 +155,6 @@ usecols : list-like or callable, default ``None``
     when using the c engine. The Python engine loads the data first before deciding
     which columns to drop.
-mangle_dupe_cols : boolean, default ``True``
-    Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'.
-    Passing in ``False`` will cause data to be overwritten if there are duplicate
-    names in the columns.
-
-    .. deprecated:: 1.5.0
-       The argument was never implemented, and a new argument where the
-       renaming pattern can be specified will be added instead.
-
 General parsing configuration
 +++++++++++++++++++++++++++++

@@ -587,10 +578,6 @@ If the header is in a row other than the first, pass the row number to
 Duplicate names parsing
 '''''''''''''''''''''''

-.. deprecated:: 1.5.0
-   ``mangle_dupe_cols`` was never implemented, and a new argument where the
-   renaming pattern can be specified will be added instead.
-
 If the file or header contains duplicate names, pandas will by default
 distinguish between them so as to prevent overwriting data:

@@ -599,8 +586,7 @@ distinguish between them so as to prevent overwriting data:
    data = "a,b,a\n0,1,2\n3,4,5"
    pd.read_csv(StringIO(data))

-There is no more duplicate data because ``mangle_dupe_cols=True`` by default,
-which modifies a series of duplicate columns 'X', ..., 'X' to become
+There is no more duplicate data because duplicate columns 'X', ..., 'X' become
 'X', 'X.1', ..., 'X.N'.

 .. _io.usecols:
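Not part of the diff: a minimal usage sketch of the documented behaviour after this change, under the assumption of pandas 2.0, where duplicate column names are always mangled on read and there is no keyword to opt out. Data below is illustrative only.

from io import StringIO

import pandas as pd

# The duplicate header "a" is renamed to "a.1", so no column data is overwritten.
df = pd.read_csv(StringIO("a,b,a\n0,1,2\n3,4,5"))
print(list(df.columns))  # ['a', 'b', 'a.1']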

doc/source/whatsnew/v2.0.0.rst
+1

@@ -434,6 +434,7 @@ Removal of prior version deprecations/changes
 - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
 - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
 - Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
+- Remove keyword ``mangle_dupe_cols`` from :func:`read_csv` and :func:`read_table` (:issue:`48137`)
 - Removed ``errors`` keyword from :meth:`DataFrame.where`, :meth:`Series.where`, :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`47728`)
 - Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
 - Disallow passing non-keyword arguments to :meth:`DataFrame.drop` and :meth:`Series.drop` except ``labels`` (:issue:`41486`)

pandas/_libs/parsers.pyi
-1

@@ -58,7 +58,6 @@ class TextReader:
         skiprows=...,
         skipfooter: int = ...,  # int64_t
         verbose: bool = ...,
-        mangle_dupe_cols: bool = ...,
         float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
         skip_blank_lines: bool = ...,
         encoding_errors: bytes | str = ...,

pandas/_libs/parsers.pyx
+2 -5

@@ -317,7 +317,7 @@ cdef class TextReader:
         object handle
         object orig_header
         bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
-        bint mangle_dupe_cols, allow_leading_cols
+        bint allow_leading_cols
         uint64_t parser_start  # this is modified after __init__
         list clocks
         const char *encoding_errors

@@ -373,7 +373,6 @@ cdef class TextReader:
         skiprows=None,
         skipfooter=0,  # int64_t
         bint verbose=False,
-        bint mangle_dupe_cols=True,
         float_precision=None,
         bint skip_blank_lines=True,
         encoding_errors=b"strict",

@@ -390,8 +389,6 @@ cdef class TextReader:
         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize

-        self.mangle_dupe_cols = mangle_dupe_cols
-
         # For timekeeping
         self.clocks = []

@@ -680,7 +677,7 @@ cdef class TextReader:

                 this_header.append(name)

-            if not self.has_mi_columns and self.mangle_dupe_cols:
+            if not self.has_mi_columns:
                 # Ensure that regular columns are used before unnamed ones
                 # to keep given names and mangle unnamed columns
                 col_loop_order = [i for i in range(len(this_header))

pandas/io/parsers/base_parser.py
+18 -26

@@ -125,7 +125,6 @@ def __init__(self, kwds) -> None:

         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
-        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
         self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
         self.cache_dates = kwds.pop("cache_dates", True)

@@ -333,34 +332,28 @@ def extract(r):
         return names, index_names, col_names, passed_names

     @final
-    def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
-        # see gh-7160 and gh-9424: this helps to provide
-        # immediate alleviation of the duplicate names
-        # issue and appears to be satisfactory to users,
-        # but ultimately, not needing to butcher the names
-        # would be nice!
-        if self.mangle_dupe_cols:
-            names = list(names)  # so we can index
-            counts: DefaultDict[Hashable, int] = defaultdict(int)
-            is_potential_mi = _is_potential_multi_index(names, self.index_col)
-
-            for i, col in enumerate(names):
-                cur_count = counts[col]
-
-                while cur_count > 0:
-                    counts[col] = cur_count + 1
+    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
+        names = list(names)  # so we can index
+        counts: DefaultDict[Hashable, int] = defaultdict(int)
+        is_potential_mi = _is_potential_multi_index(names, self.index_col)

-                    if is_potential_mi:
-                        # for mypy
-                        assert isinstance(col, tuple)
-                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                    else:
-                        col = f"{col}.{cur_count}"
-                    cur_count = counts[col]
+        for i, col in enumerate(names):
+            cur_count = counts[col]

-                names[i] = col
+            while cur_count > 0:
                 counts[col] = cur_count + 1

+                if is_potential_mi:
+                    # for mypy
+                    assert isinstance(col, tuple)
+                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+                else:
+                    col = f"{col}.{cur_count}"
+                cur_count = counts[col]
+
+            names[i] = col
+            counts[col] = cur_count + 1
+
         return names

     @final

@@ -1182,7 +1175,6 @@ def converter(*date_cols):
     "verbose": False,
     "encoding": None,
     "compression": None,
-    "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",

pandas/io/parsers/c_parser_wrapper.py
+3 -3

@@ -227,7 +227,7 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._maybe_dedup_names(self.orig_names)
+                names = self._dedup_names(self.orig_names)
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,

@@ -281,7 +281,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)

-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)

             # rename dict keys
             data_tups = sorted(data.items())

@@ -303,7 +303,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)

             if self.usecols is not None:
                 names = self._filter_usecols(names)

pandas/io/parsers/python_parser.py
+3 -3

@@ -259,7 +259,7 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._maybe_dedup_names(self.orig_names)
+            names = self._dedup_names(self.orig_names)
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,

@@ -293,7 +293,7 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._maybe_dedup_names(self.orig_names)
+        names = self._dedup_names(self.orig_names)

         offset = 0
         if self._implicit_index:

@@ -424,7 +424,7 @@ def _infer_columns(
                     else:
                         this_columns.append(c)

-                if not have_mi_columns and self.mangle_dupe_cols:
+                if not have_mi_columns:
                     counts: DefaultDict = defaultdict(int)
                     # Ensure that regular columns are used before unnamed ones
                     # to keep given names and mangle unnamed columns

pandas/io/parsers/readers.py
+1 -27

@@ -41,10 +41,7 @@
     AbstractMethodError,
     ParserWarning,
 )
-from pandas.util._decorators import (
-    Appender,
-    deprecate_kwarg,
-)
+from pandas.util._decorators import Appender
 from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (

@@ -152,14 +149,6 @@
     example of a valid callable argument would be ``lambda x: x.upper() in
     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
     parsing time and lower memory usage.
-mangle_dupe_cols : bool, default True
-    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
-    'X'...'X'. Passing in False will cause data to be overwritten if there
-    are duplicate names in the columns.
-
-    .. deprecated:: 1.5.0
-       Not implemented, and a new argument to specify the pattern for the
-       names of duplicated columns will be added instead
 dtype : Type name or dict of column -> type, optional
     Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
     'c': 'Int64'}}

@@ -604,7 +593,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -661,7 +649,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -718,7 +705,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -775,7 +761,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -821,7 +806,6 @@ def read_csv(
     ...


-@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_csv",

@@ -842,7 +826,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
     index_col: IndexLabel | Literal[False] | None = None,
     usecols=None,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,

@@ -923,7 +906,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -980,7 +962,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1037,7 +1018,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1094,7 +1074,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1140,7 +1119,6 @@ def read_table(
     ...


-@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_table",

@@ -1161,7 +1139,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
     index_col: IndexLabel | Literal[False] | None = None,
     usecols=None,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,

@@ -1406,9 +1383,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
                     f"The {repr(argname)} option is not supported with the "
                     f"'pyarrow' engine"
                 )
-            if argname == "mangle_dupe_cols" and value is False:
-                # GH12935
-                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
             options[argname] = value

         for argname, default in _c_parser_defaults.items():

pandas/tests/io/parser/test_mangle_dupes.py
+2 -13

@@ -14,22 +14,11 @@


 @skip_pyarrow
-@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
-def test_basic(all_parsers, kwargs):
-    # TODO: add test for condition "mangle_dupe_cols=False"
-    # once it is actually supported (gh-12935)
+def test_basic(all_parsers):
     parser = all_parsers

     data = "a,a,b,b,b\n1,2,3,4,5"
-    if "mangle_dupe_cols" in kwargs:
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="the 'mangle_dupe_cols' keyword is deprecated",
-            check_stacklevel=False,
-        ):
-            result = parser.read_csv(StringIO(data), sep=",", **kwargs)
-    else:
-        result = parser.read_csv(StringIO(data), sep=",", **kwargs)
+    result = parser.read_csv(StringIO(data), sep=",")

     expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
     tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_unsupported.py
+2 -6

@@ -34,14 +34,10 @@ class TestUnsupportedFeatures:
     def test_mangle_dupe_cols_false(self):
         # see gh-12935
         data = "a b c\n1 2 3"
-        msg = "is not supported"

         for engine in ("c", "python"):
-            with tm.assert_produces_warning(
-                FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated"
-            ):
-                with pytest.raises(ValueError, match=msg):
-                    read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False)
+            with pytest.raises(TypeError, match="unexpected keyword"):
+                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

     def test_c_engine(self):
         # see gh-6607
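As the updated test asserts, the removed keyword is now rejected like any other unknown argument. A small sketch of the new failure mode, assuming pandas 2.0; the exact error text may differ:

from io import StringIO

import pandas as pd

try:
    pd.read_csv(StringIO("a b c\n1 2 3"), mangle_dupe_cols=True)
except TypeError as err:
    # e.g. "read_csv() got an unexpected keyword argument 'mangle_dupe_cols'"
    print(err)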
