From 4f595bafd0af2f76ea0f5cffe03d4ebbed155da9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 30 Nov 2022 22:42:07 +0100 Subject: [PATCH] DEP: Enforce deprecation of mangle_dupe_cols --- doc/source/user_guide/io.rst | 16 +------- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/parsers.pyi | 1 - pandas/_libs/parsers.pyx | 7 +--- pandas/io/parsers/base_parser.py | 44 +++++++++------------ pandas/io/parsers/c_parser_wrapper.py | 6 +-- pandas/io/parsers/python_parser.py | 6 +-- pandas/io/parsers/readers.py | 28 +------------ pandas/tests/io/parser/test_mangle_dupes.py | 15 +------ pandas/tests/io/parser/test_unsupported.py | 8 +--- 10 files changed, 33 insertions(+), 99 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 53bcf6ffd7a8a..a073087f6ec8f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -155,15 +155,6 @@ usecols : list-like or callable, default ``None`` when using the c engine. The Python engine loads the data first before deciding which columns to drop. -mangle_dupe_cols : boolean, default ``True`` - Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'. - Passing in ``False`` will cause data to be overwritten if there are duplicate - names in the columns. - - .. deprecated:: 1.5.0 - The argument was never implemented, and a new argument where the - renaming pattern can be specified will be added instead. - General parsing configuration +++++++++++++++++++++++++++++ @@ -587,10 +578,6 @@ If the header is in a row other than the first, pass the row number to Duplicate names parsing ''''''''''''''''''''''' - .. deprecated:: 1.5.0 - ``mangle_dupe_cols`` was never implemented, and a new argument where the - renaming pattern can be specified will be added instead. - If the file or header contains duplicate names, pandas will by default distinguish between them so as to prevent overwriting data: @@ -599,8 +586,7 @@ distinguish between them so as to prevent overwriting data: data = "a,b,a\n0,1,2\n3,4,5" pd.read_csv(StringIO(data)) -There is no more duplicate data because ``mangle_dupe_cols=True`` by default, -which modifies a series of duplicate columns 'X', ..., 'X' to become +There is no more duplicate data because duplicate columns 'X', ..., 'X' become 'X', 'X.1', ..., 'X.N'. .. _io.usecols: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1fb9a81e85a83..df82bcd37e971 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -434,6 +434,7 @@ Removal of prior version deprecations/changes - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`) - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`) - Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`) +- Remove keyword ``mangle_dupe_cols`` from :func:`read_csv` and :func:`read_table` (:issue:`48137`) - Removed ``errors`` keyword from :meth:`DataFrame.where`, :meth:`Series.where`, :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`47728`) - Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`) - Disallow passing non-keyword arguments to :meth:`DataFrame.drop` and :meth:`Series.drop` except ``labels`` (:issue:`41486`) diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index d888511916e27..60f5304c39ad9 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -58,7 +58,6 @@ class TextReader: skiprows=..., skipfooter: int = ..., # int64_t verbose: bool = ..., - mangle_dupe_cols: bool = ..., float_precision: Literal["round_trip", "legacy", "high"] | None = ..., skip_blank_lines: bool = ..., encoding_errors: bytes | str = ..., diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 92874ef201246..85d74e201d5bb 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -317,7 +317,7 @@ cdef class TextReader: object handle object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - bint mangle_dupe_cols, allow_leading_cols + bint allow_leading_cols uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors @@ -373,7 +373,6 @@ cdef class TextReader: skiprows=None, skipfooter=0, # int64_t bint verbose=False, - bint mangle_dupe_cols=True, float_precision=None, bint skip_blank_lines=True, encoding_errors=b"strict", @@ -390,8 +389,6 @@ cdef class TextReader: self.parser = parser_new() self.parser.chunksize = tokenize_chunksize - self.mangle_dupe_cols = mangle_dupe_cols - # For timekeeping self.clocks = [] @@ -680,7 +677,7 @@ cdef class TextReader: this_header.append(name) - if not self.has_mi_columns and self.mangle_dupe_cols: + if not self.has_mi_columns: # Ensure that regular columns are used before unnamed ones # to keep given names and mangle unnamed columns col_loop_order = [i for i in range(len(this_header)) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index b0f3754271894..7b9794dd434e6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -125,7 +125,6 @@ def __init__(self, kwds) -> None: self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") - self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) self.infer_datetime_format = kwds.pop("infer_datetime_format", False) self.cache_dates = kwds.pop("cache_dates", True) @@ -333,34 +332,28 @@ def extract(r): return names, index_names, col_names, passed_names @final - def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: - # see gh-7160 and gh-9424: this helps to provide - # immediate alleviation of the duplicate names - # issue and appears to be satisfactory to users, - # but ultimately, not needing to butcher the names - # would be nice! - if self.mangle_dupe_cols: - names = list(names) # so we can index - counts: DefaultDict[Hashable, int] = defaultdict(int) - is_potential_mi = _is_potential_multi_index(names, self.index_col) - - for i, col in enumerate(names): - cur_count = counts[col] - - while cur_count > 0: - counts[col] = cur_count + 1 + def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: + names = list(names) # so we can index + counts: DefaultDict[Hashable, int] = defaultdict(int) + is_potential_mi = _is_potential_multi_index(names, self.index_col) - if is_potential_mi: - # for mypy - assert isinstance(col, tuple) - col = col[:-1] + (f"{col[-1]}.{cur_count}",) - else: - col = f"{col}.{cur_count}" - cur_count = counts[col] + for i, col in enumerate(names): + cur_count = counts[col] - names[i] = col + while cur_count > 0: counts[col] = cur_count + 1 + if is_potential_mi: + # for mypy + assert isinstance(col, tuple) + col = col[:-1] + (f"{col[-1]}.{cur_count}",) + else: + col = f"{col}.{cur_count}" + cur_count = counts[col] + + names[i] = col + counts[col] = cur_count + 1 + return names @final @@ -1182,7 +1175,6 @@ def converter(*date_cols): "verbose": False, "encoding": None, "compression": None, - "mangle_dupe_cols": True, "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index c1f2e6ddb2388..e0daf157d3d3a 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -227,7 +227,7 @@ def read( except StopIteration: if self._first_chunk: self._first_chunk = False - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) index, columns, col_dict = self._get_empty_meta( names, self.index_col, @@ -281,7 +281,7 @@ def read( if self.usecols is not None: names = self._filter_usecols(names) - names = self._maybe_dedup_names(names) + names = self._dedup_names(names) # rename dict keys data_tups = sorted(data.items()) @@ -303,7 +303,7 @@ def read( # assert for mypy, orig_names is List or None, None would error in list(...) assert self.orig_names is not None names = list(self.orig_names) - names = self._maybe_dedup_names(names) + names = self._dedup_names(names) if self.usecols is not None: names = self._filter_usecols(names) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 121c52ba1c323..aebf285e669bb 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -259,7 +259,7 @@ def read( columns: Sequence[Hashable] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) # error: Cannot determine type of 'index_col' index, columns, col_dict = self._get_empty_meta( names, @@ -293,7 +293,7 @@ def _exclude_implicit_index( self, alldata: list[np.ndarray], ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]: - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) offset = 0 if self._implicit_index: @@ -424,7 +424,7 @@ def _infer_columns( else: this_columns.append(c) - if not have_mi_columns and self.mangle_dupe_cols: + if not have_mi_columns: counts: DefaultDict = defaultdict(int) # Ensure that regular columns are used before unnamed ones # to keep given names and mangle unnamed columns diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 575390e9b97a4..d9c2403a19d0c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -41,10 +41,7 @@ AbstractMethodError, ParserWarning, ) -from pandas.util._decorators import ( - Appender, - deprecate_kwarg, -) +from pandas.util._decorators import Appender from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -152,14 +149,6 @@ example of a valid callable argument would be ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster parsing time and lower memory usage. -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. - - .. deprecated:: 1.5.0 - Not implemented, and a new argument to specify the pattern for the - names of duplicated columns will be added instead dtype : Type name or dict of column -> type, optional Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 'c': 'Int64'}} @@ -604,7 +593,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -661,7 +649,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -718,7 +705,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -775,7 +761,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -821,7 +806,6 @@ def read_csv( ... -@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None) @Appender( _doc_read_csv_and_table.format( func_name="read_csv", @@ -842,7 +826,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, usecols=None, - mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -923,7 +906,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -980,7 +962,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1037,7 +1018,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1094,7 +1074,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1140,7 +1119,6 @@ def read_table( ... -@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None) @Appender( _doc_read_csv_and_table.format( func_name="read_table", @@ -1161,7 +1139,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, usecols=None, - mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1406,9 +1383,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - if argname == "mangle_dupe_cols" and value is False: - # GH12935 - raise ValueError("Setting mangle_dupe_cols=False is not supported yet") options[argname] = value for argname, default in _c_parser_defaults.items(): diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 13b419c3390fc..5709e7e4027e8 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -14,22 +14,11 @@ @skip_pyarrow -@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) -def test_basic(all_parsers, kwargs): - # TODO: add test for condition "mangle_dupe_cols=False" - # once it is actually supported (gh-12935) +def test_basic(all_parsers): parser = all_parsers data = "a,a,b,b,b\n1,2,3,4,5" - if "mangle_dupe_cols" in kwargs: - with tm.assert_produces_warning( - FutureWarning, - match="the 'mangle_dupe_cols' keyword is deprecated", - check_stacklevel=False, - ): - result = parser.read_csv(StringIO(data), sep=",", **kwargs) - else: - result = parser.read_csv(StringIO(data), sep=",", **kwargs) + result = parser.read_csv(StringIO(data), sep=",") expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 578cea44a8ed6..185dc733df3c2 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -34,14 +34,10 @@ class TestUnsupportedFeatures: def test_mangle_dupe_cols_false(self): # see gh-12935 data = "a b c\n1 2 3" - msg = "is not supported" for engine in ("c", "python"): - with tm.assert_produces_warning( - FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated" - ): - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) + with pytest.raises(TypeError, match="unexpected keyword"): + read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True) def test_c_engine(self): # see gh-6607