diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d794692de5005..c0cc985a42b87 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -548,6 +548,7 @@ Other API changes - Operations with :class:`Timestamp` or :class:`Timedelta` that would previously raise ``OverflowError`` instead raise ``OutOfBoundsDatetime`` or ``OutOfBoundsTimedelta`` where appropriate (:issue:`47268`) - When :func:`read_sas` previously returned ``None``, it now returns an empty :class:`DataFrame` (:issue:`47410`) - :class:`DataFrame` constructor raises if ``index`` or ``columns`` arguments are sets (:issue:`47215`) +- Removed the ``mangle_dupe_cols`` argument from :func:`read_csv`, :func:`read_table`, :func:`read_fwf` and :func:`read_excel`. The argument was never fully implemented and only supported the value ``True`` (:issue:`47718`) .. --------------------------------------------------------------------------- .. _whatsnew_150.deprecations: diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi index 6b0bbf183f07e..89f0580fec965 100644 --- a/pandas/_libs/parsers.pyi +++ b/pandas/_libs/parsers.pyi @@ -58,7 +58,6 @@ class TextReader: skiprows=..., skipfooter: int = ..., # int64_t verbose: bool = ..., - mangle_dupe_cols: bool = ..., float_precision: Literal["round_trip", "legacy", "high"] | None = ..., skip_blank_lines: bool = ..., encoding_errors: bytes | str = ..., diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b07fa143c98b6..cfeca8d3615be 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -312,7 +312,7 @@ cdef class TextReader: object handle object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - bint mangle_dupe_cols, allow_leading_cols + bint allow_leading_cols uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors @@ -367,7 +367,6 @@ cdef class TextReader: skiprows=None, skipfooter=0, # int64_t bint verbose=False, - bint 
mangle_dupe_cols=True, float_precision=None, bint skip_blank_lines=True, encoding_errors=b"strict"): @@ -383,8 +382,6 @@ cdef class TextReader: self.parser = parser_new() self.parser.chunksize = tokenize_chunksize - self.mangle_dupe_cols = mangle_dupe_cols - # For timekeeping self.clocks = [] @@ -672,7 +669,7 @@ cdef class TextReader: this_header.append(name) - if not self.has_mi_columns and self.mangle_dupe_cols: + if not self.has_mi_columns: # Ensure that regular columns are used before unnamed ones # to keep given names and mangle unnamed columns col_loop_order = [i for i in range(len(this_header)) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 44152f100d390..5310279256066 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -275,10 +275,6 @@ .. deprecated:: 1.3.0 convert_float will be removed in a future version -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. Passing in False will cause data to be overwritten if there - are duplicate names in the columns. {storage_options} .. versionadded:: 1.2.0 @@ -386,7 +382,6 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> DataFrame: ... @@ -425,7 +420,6 @@ def read_excel( comment: str | None = ..., skipfooter: int = ..., convert_float: bool | None = ..., - mangle_dupe_cols: bool = ..., storage_options: StorageOptions = ..., ) -> dict[IntStrT, DataFrame]: ... 
@@ -465,7 +459,6 @@ def read_excel( comment: str | None = None, skipfooter: int = 0, convert_float: bool | None = None, - mangle_dupe_cols: bool = True, storage_options: StorageOptions = None, ) -> DataFrame | dict[IntStrT, DataFrame]: @@ -504,7 +497,6 @@ def read_excel( comment=comment, skipfooter=skipfooter, convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, ) finally: # make sure to close opened file handles @@ -709,7 +701,6 @@ def parse( comment: str | None = None, skipfooter: int = 0, convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ): @@ -877,7 +868,6 @@ def parse( comment=comment, skipfooter=skipfooter, usecols=usecols, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) @@ -1686,7 +1676,6 @@ def parse( comment: str | None = None, skipfooter: int = 0, convert_float: bool | None = None, - mangle_dupe_cols: bool = True, **kwds, ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]: """ @@ -1719,7 +1708,6 @@ def parse( comment=comment, skipfooter=skipfooter, convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, **kwds, ) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 0e40e47bf7cb1..2a596ad2b549a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -112,7 +112,6 @@ def __init__(self, kwds) -> None: self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") - self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) self.infer_datetime_format = kwds.pop("infer_datetime_format", False) self.cache_dates = kwds.pop("cache_dates", True) @@ -325,33 +324,32 @@ def extract(r): return names, index_names, col_names, passed_names @final - def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: + def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]: # see gh-7160 and gh-9424: this helps to provide # immediate alleviation of the duplicate names # issue and appears to be 
satisfactory to users, # but ultimately, not needing to butcher the names # would be nice! - if self.mangle_dupe_cols: - names = list(names) # so we can index - counts: DefaultDict[Hashable, int] = defaultdict(int) - is_potential_mi = _is_potential_multi_index(names, self.index_col) + names = list(names) # so we can index + counts: DefaultDict[Hashable, int] = defaultdict(int) + is_potential_mi = _is_potential_multi_index(names, self.index_col) - for i, col in enumerate(names): - cur_count = counts[col] + for i, col in enumerate(names): + cur_count = counts[col] - while cur_count > 0: - counts[col] = cur_count + 1 + while cur_count > 0: + counts[col] = cur_count + 1 - if is_potential_mi: - # for mypy - assert isinstance(col, tuple) - col = col[:-1] + (f"{col[-1]}.{cur_count}",) - else: - col = f"{col}.{cur_count}" - cur_count = counts[col] + if is_potential_mi: + # for mypy + assert isinstance(col, tuple) + col = col[:-1] + (f"{col[-1]}.{cur_count}",) + else: + col = f"{col}.{cur_count}" + cur_count = counts[col] - names[i] = col - counts[col] = cur_count + 1 + names[i] = col + counts[col] = cur_count + 1 return names @@ -1135,7 +1133,6 @@ def converter(*date_cols): "encoding": None, "squeeze": None, "compression": None, - "mangle_dupe_cols": True, "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index aec999e40b0f5..773d9cc162aa6 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -248,7 +248,7 @@ def read( except StopIteration: if self._first_chunk: self._first_chunk = False - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) index, columns, col_dict = self._get_empty_meta( names, self.index_col, @@ -295,7 +295,7 @@ def read( if self.usecols is not None: names = self._filter_usecols(names) - names = self._maybe_dedup_names(names) + names = 
self._dedup_names(names) # rename dict keys data_tups = sorted(data.items()) @@ -317,7 +317,7 @@ def read( # assert for mypy, orig_names is List or None, None would error in list(...) assert self.orig_names is not None names = list(self.orig_names) - names = self._maybe_dedup_names(names) + names = self._dedup_names(names) if self.usecols is not None: names = self._filter_usecols(names) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 7c03a81dbc0e6..af9c0b069e1d4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -261,7 +261,7 @@ def read( columns: Sequence[Hashable] = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) # error: Cannot determine type of 'index_col' index, columns, col_dict = self._get_empty_meta( names, @@ -295,7 +295,7 @@ def _exclude_implicit_index( self, alldata: list[np.ndarray], ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]: - names = self._maybe_dedup_names(self.orig_names) + names = self._dedup_names(self.orig_names) offset = 0 if self._implicit_index: @@ -426,7 +426,7 @@ def _infer_columns( else: this_columns.append(c) - if not have_mi_columns and self.mangle_dupe_cols: + if not have_mi_columns: counts: DefaultDict = defaultdict(int) # Ensure that regular columns are used before unnamed ones # to keep given names and mangle unnamed columns diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index dc4556542d8e2..b2dcd6fdc091f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -158,10 +158,6 @@ .. deprecated:: 1.4.0 Use a list comprehension on the DataFrame's columns after calling ``read_csv``. -mangle_dupe_cols : bool, default True - Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than - 'X'...'X'. 
Passing in False will cause data to be overwritten if there - are duplicate names in the columns. dtype : Type name or dict of column -> type, optional Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 'c': 'Int64'}} @@ -618,7 +614,6 @@ def read_csv( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -678,7 +673,6 @@ def read_csv( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -738,7 +732,6 @@ def read_csv( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -798,7 +791,6 @@ def read_csv( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -867,7 +859,6 @@ def read_csv( usecols=None, squeeze: bool | None = None, prefix: str | lib.NoDefault = lib.no_default, - mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -956,7 +947,6 @@ def read_table( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1016,7 +1006,6 @@ def read_table( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1076,7 +1065,6 @@ def read_table( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: 
CSVEngine | None = ..., converters=..., @@ -1136,7 +1124,6 @@ def read_table( usecols=..., squeeze: bool | None = ..., prefix: str | lib.NoDefault = ..., - mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters=..., @@ -1205,7 +1192,6 @@ def read_table( usecols=None, squeeze: bool | None = None, prefix: str | lib.NoDefault = lib.no_default, - mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1468,9 +1454,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" ) - elif argname == "mangle_dupe_cols" and value is False: - # GH12935 - raise ValueError("Setting mangle_dupe_cols=False is not supported yet") else: options[argname] = value diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index ba6366b71d854..9708294ec1e68 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -975,12 +975,6 @@ def test_duplicated_columns(self, path): result = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(result, expected) - # Explicitly, we pass in the parameter. - result = pd.read_excel( - path, sheet_name="test1", index_col=0, mangle_dupe_cols=True - ) - tm.assert_frame_equal(result, expected) - # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(path, "test1") @@ -998,10 +992,6 @@ def test_duplicated_columns(self, path): expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) - msg = "Setting mangle_dupe_cols=False is not supported yet" - with pytest.raises(ValueError, match=msg): - pd.read_excel(path, sheet_name="test1", header=None, mangle_dupe_cols=False) - def test_swapped_columns(self, path): # Test for issue #5427. 
write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 3f7b1b5dfa19b..5709e7e4027e8 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -14,14 +14,11 @@ @skip_pyarrow -@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}]) -def test_basic(all_parsers, kwargs): - # TODO: add test for condition "mangle_dupe_cols=False" - # once it is actually supported (gh-12935) +def test_basic(all_parsers): parser = all_parsers data = "a,a,b,b,b\n1,2,3,4,5" - result = parser.read_csv(StringIO(data), sep=",", **kwargs) + result = parser.read_csv(StringIO(data), sep=",") expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 2f28697daf9e2..e99387be4eee2 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -31,15 +31,6 @@ def python_engine(request): class TestUnsupportedFeatures: - def test_mangle_dupe_cols_false(self): - # see gh-12935 - data = "a b c\n1 2 3" - msg = "is not supported" - - for engine in ("c", "python"): - with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) - def test_c_engine(self): # see gh-6607 data = "a b c\n1 2 3"