
Commit 4d9145f

DEP: Enforce deprecation of mangle_dupe_cols (#49977)
1 parent 284758d commit 4d9145f

File tree

10 files changed: +33 -99 lines

doc/source/user_guide/io.rst
+1 -15

@@ -155,15 +155,6 @@ usecols : list-like or callable, default ``None``
     when using the c engine. The Python engine loads the data first before deciding
     which columns to drop.
-mangle_dupe_cols : boolean, default ``True``
-    Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'.
-    Passing in ``False`` will cause data to be overwritten if there are duplicate
-    names in the columns.
-
-    .. deprecated:: 1.5.0
-       The argument was never implemented, and a new argument where the
-       renaming pattern can be specified will be added instead.
-
 General parsing configuration
 +++++++++++++++++++++++++++++

@@ -587,10 +578,6 @@ If the header is in a row other than the first, pass the row number to
 Duplicate names parsing
 '''''''''''''''''''''''

-.. deprecated:: 1.5.0
-   ``mangle_dupe_cols`` was never implemented, and a new argument where the
-   renaming pattern can be specified will be added instead.
-
 If the file or header contains duplicate names, pandas will by default
 distinguish between them so as to prevent overwriting data:

@@ -599,8 +586,7 @@ distinguish between them so as to prevent overwriting data:
    data = "a,b,a\n0,1,2\n3,4,5"
    pd.read_csv(StringIO(data))

-There is no more duplicate data because ``mangle_dupe_cols=True`` by default,
-which modifies a series of duplicate columns 'X', ..., 'X' to become
+There is no more duplicate data because duplicate columns 'X', ..., 'X' become
 'X', 'X.1', ..., 'X.N'.

 .. _io.usecols:
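Not part of the diff: a minimal usage sketch of the documented behaviour after this change, under the assumption of pandas 2.0, where duplicate column names are always mangled on read and there is no keyword to opt out. Data below is illustrative only.

from io import StringIO

import pandas as pd

# The duplicate header "a" is renamed to "a.1", so no column data is overwritten.
df = pd.read_csv(StringIO("a,b,a\n0,1,2\n3,4,5"))
print(list(df.columns))  # ['a', 'b', 'a.1']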

doc/source/whatsnew/v2.0.0.rst
+1

@@ -434,6 +434,7 @@ Removal of prior version deprecations/changes
 - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
 - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
 - Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
+- Remove keyword ``mangle_dupe_cols`` from :func:`read_csv` and :func:`read_table` (:issue:`48137`)
 - Removed ``errors`` keyword from :meth:`DataFrame.where`, :meth:`Series.where`, :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`47728`)
 - Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
 - Disallow passing non-keyword arguments to :meth:`DataFrame.drop` and :meth:`Series.drop` except ``labels`` (:issue:`41486`)

pandas/_libs/parsers.pyi
-1

@@ -58,7 +58,6 @@ class TextReader:
         skiprows=...,
         skipfooter: int = ...,  # int64_t
         verbose: bool = ...,
-        mangle_dupe_cols: bool = ...,
         float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
         skip_blank_lines: bool = ...,
         encoding_errors: bytes | str = ...,

pandas/_libs/parsers.pyx
+2 -5

@@ -317,7 +317,7 @@ cdef class TextReader:
         object handle
         object orig_header
         bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
-        bint mangle_dupe_cols, allow_leading_cols
+        bint allow_leading_cols
         uint64_t parser_start  # this is modified after __init__
         list clocks
         const char *encoding_errors

@@ -373,7 +373,6 @@ cdef class TextReader:
         skiprows=None,
         skipfooter=0,  # int64_t
         bint verbose=False,
-        bint mangle_dupe_cols=True,
         float_precision=None,
         bint skip_blank_lines=True,
         encoding_errors=b"strict",

@@ -390,8 +389,6 @@ cdef class TextReader:
         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize

-        self.mangle_dupe_cols = mangle_dupe_cols
-
         # For timekeeping
         self.clocks = []

@@ -680,7 +677,7 @@ cdef class TextReader:

                 this_header.append(name)

-            if not self.has_mi_columns and self.mangle_dupe_cols:
+            if not self.has_mi_columns:
                 # Ensure that regular columns are used before unnamed ones
                 # to keep given names and mangle unnamed columns
                 col_loop_order = [i for i in range(len(this_header))

pandas/io/parsers/base_parser.py
+18 -26

@@ -125,7 +125,6 @@ def __init__(self, kwds) -> None:

         self.true_values = kwds.get("true_values")
         self.false_values = kwds.get("false_values")
-        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
         self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
         self.cache_dates = kwds.pop("cache_dates", True)

@@ -333,34 +332,28 @@ def extract(r):
         return names, index_names, col_names, passed_names

     @final
-    def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
-        # see gh-7160 and gh-9424: this helps to provide
-        # immediate alleviation of the duplicate names
-        # issue and appears to be satisfactory to users,
-        # but ultimately, not needing to butcher the names
-        # would be nice!
-        if self.mangle_dupe_cols:
-            names = list(names)  # so we can index
-            counts: DefaultDict[Hashable, int] = defaultdict(int)
-            is_potential_mi = _is_potential_multi_index(names, self.index_col)
-
-            for i, col in enumerate(names):
-                cur_count = counts[col]
-
-                while cur_count > 0:
-                    counts[col] = cur_count + 1
+    def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
+        names = list(names)  # so we can index
+        counts: DefaultDict[Hashable, int] = defaultdict(int)
+        is_potential_mi = _is_potential_multi_index(names, self.index_col)

-                    if is_potential_mi:
-                        # for mypy
-                        assert isinstance(col, tuple)
-                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
-                    else:
-                        col = f"{col}.{cur_count}"
-                    cur_count = counts[col]
+        for i, col in enumerate(names):
+            cur_count = counts[col]

-                names[i] = col
+            while cur_count > 0:
                 counts[col] = cur_count + 1

+                if is_potential_mi:
+                    # for mypy
+                    assert isinstance(col, tuple)
+                    col = col[:-1] + (f"{col[-1]}.{cur_count}",)
+                else:
+                    col = f"{col}.{cur_count}"
+                cur_count = counts[col]
+
+            names[i] = col
+            counts[col] = cur_count + 1
+
         return names

     @final

@@ -1182,7 +1175,6 @@ def converter(*date_cols):
     "verbose": False,
     "encoding": None,
     "compression": None,
-    "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",

pandas/io/parsers/c_parser_wrapper.py
+3 -3

@@ -227,7 +227,7 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
-                names = self._maybe_dedup_names(self.orig_names)
+                names = self._dedup_names(self.orig_names)
                 index, columns, col_dict = self._get_empty_meta(
                     names,
                     self.index_col,

@@ -281,7 +281,7 @@ def read(
             if self.usecols is not None:
                 names = self._filter_usecols(names)

-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)

             # rename dict keys
             data_tups = sorted(data.items())

@@ -303,7 +303,7 @@ def read(
             # assert for mypy, orig_names is List or None, None would error in list(...)
             assert self.orig_names is not None
             names = list(self.orig_names)
-            names = self._maybe_dedup_names(names)
+            names = self._dedup_names(names)

             if self.usecols is not None:
                 names = self._filter_usecols(names)

pandas/io/parsers/python_parser.py
+3 -3

@@ -259,7 +259,7 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            names = self._maybe_dedup_names(self.orig_names)
+            names = self._dedup_names(self.orig_names)
             # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,

@@ -293,7 +293,7 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        names = self._maybe_dedup_names(self.orig_names)
+        names = self._dedup_names(self.orig_names)

         offset = 0
         if self._implicit_index:

@@ -424,7 +424,7 @@ def _infer_columns(
                     else:
                         this_columns.append(c)

-                if not have_mi_columns and self.mangle_dupe_cols:
+                if not have_mi_columns:
                     counts: DefaultDict = defaultdict(int)
                     # Ensure that regular columns are used before unnamed ones
                     # to keep given names and mangle unnamed columns

pandas/io/parsers/readers.py
+1 -27

@@ -41,10 +41,7 @@
     AbstractMethodError,
     ParserWarning,
 )
-from pandas.util._decorators import (
-    Appender,
-    deprecate_kwarg,
-)
+from pandas.util._decorators import Appender
 from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (

@@ -152,14 +149,6 @@
     example of a valid callable argument would be ``lambda x: x.upper() in
     ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
     parsing time and lower memory usage.
-mangle_dupe_cols : bool, default True
-    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
-    'X'...'X'. Passing in False will cause data to be overwritten if there
-    are duplicate names in the columns.
-
-    .. deprecated:: 1.5.0
-       Not implemented, and a new argument to specify the pattern for the
-       names of duplicated columns will be added instead
 dtype : Type name or dict of column -> type, optional
     Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
     'c': 'Int64'}}

@@ -604,7 +593,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -661,7 +649,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -718,7 +705,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -775,7 +761,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -821,7 +806,6 @@ def read_csv(
     ...


-@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_csv",

@@ -842,7 +826,6 @@ def read_csv(
     names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
     index_col: IndexLabel | Literal[False] | None = None,
     usecols=None,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,

@@ -923,7 +906,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -980,7 +962,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1037,7 +1018,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1094,7 +1074,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = ...,
     index_col: IndexLabel | Literal[False] | None = ...,
     usecols=...,
-    mangle_dupe_cols: bool = ...,
     dtype: DtypeArg | None = ...,
     engine: CSVEngine | None = ...,
     converters=...,

@@ -1140,7 +1119,6 @@ def read_table(
     ...


-@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
 @Appender(
     _doc_read_csv_and_table.format(
         func_name="read_table",

@@ -1161,7 +1139,6 @@ def read_table(
     names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
     index_col: IndexLabel | Literal[False] | None = None,
     usecols=None,
-    mangle_dupe_cols: bool = True,
     # General Parsing Configuration
     dtype: DtypeArg | None = None,
     engine: CSVEngine | None = None,

@@ -1406,9 +1383,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
                     f"The {repr(argname)} option is not supported with the "
                     f"'pyarrow' engine"
                 )
-            if argname == "mangle_dupe_cols" and value is False:
-                # GH12935
-                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
             options[argname] = value

         for argname, default in _c_parser_defaults.items():

pandas/tests/io/parser/test_mangle_dupes.py
+2 -13

@@ -14,22 +14,11 @@


 @skip_pyarrow
-@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
-def test_basic(all_parsers, kwargs):
-    # TODO: add test for condition "mangle_dupe_cols=False"
-    # once it is actually supported (gh-12935)
+def test_basic(all_parsers):
     parser = all_parsers

     data = "a,a,b,b,b\n1,2,3,4,5"
-    if "mangle_dupe_cols" in kwargs:
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="the 'mangle_dupe_cols' keyword is deprecated",
-            check_stacklevel=False,
-        ):
-            result = parser.read_csv(StringIO(data), sep=",", **kwargs)
-    else:
-        result = parser.read_csv(StringIO(data), sep=",", **kwargs)
+    result = parser.read_csv(StringIO(data), sep=",")

     expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
     tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_unsupported.py
+2 -6

@@ -34,14 +34,10 @@ class TestUnsupportedFeatures:
     def test_mangle_dupe_cols_false(self):
         # see gh-12935
         data = "a b c\n1 2 3"
-        msg = "is not supported"

         for engine in ("c", "python"):
-            with tm.assert_produces_warning(
-                FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated"
-            ):
-                with pytest.raises(ValueError, match=msg):
-                    read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False)
+            with pytest.raises(TypeError, match="unexpected keyword"):
+                read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

     def test_c_engine(self):
         # see gh-6607
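As the updated test asserts, the removed keyword is now rejected like any other unknown argument. A small sketch of the new failure mode, assuming pandas 2.0; the exact error text may differ:

from io import StringIO

import pandas as pd

try:
    pd.read_csv(StringIO("a b c\n1 2 3"), mangle_dupe_cols=True)
except TypeError as err:
    # e.g. "read_csv() got an unexpected keyword argument 'mangle_dupe_cols'"
    print(err)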
