Skip to content

DEP: Enforce deprecation of mangle_dupe_cols #49977

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged — 1 commit merged on Nov 30, 2022
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 1 addition & 15 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,15 +155,6 @@ usecols : list-like or callable, default ``None``
when using the c engine. The Python engine loads the data first before deciding
which columns to drop.

mangle_dupe_cols : boolean, default ``True``
Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'.
Passing in ``False`` will cause data to be overwritten if there are duplicate
names in the columns.

.. deprecated:: 1.5.0
The argument was never implemented, and a new argument where the
renaming pattern can be specified will be added instead.

General parsing configuration
+++++++++++++++++++++++++++++

Expand Down Expand Up @@ -587,10 +578,6 @@ If the header is in a row other than the first, pass the row number to
Duplicate names parsing
'''''''''''''''''''''''

.. deprecated:: 1.5.0
``mangle_dupe_cols`` was never implemented, and a new argument where the
renaming pattern can be specified will be added instead.

If the file or header contains duplicate names, pandas will by default
distinguish between them so as to prevent overwriting data:

Expand All @@ -599,8 +586,7 @@ distinguish between them so as to prevent overwriting data:
data = "a,b,a\n0,1,2\n3,4,5"
pd.read_csv(StringIO(data))

There is no more duplicate data because ``mangle_dupe_cols=True`` by default,
which modifies a series of duplicate columns 'X', ..., 'X' to become
There is no more duplicate data because duplicate columns 'X', ..., 'X' become
'X', 'X.1', ..., 'X.N'.

.. _io.usecols:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ Removal of prior version deprecations/changes
- Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
- Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
- Remove keywords ``convert_float`` and ``mangle_dupe_cols`` from :func:`read_excel` (:issue:`41176`)
- Remove keyword ``mangle_dupe_cols`` from :func:`read_csv` and :func:`read_table` (:issue:`48137`)
- Removed ``errors`` keyword from :meth:`DataFrame.where`, :meth:`Series.where`, :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`47728`)
- Disallow passing non-keyword arguments to :func:`read_excel` except ``io`` and ``sheet_name`` (:issue:`34418`)
- Disallow passing non-keyword arguments to :meth:`DataFrame.drop` and :meth:`Series.drop` except ``labels`` (:issue:`41486`)
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/parsers.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ class TextReader:
skiprows=...,
skipfooter: int = ..., # int64_t
verbose: bool = ...,
mangle_dupe_cols: bool = ...,
float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
skip_blank_lines: bool = ...,
encoding_errors: bytes | str = ...,
Expand Down
7 changes: 2 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ cdef class TextReader:
object handle
object orig_header
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
bint mangle_dupe_cols, allow_leading_cols
bint allow_leading_cols
uint64_t parser_start # this is modified after __init__
list clocks
const char *encoding_errors
Expand Down Expand Up @@ -373,7 +373,6 @@ cdef class TextReader:
skiprows=None,
skipfooter=0, # int64_t
bint verbose=False,
bint mangle_dupe_cols=True,
float_precision=None,
bint skip_blank_lines=True,
encoding_errors=b"strict",
Expand All @@ -390,8 +389,6 @@ cdef class TextReader:
self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize

self.mangle_dupe_cols = mangle_dupe_cols

# For timekeeping
self.clocks = []

Expand Down Expand Up @@ -680,7 +677,7 @@ cdef class TextReader:

this_header.append(name)

if not self.has_mi_columns and self.mangle_dupe_cols:
if not self.has_mi_columns:
# Ensure that regular columns are used before unnamed ones
# to keep given names and mangle unnamed columns
col_loop_order = [i for i in range(len(this_header))
Expand Down
44 changes: 18 additions & 26 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def __init__(self, kwds) -> None:

self.true_values = kwds.get("true_values")
self.false_values = kwds.get("false_values")
self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
self.cache_dates = kwds.pop("cache_dates", True)

Expand Down Expand Up @@ -333,34 +332,28 @@ def extract(r):
return names, index_names, col_names, passed_names

@final
def _maybe_dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
# see gh-7160 and gh-9424: this helps to provide
# immediate alleviation of the duplicate names
# issue and appears to be satisfactory to users,
# but ultimately, not needing to butcher the names
# would be nice!
if self.mangle_dupe_cols:
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

for i, col in enumerate(names):
cur_count = counts[col]

while cur_count > 0:
counts[col] = cur_count + 1
def _dedup_names(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
names = list(names) # so we can index
counts: DefaultDict[Hashable, int] = defaultdict(int)
is_potential_mi = _is_potential_multi_index(names, self.index_col)

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]
for i, col in enumerate(names):
cur_count = counts[col]

names[i] = col
while cur_count > 0:
counts[col] = cur_count + 1

if is_potential_mi:
# for mypy
assert isinstance(col, tuple)
col = col[:-1] + (f"{col[-1]}.{cur_count}",)
else:
col = f"{col}.{cur_count}"
cur_count = counts[col]

names[i] = col
counts[col] = cur_count + 1

return names

@final
Expand Down Expand Up @@ -1182,7 +1175,6 @@ def converter(*date_cols):
"verbose": False,
"encoding": None,
"compression": None,
"mangle_dupe_cols": True,
"infer_datetime_format": False,
"skip_blank_lines": True,
"encoding_errors": "strict",
Expand Down
6 changes: 3 additions & 3 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def read(
except StopIteration:
if self._first_chunk:
self._first_chunk = False
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)
index, columns, col_dict = self._get_empty_meta(
names,
self.index_col,
Expand Down Expand Up @@ -281,7 +281,7 @@ def read(
if self.usecols is not None:
names = self._filter_usecols(names)

names = self._maybe_dedup_names(names)
names = self._dedup_names(names)

# rename dict keys
data_tups = sorted(data.items())
Expand All @@ -303,7 +303,7 @@ def read(
# assert for mypy, orig_names is List or None, None would error in list(...)
assert self.orig_names is not None
names = list(self.orig_names)
names = self._maybe_dedup_names(names)
names = self._dedup_names(names)

if self.usecols is not None:
names = self._filter_usecols(names)
Expand Down
6 changes: 3 additions & 3 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def read(
columns: Sequence[Hashable] = list(self.orig_names)
if not len(content): # pragma: no cover
# DataFrame with the right metadata, even though it's length 0
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)
# error: Cannot determine type of 'index_col'
index, columns, col_dict = self._get_empty_meta(
names,
Expand Down Expand Up @@ -293,7 +293,7 @@ def _exclude_implicit_index(
self,
alldata: list[np.ndarray],
) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
names = self._maybe_dedup_names(self.orig_names)
names = self._dedup_names(self.orig_names)

offset = 0
if self._implicit_index:
Expand Down Expand Up @@ -424,7 +424,7 @@ def _infer_columns(
else:
this_columns.append(c)

if not have_mi_columns and self.mangle_dupe_cols:
if not have_mi_columns:
counts: DefaultDict = defaultdict(int)
# Ensure that regular columns are used before unnamed ones
# to keep given names and mangle unnamed columns
Expand Down
28 changes: 1 addition & 27 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@
AbstractMethodError,
ParserWarning,
)
from pandas.util._decorators import (
Appender,
deprecate_kwarg,
)
from pandas.util._decorators import Appender
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -152,14 +149,6 @@
example of a valid callable argument would be ``lambda x: x.upper() in
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
parsing time and lower memory usage.
mangle_dupe_cols : bool, default True
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
'X'...'X'. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.

.. deprecated:: 1.5.0
Not implemented, and a new argument to specify the pattern for the
names of duplicated columns will be added instead
dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Expand Down Expand Up @@ -604,7 +593,6 @@ def read_csv(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -661,7 +649,6 @@ def read_csv(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -718,7 +705,6 @@ def read_csv(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -775,7 +761,6 @@ def read_csv(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -821,7 +806,6 @@ def read_csv(
...


@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@Appender(
_doc_read_csv_and_table.format(
func_name="read_csv",
Expand All @@ -842,7 +826,6 @@ def read_csv(
names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
index_col: IndexLabel | Literal[False] | None = None,
usecols=None,
mangle_dupe_cols: bool = True,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
Expand Down Expand Up @@ -923,7 +906,6 @@ def read_table(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -980,7 +962,6 @@ def read_table(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1037,7 +1018,6 @@ def read_table(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1094,7 +1074,6 @@ def read_table(
names: Sequence[Hashable] | None | lib.NoDefault = ...,
index_col: IndexLabel | Literal[False] | None = ...,
usecols=...,
mangle_dupe_cols: bool = ...,
dtype: DtypeArg | None = ...,
engine: CSVEngine | None = ...,
converters=...,
Expand Down Expand Up @@ -1140,7 +1119,6 @@ def read_table(
...


@deprecate_kwarg(old_arg_name="mangle_dupe_cols", new_arg_name=None)
@Appender(
_doc_read_csv_and_table.format(
func_name="read_table",
Expand All @@ -1161,7 +1139,6 @@ def read_table(
names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
index_col: IndexLabel | Literal[False] | None = None,
usecols=None,
mangle_dupe_cols: bool = True,
# General Parsing Configuration
dtype: DtypeArg | None = None,
engine: CSVEngine | None = None,
Expand Down Expand Up @@ -1406,9 +1383,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
f"The {repr(argname)} option is not supported with the "
f"'pyarrow' engine"
)
if argname == "mangle_dupe_cols" and value is False:
# GH12935
raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
options[argname] = value

for argname, default in _c_parser_defaults.items():
Expand Down
15 changes: 2 additions & 13 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,11 @@


@skip_pyarrow
@pytest.mark.parametrize("kwargs", [{}, {"mangle_dupe_cols": True}])
def test_basic(all_parsers, kwargs):
# TODO: add test for condition "mangle_dupe_cols=False"
# once it is actually supported (gh-12935)
def test_basic(all_parsers):
parser = all_parsers

data = "a,a,b,b,b\n1,2,3,4,5"
if "mangle_dupe_cols" in kwargs:
with tm.assert_produces_warning(
FutureWarning,
match="the 'mangle_dupe_cols' keyword is deprecated",
check_stacklevel=False,
):
result = parser.read_csv(StringIO(data), sep=",", **kwargs)
else:
result = parser.read_csv(StringIO(data), sep=",", **kwargs)
result = parser.read_csv(StringIO(data), sep=",")

expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
tm.assert_frame_equal(result, expected)
Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/io/parser/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,10 @@ class TestUnsupportedFeatures:
def test_mangle_dupe_cols_false(self):
# see gh-12935
data = "a b c\n1 2 3"
msg = "is not supported"

for engine in ("c", "python"):
with tm.assert_produces_warning(
FutureWarning, match="the 'mangle_dupe_cols' keyword is deprecated"
):
with pytest.raises(ValueError, match=msg):
read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False)
with pytest.raises(TypeError, match="unexpected keyword"):
read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)

def test_c_engine(self):
# see gh-6607
Expand Down