Skip to content

Commit 76792f1

Browse files
authored
Bug in read_csv and read_excel not applying dtype to second col with dup cols (#41411)
1 parent 9a6c13d commit 76792f1

10 files changed

+42
-8
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -840,6 +840,7 @@ I/O
840840
- Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`)
841841
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
842842
- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`)
843+
- Bug in :func:`read_csv` and :func:`read_excel` not respecting ``dtype`` for duplicated column names when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`)
843844
- Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`)
844845
- Bug in the conversion from pyarrow to pandas (e.g. for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`)
845846

pandas/_libs/parsers.pyx

+11-4
Original file line numberDiff line numberDiff line change
@@ -685,10 +685,17 @@ cdef class TextReader:
685685
count = counts.get(name, 0)
686686

687687
if not self.has_mi_columns and self.mangle_dupe_cols:
688-
while count > 0:
689-
counts[name] = count + 1
690-
name = f'{name}.{count}'
691-
count = counts.get(name, 0)
688+
if count > 0:
689+
while count > 0:
690+
counts[name] = count + 1
691+
name = f'{name}.{count}'
692+
count = counts.get(name, 0)
693+
if (
694+
self.dtype is not None
695+
and self.dtype.get(old_name) is not None
696+
and self.dtype.get(name) is None
697+
):
698+
self.dtype.update({name: self.dtype.get(old_name)})
692699

693700
if old_name == '':
694701
unnamed_cols.add(name)

pandas/io/parsers/python_parser.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -421,12 +421,20 @@ def _infer_columns(self):
421421
counts: DefaultDict = defaultdict(int)
422422

423423
for i, col in enumerate(this_columns):
424+
old_col = col
424425
cur_count = counts[col]
425426

426-
while cur_count > 0:
427-
counts[col] = cur_count + 1
428-
col = f"{col}.{cur_count}"
429-
cur_count = counts[col]
427+
if cur_count > 0:
428+
while cur_count > 0:
429+
counts[col] = cur_count + 1
430+
col = f"{col}.{cur_count}"
431+
cur_count = counts[col]
432+
if (
433+
self.dtype is not None
434+
and self.dtype.get(old_col) is not None
435+
and self.dtype.get(col) is None
436+
):
437+
self.dtype.update({col: self.dtype.get(old_col)})
430438

431439
this_columns[i] = col
432440
counts[col] = cur_count + 1
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

pandas/tests/io/excel/test_readers.py

+8
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,14 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
555555
actual = pd.read_excel(basename + read_ext, dtype=dtype)
556556
tm.assert_frame_equal(actual, expected)
557557

558+
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
559+
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
560+
# GH#35211
561+
basename = "df_mangle_dup_col_dtypes"
562+
result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes})
563+
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
564+
tm.assert_frame_equal(result, expected)
565+
558566
def test_reader_spaces(self, read_ext):
559567
# see gh-32207
560568
basename = "test_spaces"

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+10
Original file line numberDiff line numberDiff line change
@@ -238,3 +238,13 @@ def test_true_values_cast_to_bool(all_parsers):
238238
)
239239
expected["a"] = expected["a"].astype("boolean")
240240
tm.assert_frame_equal(result, expected)
241+
242+
243+
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
244+
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
245+
# GH#35211
246+
parser = all_parsers
247+
data = """a,a\n1,1"""
248+
result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes})
249+
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
250+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)