diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..d9f8bee3acdec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -837,6 +837,7 @@ I/O - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) +- Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for a duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. 
for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 8d9f1773590b0..0878aff562c12 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -685,10 +685,17 @@ cdef class TextReader: count = counts.get(name, 0) if not self.has_mi_columns and self.mangle_dupe_cols: - while count > 0: - counts[name] = count + 1 - name = f'{name}.{count}' - count = counts.get(name, 0) + if count > 0: + while count > 0: + counts[name] = count + 1 + name = f'{name}.{count}' + count = counts.get(name, 0) + if ( + self.dtype is not None + and self.dtype.get(old_name) is not None + and self.dtype.get(name) is None + ): + self.dtype.update({name: self.dtype.get(old_name)}) if old_name == '': unnamed_cols.add(name) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 0055f3123f3c0..1d70b6e59c51b 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -421,12 +421,20 @@ def _infer_columns(self): counts: DefaultDict = defaultdict(int) for i, col in enumerate(this_columns): + old_col = col cur_count = counts[col] - while cur_count > 0: - counts[col] = cur_count + 1 - col = f"{col}.{cur_count}" - cur_count = counts[col] + if cur_count > 0: + while cur_count > 0: + counts[col] = cur_count + 1 + col = f"{col}.{cur_count}" + cur_count = counts[col] + if ( + self.dtype is not None + and self.dtype.get(old_col) is not None + and self.dtype.get(col) is None + ): + self.dtype.update({col: self.dtype.get(old_col)}) this_columns[i] = col counts[col] = cur_count + 1 diff --git a/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.ods b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.ods new file mode 100644 index 0000000000000..66558c16319fc Binary files /dev/null and b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.ods differ diff --git 
a/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xls b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xls new file mode 100644 index 0000000000000..472ad75901286 Binary files /dev/null and b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xls differ diff --git a/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsb b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsb new file mode 100755 index 0000000000000..5052102c6655d Binary files /dev/null and b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsb differ diff --git a/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsm b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsm new file mode 100644 index 0000000000000..51edc7f94f9d8 Binary files /dev/null and b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsm differ diff --git a/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsx b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsx new file mode 100644 index 0000000000000..ec4e49add4233 Binary files /dev/null and b/pandas/tests/io/data/excel/df_mangle_dup_col_dtypes.xlsx differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 632de5f70f64a..a46cb70097bd8 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -555,6 +555,14 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) + @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) + def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): + # GH#35211 + basename = "df_mangle_dup_col_dtypes" + result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes}) + expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + tm.assert_frame_equal(result, expected) + def test_reader_spaces(self, read_ext): # see gh-32207 basename = "test_spaces" diff --git 
a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index e452159189d4a..59fd3de60e0bf 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -238,3 +238,13 @@ def test_true_values_cast_to_bool(all_parsers): ) expected["a"] = expected["a"].astype("boolean") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) +def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): + # GH#35211 + parser = all_parsers + data = """a,a\n1,1""" + result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) + expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + tm.assert_frame_equal(result, expected)