diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index c716460e997d0..4e6ea85e2ff1d 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -30,6 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :meth:`pandas.read_excel` where the ``dtype`` dictionary passed by the caller was modified in place when reading a file with duplicate columns (:issue:`42462`) - 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) - :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 7c9fcde08bf24..af253fc062632 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,6 +4,7 @@ abc, defaultdict, ) +from copy import copy import csv from io import StringIO import re @@ -81,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): self.verbose = kwds["verbose"] self.converters = kwds["converters"] - self.dtype = kwds["dtype"] + self.dtype = copy(kwds["dtype"]) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -432,7 +433,6 @@ def _infer_columns(self): and self.dtype.get(col) is None ): self.dtype.update({col: self.dtype.get(old_col)}) - this_columns[i] = col counts[col] = cur_count + 1 elif have_mi_columns: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cbd241ceda0b1..f999733192725 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -576,8 +576,12 @@ def test_reader_dtype_str(self, read_ext, dtype, expected): def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211 basename = "df_mangle_dup_col_dtypes" - result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + dtype_dict_copy = dtype_dict.copy() + # GH#42462 + result = pd.read_excel(basename + 
read_ext, dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) def test_reader_spaces(self, read_ext): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 6ed52ed86af2a..32a7ac44c0b38 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -245,8 +245,12 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): # GH#35211 parser = all_parsers data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + # GH#42462 + dtype_dict_copy = dtype_dict.copy() + result = parser.read_csv(StringIO(data), dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected)