Skip to content

read_excel() modifies the provided dtype dict when reading a file with duplicate columns #42508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
- Bug in :meth:`pandas.read_excel` where the ``dtype`` dictionary passed by the caller was modified in place when reading a file with duplicate columns (:issue:`42462`)
- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`)
- :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`)

Expand Down
4 changes: 2 additions & 2 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
abc,
defaultdict,
)
from copy import copy
import csv
from io import StringIO
import re
Expand Down Expand Up @@ -81,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds):
self.verbose = kwds["verbose"]
self.converters = kwds["converters"]

self.dtype = kwds["dtype"]
self.dtype = copy(kwds["dtype"])
self.thousands = kwds["thousands"]
self.decimal = kwds["decimal"]

Expand Down Expand Up @@ -432,7 +433,6 @@ def _infer_columns(self):
and self.dtype.get(col) is None
):
self.dtype.update({col: self.dtype.get(old_col)})

this_columns[i] = col
counts[col] = cur_count + 1
elif have_mi_columns:
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,12 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
# GH#35211
basename = "df_mangle_dup_col_dtypes"
result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes})
dtype_dict = {"a": str, **dtypes}
dtype_dict_copy = dtype_dict.copy()
# GH#42462
result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed"
tm.assert_frame_equal(result, expected)

def test_reader_spaces(self, read_ext):
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,12 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
# GH#35211
parser = all_parsers
data = """a,a\n1,1"""
result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes})
dtype_dict = {"a": str, **dtypes}
# GH#42462
dtype_dict_copy = dtype_dict.copy()
result = parser.read_csv(StringIO(data), dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed"
tm.assert_frame_equal(result, expected)


Expand Down