Skip to content

read_excel() modifies provided types dict when accessing file with duplicate column #42508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ from pandas.core.dtypes.common import (
is_object_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.inference import is_dict_like

cdef:
float64_t INF = <float64_t>np.inf
Expand Down Expand Up @@ -688,13 +687,6 @@ cdef class TextReader:
counts[name] = count + 1
name = f'{name}.{count}'
count = counts.get(name, 0)
if (
self.dtype is not None
and is_dict_like(self.dtype)
and self.dtype.get(old_name) is not None
and self.dtype.get(name) is None
):
self.dtype.update({name: self.dtype.get(old_name)})

if old_name == '':
unnamed_cols.add(name)
Expand Down Expand Up @@ -989,6 +981,10 @@ cdef class TextReader:
col_dtype = self.dtype[name]
elif i in self.dtype:
col_dtype = self.dtype[i]
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umm, can we not leave the original and simply copy self.dtype?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I had done that originally (send a copy of dtype, so that the original is unaffected). But then you suggested to try and not modify the dict at all(#42508 (comment)) 😅 . Also @phofl had mentioned that this was a unwanted side effect of #41411. So, I tried to incorporate those comments.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please let me know if you want me to revert all the changes, and just pass a copy of dtype dictionary

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer keeping the mangle_dup_cols logic together, e.g. simply copying the dict would be the best here. If we spread this over the code base it will be harder to refactor

if isinstance(name, str) and name.split(".")[-1].isnumeric():
orig_name = ".".join(name.split(".")[:-1])
col_dtype = self.dtype.get(orig_name, None)
else:
if self.dtype.names:
# structured array
Expand Down
7 changes: 7 additions & 0 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,13 @@ def _convert_to_ndarrays(
conv_f = None if converters is None else converters.get(c, None)
if isinstance(dtypes, dict):
cast_type = dtypes.get(c, None)
if (
cast_type is None
and isinstance(c, str)
and c.split(".")[-1].isnumeric()
):
orig_c = ".".join(c.split(".")[:-1])
cast_type = dtypes.get(orig_c, None)
else:
# single dtype or None
cast_type = dtypes
Expand Down
9 changes: 0 additions & 9 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
)

from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import is_dict_like

from pandas.io.parsers.base_parser import (
ParserBase,
Expand Down Expand Up @@ -417,21 +416,13 @@ def _infer_columns(self):
counts: DefaultDict = defaultdict(int)

for i, col in enumerate(this_columns):
old_col = col
cur_count = counts[col]

if cur_count > 0:
while cur_count > 0:
counts[col] = cur_count + 1
col = f"{col}.{cur_count}"
cur_count = counts[col]
if (
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Same is done in the c parser, you should remove that too probably?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made the changes in c parser as well

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test failure Looks related

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that particular failure is resolved now. I am not getting the current failures in local.

self.dtype is not None
and is_dict_like(self.dtype)
and self.dtype.get(old_col) is not None
and self.dtype.get(col) is None
):
self.dtype.update({col: self.dtype.get(old_col)})

this_columns[i] = col
counts[col] = cur_count + 1
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,11 @@ def test_reader_dtype_str(self, read_ext, dtype, expected):
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
# GH#35211
basename = "df_mangle_dup_col_dtypes"
result = pd.read_excel(basename + read_ext, dtype={"a": str, **dtypes})
dtype_dict = {"a": str, **dtypes}
dtype_dict_copy = dtype_dict.copy()
result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed" # GH 42462
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move the issue reference to the start and remove the comment, same below

tm.assert_frame_equal(result, expected)

def test_reader_spaces(self, read_ext):
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,11 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
# GH#35211
parser = all_parsers
data = """a,a\n1,1"""
result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes})
dtype_dict = {"a": str, **dtypes}
dtype_dict_copy = dtype_dict.copy()
result = parser.read_csv(StringIO(data), dtype=dtype_dict)
expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
assert dtype_dict == dtype_dict_copy, "dtype dict changed" # GH 42462
tm.assert_frame_equal(result, expected)


Expand Down