Skip to content

BUG: #57954 encoding ignored for filelike #57968

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
11 changes: 11 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1310,6 +1310,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
raise ValueError(
"The 'python' engine cannot iterate through this file buffer."
)
if hasattr(f, "encoding"):
file_encoding = f.encoding
orig_reader_enc = self.orig_options.get("encoding", None)
any_none = file_encoding is None or orig_reader_enc is None
if file_encoding != orig_reader_enc and not any_none:
file_path = getattr(f, "name", None)
raise ValueError(
f"The specified reader encoding {orig_reader_enc} is different "
f"from the encoding {file_encoding} of file {file_path}."
)

def _clean_options(
self, options: dict[str, Any], engine: CSVEngine
Expand Down Expand Up @@ -1485,6 +1495,7 @@ def _make_engine(
"pyarrow": ArrowParserWrapper,
"python-fwf": FixedWidthFieldParser,
}

if engine not in mapping:
raise ValueError(
f"Unknown engine: {engine} (valid options are {mapping.keys()})"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ def __next__(self):
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
# see gh-22748
t = BytesIO(b"\xb0")
- t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
+ t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape")
msg = "'utf-8' codec can't encode character"
with pytest.raises(UnicodeError, match=msg):
c_parser_only.read_csv(t, encoding="UTF-8")
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ def test_StringIO(self, csv_path):
reader = TextReader(src, header=None)
reader.read()

def test_encoding_mismatch_warning(self, csv_path):
    # GH-57954
    # The handle is opened as UTF-8 text while the reader is asked for
    # latin1; the conflicting request is expected to raise ValueError.
    expected_msg = "latin1 is different from the encoding"
    with open(csv_path, encoding="UTF-8") as handle:
        with pytest.raises(ValueError, match=expected_msg):
            read_csv(handle, encoding="latin1")

def test_string_factorize(self):
# should this be optional?
data = "a\nb\na\nb\na"
Expand Down
Loading