diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index b234a6b78e051..7ecd8cd6d5012 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -1310,6 +1310,29 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
             raise ValueError(
                 "The 'python' engine cannot iterate through this file buffer."
             )
+        file_encoding = getattr(f, "encoding", None)
+        orig_reader_enc = self.orig_options.get("encoding", None)
+        if file_encoding is not None and orig_reader_enc is not None:
+            # Compare canonical codec names rather than raw strings so that
+            # aliases and case differences ("UTF-8", "utf-8", "utf8") of the
+            # same codec are not reported as a mismatch.
+            import codecs
+
+            try:
+                same_codec = (
+                    codecs.lookup(file_encoding).name
+                    == codecs.lookup(orig_reader_enc).name
+                )
+            except (LookupError, TypeError):
+                # Unknown or non-string codec name: fall back to a
+                # case-insensitive comparison of the raw values.
+                same_codec = str(file_encoding).lower() == str(orig_reader_enc).lower()
+            if not same_codec:
+                file_path = getattr(f, "name", None)
+                raise ValueError(
+                    f"The specified reader encoding {orig_reader_enc} is different "
+                    f"from the encoding {file_encoding} of file {file_path}."
+                )
 
     def _clean_options(
         self, options: dict[str, Any], engine: CSVEngine
@@ -1485,6 +1508,7 @@ def _make_engine(
             "pyarrow": ArrowParserWrapper,
             "python-fwf": FixedWidthFieldParser,
         }
+
         if engine not in mapping:
             raise ValueError(
                 f"Unknown engine: {engine} (valid options are {mapping.keys()})"
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index 090235c862a2a..98a460f221592 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -511,7 +511,7 @@ def __next__(self):
 def test_buffer_rd_bytes_bad_unicode(c_parser_only):
     # see gh-22748
     t = BytesIO(b"\xb0")
-    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
+    t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape")
     msg = "'utf-8' codec can't encode character"
     with pytest.raises(UnicodeError, match=msg):
         c_parser_only.read_csv(t, encoding="UTF-8")
diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py
index 6aeed2377a3aa..eeb783f1957b7 100644
--- a/pandas/tests/io/parser/test_textreader.py
+++ b/pandas/tests/io/parser/test_textreader.py
@@ -48,6 +48,18 @@ def test_StringIO(self, csv_path):
         reader = TextReader(src, header=None)
         reader.read()
 
+    def test_encoding_mismatch_raises(self, csv_path):
+        # GH-57954
+        with open(csv_path, encoding="UTF-8") as f:
+            msg = "latin1 is different from the encoding"
+            with pytest.raises(ValueError, match=msg):
+                read_csv(f, encoding="latin1")
+
+    def test_encoding_alias_does_not_raise(self, csv_path):
+        # GH-57954: aliases of the handle's codec must be accepted
+        with open(csv_path, encoding="UTF-8") as f:
+            read_csv(f, encoding="utf8")
+
     def test_string_factorize(self):
         # should this be optional?
         data = "a\nb\na\nb\na"