From cb49c2105f53562b439c681d791167cdf476baab Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 22 Mar 2024 21:22:28 +0100 Subject: [PATCH 1/6] add exception when encodings exist and do not match --- pandas/io/parsers/readers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 539d9abf84f90..8aee715260b69 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1516,6 +1516,17 @@ def _make_engine( "pyarrow": ArrowParserWrapper, "python-fwf": FixedWidthFieldParser, } + + file_encoding = getattr(f, "encoding", None) + orig_reader_enc = self.orig_options.get("encoding", None) + are_both_encodings = file_encoding is not None and orig_reader_enc is not None + if are_both_encodings and file_encoding != orig_reader_enc: + file_path = getattr(f, "name", None) + raise ValueError( + f"The specified reader encoding {orig_reader_enc} is different from " + f"the encoding {file_encoding} of file {file_path}." + ) + if engine not in mapping: raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})" From da616c4f9516dbca05f593a6e7688aacb05c3a6c Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 22 Mar 2024 21:54:31 +0100 Subject: [PATCH 2/6] add exception when encodings exist and do not match --- pandas/tests/io/parser/test_textreader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 6aeed2377a3aa..5657b05297569 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -48,6 +48,13 @@ def test_StringIO(self, csv_path): reader = TextReader(src, header=None) reader.read() + def test_encoding_mismatch_warning(self, csv_path): + # GH-57954 + with open(csv_path) as f: + msg = "latin1 is different from the encoding" + with pytest.raises(ValueError, match=msg): + read_csv(f, encoding="latin1") + def test_string_factorize(self): # should this be optional? data = "a\nb\na\nb\na" From f35b3b3c202d864c8f3e45a4b208f58c906bca07 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 22 Mar 2024 21:54:31 +0100 Subject: [PATCH 3/6] add test for mismatching encodings warning --- pandas/tests/io/parser/test_textreader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 6aeed2377a3aa..5657b05297569 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -48,6 +48,13 @@ def test_StringIO(self, csv_path): reader = TextReader(src, header=None) reader.read() + def test_encoding_mismatch_warning(self, csv_path): + # GH-57954 + with open(csv_path) as f: + msg = "latin1 is different from the encoding" + with pytest.raises(ValueError, match=msg): + read_csv(f, encoding="latin1") + def test_string_factorize(self): # should this be optional? data = "a\nb\na\nb\na" From 0a222977ee5c1bace3eb1469bf82987a8d214f39 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Fri, 22 Mar 2024 23:09:22 +0100 Subject: [PATCH 4/6] add test for mismatching encodings warning --- pandas/tests/io/parser/test_c_parser_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 090235c862a2a..98a460f221592 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -511,7 +511,7 @@ def __next__(self): def test_buffer_rd_bytes_bad_unicode(c_parser_only): # see gh-22748 t = BytesIO(b"\xb0") - t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape") + t = TextIOWrapper(t, encoding="UTF-8", errors="surrogateescape") msg = "'utf-8' codec can't encode character" with pytest.raises(UnicodeError, match=msg): c_parser_only.read_csv(t, encoding="UTF-8") From efae8b84ecb338ed435621633ca38f36220e13ab Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Sat, 23 Mar 2024 13:23:04 +0100 Subject: [PATCH 5/6] add encoding for python 3.10+ --- pandas/tests/io/parser/test_textreader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 5657b05297569..eeb783f1957b7 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -50,7 +50,7 @@ def test_StringIO(self, csv_path): def test_encoding_mismatch_warning(self, csv_path): # GH-57954 - with open(csv_path) as f: + with open(csv_path, encoding="UTF-8") as f: msg = "latin1 is different from the encoding" with pytest.raises(ValueError, match=msg): read_csv(f, encoding="latin1") From 277a0d27617f236507c598dc2d62c1c66b1131d5 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 26 Mar 2024 22:03:22 +0100 Subject: [PATCH 6/6] move to _check_file; invert var and condition --- pandas/io/parsers/readers.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d42b028131eb2..7ecd8cd6d5012 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1310,6 +1310,16 @@ def _check_file_or_buffer(self, f, engine: CSVEngine) -> None: raise ValueError( "The 'python' engine cannot iterate through this file buffer." ) + if hasattr(f, "encoding"): + file_encoding = f.encoding + orig_reader_enc = self.orig_options.get("encoding", None) + any_none = file_encoding is None or orig_reader_enc is None + if file_encoding != orig_reader_enc and not any_none: + file_path = getattr(f, "name", None) + raise ValueError( + f"The specified reader encoding {orig_reader_enc} is different " + f"from the encoding {file_encoding} of file {file_path}." + ) def _clean_options( self, options: dict[str, Any], engine: CSVEngine @@ -1486,16 +1496,6 @@ def _make_engine( "python-fwf": FixedWidthFieldParser, } - file_encoding = getattr(f, "encoding", None) - orig_reader_enc = self.orig_options.get("encoding", None) - are_both_encodings = file_encoding is not None and orig_reader_enc is not None - if are_both_encodings and file_encoding != orig_reader_enc: - file_path = getattr(f, "name", None) - raise ValueError( - f"The specified reader encoding {orig_reader_enc} is different from " - f"the encoding {file_encoding} of file {file_path}." - ) - if engine not in mapping: raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})"