From bbd9e99ccfbda0e05ce4d1c471e2c6e30b1c2a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 16 Sep 2021 20:15:56 -0400 Subject: [PATCH 1/2] CLN: let `codecs` validate the possible values of encoding errors --- pandas/io/common.py | 22 ++++------------------ pandas/tests/io/test_common.py | 2 +- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 46be1f9bb09b2..fcb9669e09ee3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -602,25 +602,11 @@ def get_handle( if _is_binary_mode(path_or_buf, mode) and "b" not in mode: mode += "b" - # valdiate errors + # validate encoding and errors + if isinstance(encoding, str): + codecs.lookup(encoding) if isinstance(errors, str): - errors = errors.lower() - if errors not in ( - None, - "strict", - "ignore", - "replace", - "xmlcharrefreplace", - "backslashreplace", - "namereplace", - "surrogateescape", - "surrogatepass", - ): - raise ValueError( - f"Invalid value for `encoding_errors` ({errors}). Please see " - + "https://docs.python.org/3/library/codecs.html#error-handlers " - + "for valid values." - ) + codecs.lookup_error(errors) # open URLs ioargs = _get_filepath_or_buffer( diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ad0b25d26d6f6..699459ab3666d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -593,7 +593,7 @@ def test_encoding_errors(encoding_errors, format): def test_bad_encdoing_errors(): # GH 39777 with tm.ensure_clean() as path: - with pytest.raises(ValueError, match="Invalid value for `encoding_errors`"): + with pytest.raises(LookupError, match="unknown error handler name"): icom.get_handle(path, "w", errors="bad") From 1fa9c26a46f75acbf8f9c19095d6274350c4e18d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 16 Sep 2021 22:06:50 -0400 Subject: [PATCH 2/2] stricter parsing of encoding --- pandas/io/common.py | 4 ---- pandas/tests/io/xml/test_xml.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index fcb9669e09ee3..ba1cc82bfea56 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -276,10 +276,6 @@ def _get_filepath_or_buffer( compression = dict(compression, method=compression_method) - # uniform encoding names - if encoding is not None: - encoding = encoding.replace("_", "-").lower() - # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 7e9a03c2a59a8..a99f66336bf22 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -709,7 +709,7 @@ def test_utf16_encoding(datapath, parser): def test_unknown_encoding(datapath, parser): filename = datapath("io", "data", "xml", "baby_names.xml") - with pytest.raises(LookupError, match=("unknown encoding: uft-8")): + with pytest.raises(LookupError, match=("unknown encoding: UFT-8")): read_xml(filename, encoding="UFT-8", parser=parser)