From d6148fea83906695f9d84b249c315c6809c098ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 12 Feb 2021 09:01:37 -0500 Subject: [PATCH 1/2] ENH: 'encoding_errors' argument for read_csv/json --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 53 ++++++++++--------- pandas/_libs/src/parser/io.c | 4 +- pandas/_libs/src/parser/io.h | 2 +- pandas/_libs/src/parser/tokenizer.c | 20 ++++--- pandas/_libs/src/parser/tokenizer.h | 6 +-- pandas/io/common.py | 25 +++++++-- pandas/io/json/_json.py | 15 +++++- pandas/io/parsers/base_parser.py | 2 + pandas/io/parsers/readers.py | 15 ++++++ .../io/parser/common/test_common_basic.py | 19 +++++++ .../io/parser/common/test_read_errors.py | 2 +- pandas/tests/io/test_common.py | 52 +++++++++++++++--- 13 files changed, 162 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 370ea28832758..b240dad32f0e1 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -141,6 +141,7 @@ Other enhancements - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`) - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`) +- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c4d98ccb88ba5..031a567925a4d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -20,13 +20,19 @@ from libc.string cimport ( import cython from cython import Py_ssize_t -from cpython.bytes cimport PyBytes_AsString +from cpython.bytes cimport ( + PyBytes_AsString, + PyBytes_FromString, +) from cpython.exc cimport ( PyErr_Fetch, PyErr_Occurred, ) from cpython.object cimport PyObject -from cpython.ref cimport Py_XDECREF +from cpython.ref cimport ( + Py_INCREF, + Py_XDECREF, +) from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, @@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status) + int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: @@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h": int parser_trim_buffers(parser_t *self) - int tokenize_all_rows(parser_t *self) nogil - int tokenize_nrows(parser_t *self, size_t nrows) nogil + int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil + int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil int64_t str_to_int64(char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) nogil @@ -293,7 +299,7 @@ cdef extern from "parser/io.h": size_t *bytes_read, int *status) void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) + size_t *bytes_read, int *status, const char *encoding_errors) cdef class TextReader: @@ -316,6 +322,7 @@ cdef class TextReader: uint64_t parser_start list clocks char *c_encoding + const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set @@ -370,10 +377,15 @@ cdef class TextReader: bint verbose=False, bint mangle_dupe_cols=True, float_precision=None, - bint 
skip_blank_lines=True): + bint skip_blank_lines=True, + encoding_errors=b"strict"): # set encoding for native Python and C library self.c_encoding = NULL + if isinstance(encoding_errors, str): + encoding_errors = encoding_errors.encode("utf-8") + Py_INCREF(encoding_errors) + self.encoding_errors = PyBytes_AsString(encoding_errors) self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -558,13 +570,7 @@ cdef class TextReader: pass def __dealloc__(self): - parser_free(self.parser) - if self.true_set: - kh_destroy_str_starts(self.true_set) - self.true_set = NULL - if self.false_set: - kh_destroy_str_starts(self.false_set) - self.false_set = NULL + self.close() parser_del(self.parser) def close(self): @@ -632,7 +638,6 @@ cdef class TextReader: char *word object name, old_name uint64_t hr, data_line = 0 - char *errors = "strict" StringPath path = _string_path(self.c_encoding) list header = [] set unnamed_cols = set() @@ -673,11 +678,8 @@ cdef class TextReader: for i in range(field_count): word = self.parser.words[start + i] - if path == UTF8: - name = PyUnicode_FromString(word) - elif path == ENCODED: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, self.encoding_errors) # We use this later when collecting placeholder names. old_name = name @@ -831,7 +833,7 @@ cdef class TextReader: int status with nogil: - status = tokenize_nrows(self.parser, nrows) + status = tokenize_nrows(self.parser, nrows, self.encoding_errors) if self.parser.warn_msg != NULL: print(self.parser.warn_msg, file=sys.stderr) @@ -859,7 +861,7 @@ cdef class TextReader: 'the whole file') else: with nogil: - status = tokenize_all_rows(self.parser) + status = tokenize_all_rows(self.parser, self.encoding_errors) if self.parser.warn_msg != NULL: print(self.parser.warn_msg, file=sys.stderr) @@ -1201,7 +1203,7 @@ cdef class TextReader: if path == UTF8: return _string_box_utf8(self.parser, i, start, end, na_filter, - na_hashset) + na_hashset, self.encoding_errors) elif path == ENCODED: return _string_box_decode(self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) @@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding): cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, + const char *encoding_errors): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, pyval = table.vals[k] else: # box it. new ref? 
- pyval = PyUnicode_FromString(word) + pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors) k = kh_put_strbox(table, word, &ret) table.vals[k] = pyval diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 51504527de5a2..449f0b55bff70 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -163,7 +163,7 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, - int *status) { + int *status, const char *encoding_errors) { PyGILState_STATE state; PyObject *result, *func, *args, *tmp; @@ -191,7 +191,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, *status = CALLING_READ_FAILED; return NULL; } else if (!PyBytes_Check(result)) { - tmp = PyUnicode_AsUTF8String(result); + tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); Py_DECREF(result); if (tmp == NULL) { PyGILState_Release(state); diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index aac418457d3b6..dbe757b458c54 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -64,6 +64,6 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status); void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, - int *status); + int *status, const char *encoding_errors); #endif // PANDAS__LIBS_SRC_PARSER_IO_H_ diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1b229171ea879..49eb1e7855098 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -553,13 +553,15 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { return 0; } -static int parser_buffer_bytes(parser_t *self, size_t nbytes) { +static int parser_buffer_bytes(parser_t *self, size_t nbytes, + const char *encoding_errors) { int status; size_t bytes_read; status = 0; self->datapos = 0; - self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); + self->data = self->cb_io(self->source, nbytes, &bytes_read, &status, + encoding_errors); TRACE(( "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); @@ -1334,7 +1336,8 @@ int parser_trim_buffers(parser_t *self) { all : tokenize all the data vs. 
certain number of rows
 */
-int _tokenize_helper(parser_t *self, size_t nrows, int all) {
+int _tokenize_helper(parser_t *self, size_t nrows, int all,
+                     const char *encoding_errors) {
     int status = 0;
     uint64_t start_lines = self->lines;
 
@@ -1350,7 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
         if (!all && self->lines - start_lines >= nrows) break;
 
         if (self->datapos == self->datalen) {
-            status = parser_buffer_bytes(self, self->chunksize);
+            status = parser_buffer_bytes(self, self->chunksize,
+                                         encoding_errors);
 
             if (status == REACHED_EOF) {
                 // close out last line
@@ -1383,13 +1387,13 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
     return status;
 }
 
-int tokenize_nrows(parser_t *self, size_t nrows) {
-    int status = _tokenize_helper(self, nrows, 0);
+int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
+    int status = _tokenize_helper(self, nrows, 0, encoding_errors);
     return status;
 }
 
-int tokenize_all_rows(parser_t *self) {
-    int status = _tokenize_helper(self, -1, 1);
+int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
+    int status = _tokenize_helper(self, -1, 1, encoding_errors);
     return status;
 }
 
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index 876e2267906ee..f69fee4993d34 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -85,7 +85,7 @@ typedef enum {
 } QuoteStyle;
 
 typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
-                             int *status);
+                             int *status, const char *encoding_errors);
 typedef int (*io_cleanup)(void *src);
 
 typedef struct parser_t {
@@ -196,9 +196,9 @@ void parser_del(parser_t *self);
 
 void parser_set_default_options(parser_t *self);
 
-int tokenize_nrows(parser_t *self, size_t nrows);
+int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
 
-int tokenize_all_rows(parser_t *self);
+int tokenize_all_rows(parser_t *self, const char *encoding_errors);
 
 // Have parsed / type-converted a chunk of data
 // and want to free memory from the token stream
diff --git a/pandas/io/common.py b/pandas/io/common.py
index cf3b92ec93b1f..b87e8fcae1064 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -583,12 +583,32 @@ def get_handle(
     Returns the dataclass IOHandles
     """
     # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
-    encoding_passed, encoding = encoding, encoding or "utf-8"
+    encoding = encoding or "utf-8"
 
     # read_csv does not know whether the buffer is opened in binary/text mode
     if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
         mode += "b"
 
+    # validate errors
+    if isinstance(errors, str):
+        errors = errors.lower()
+    if errors not in (
+        None,
+        "strict",
+        "ignore",
+        "replace",
+        "xmlcharrefreplace",
+        "backslashreplace",
+        "namereplace",
+        "surrogateescape",
+        "surrogatepass",
+    ):
+        raise ValueError(
+            f"Invalid value for `encoding_errors` ({errors}). Please see "
+            + "https://docs.python.org/3/library/codecs.html#error-handlers "
+            + "for valid values."
+        )
+
     # open URLs
     ioargs = _get_filepath_or_buffer(
         path_or_buf,
@@ -677,9 +697,6 @@ def get_handle(
         # Check whether the filename is to be opened in binary mode.
         # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
-            if errors is None and encoding_passed is None:
-                # ignore errors when no encoding is specified
-                errors = "replace"
             # Encoding
             handle = open(
                 handle,
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index f050a6a086584..aa654e971641f 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -334,6 +334,7 @@ def read_json(
     precise_float: bool = False,
     date_unit=None,
     encoding=None,
+    encoding_errors: Optional[str] = "strict",
     lines: bool = False,
     chunksize: Optional[int] = None,
     compression: CompressionOptions = "infer",
@@ -456,6 +457,12 @@ def read_json(
     encoding : str, default is 'utf-8'
         The encoding to use to decode py3 bytes.
 
+    encoding_errors : str, optional, default "strict"
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+        .. versionadded:: 1.3
+
     lines : bool, default False
         Read the file as a json object per line.
 
@@ -584,6 +591,7 @@ def read_json(
         compression=compression,
         nrows=nrows,
         storage_options=storage_options,
+        encoding_errors=encoding_errors,
     )
 
     if chunksize:
@@ -620,6 +628,7 @@ def __init__(
         compression: CompressionOptions,
         nrows: Optional[int],
         storage_options: StorageOptions = None,
+        encoding_errors: Optional[str] = "strict",
     ):
 
         self.orient = orient
@@ -638,6 +647,7 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.nrows = nrows
+        self.encoding_errors = encoding_errors
         self.handles: Optional[IOHandles] = None
 
         if self.chunksize is not None:
@@ -661,8 +671,8 @@ def _preprocess_data(self, data):
         Otherwise, we read it into memory for the `read` method.
         """
         if hasattr(data, "read") and not (self.chunksize or self.nrows):
-            data = data.read()
-            self.close()
+            with self:
+                data = data.read()
         if not hasattr(data, "read") and (self.chunksize or self.nrows):
             data = StringIO(data)
 
@@ -692,6 +702,7 @@ def _get_data_from_filepath(self, filepath_or_buffer):
                 encoding=self.encoding,
                 compression=self.compression,
                 storage_options=self.storage_options,
+                errors=self.encoding_errors,
             )
             filepath_or_buffer = self.handles.handle
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2d17978b60327..c05efe9e73c5a 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -109,6 +109,7 @@
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
+    "encoding_errors": "strict",
 }
 
 
@@ -212,6 +213,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
             compression=kwds.get("compression", None),
             memory_map=kwds.get("memory_map", False),
             storage_options=kwds.get("storage_options", None),
+            errors=kwds.get("encoding_errors", "strict"),
         )
 
     def _validate_parse_dates_presence(self, columns: List[str]) -> None:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index edfc7ee0b6258..6adf1b20b769f 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -296,11 +296,24 @@
     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
     standard encodings
     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
+
     .. versionchanged:: 1.2
 
        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
        This behavior was previously only the case for ``engine="python"``.
+
+    .. versionchanged:: 1.3
+
+       ``encoding_errors`` is a new argument. ``encoding`` no longer has an
+       influence on how encoding errors are handled.
+
+encoding_errors : str, optional, default "strict"
+    How encoding errors are treated. `List of possible values
+    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+    .. versionadded:: 1.3
+
 dialect : str or csv.Dialect, optional
     If provided, this parameter will override values (default or not) for the
     following parameters: `delimiter`, `doublequote`, `escapechar`,
@@ -515,6 +528,7 @@ def read_csv(
     escapechar=None,
     comment=None,
     encoding=None,
+    encoding_errors: Optional[str] = "strict",
     dialect=None,
     # Error Handling
     error_bad_lines=True,
@@ -599,6 +613,7 @@ def read_table(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    encoding_errors: Optional[str] = "strict",
     # Internal
     delim_whitespace=False,
     low_memory=_c_parser_defaults["low_memory"],
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 6730d8cc46603..7cf9470e3057d 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -6,6 +6,7 @@
 from inspect import signature
 from io import StringIO
 import os
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -734,3 +735,21 @@ def test_dict_keys_as_names(all_parsers):
     result = parser.read_csv(StringIO(data), names=keys)
     expected = DataFrame({"a": [1], "b": [2]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_encoding_surrogatepass(all_parsers):
+    # GH39017
+    parser = all_parsers
+    content = b"\xed\xbd\xbf"
+    decoded = content.decode("utf-8", errors="surrogatepass")
+    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
+    expected.index.name = decoded * 2
+
+    with tm.ensure_clean() as path:
+        Path(path).write_bytes(
+            content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
+        )
+        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
+        tm.assert_frame_equal(df, expected)
+        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
+            parser.read_csv(path)
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 91b5e26efafa1..4e3d99af685ec 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -232,5 +232,5 @@ def test_open_file(all_parsers):
     warnings.simplefilter("always", category=ResourceWarning)
     with warnings.catch_warnings(record=True) as record:
         with pytest.raises(csv.Error, match="Could not determine delimiter"):
-            parser.read_csv(file, sep=None)
+            parser.read_csv(file, sep=None, encoding_errors="replace")
     assert len(record) == 0, record[0].message
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index e1dcec56913f9..bacec6969b940 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -2,6 +2,7 @@
 Tests for the pandas.io.common functionalities
 """
 import codecs
+from functools import partial
 from io import (
     BytesIO,
     StringIO,
@@ -429,14 +430,6 @@ def test_is_fsspec_url():
     assert not icom.is_fsspec_url("relative/local/path")
 
 
-def test_default_errors():
-    # GH 38989
-    with tm.ensure_clean() as path:
-        file = Path(path)
-        file.write_bytes(b"\xe4\na\n1")
-        tm.assert_frame_equal(pd.read_csv(file, skiprows=[0]), pd.DataFrame({"a": [1]}))
-
-
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
@@ -481,3 +474,46 @@ def test_explicit_encoding(io_class, mode, msg):
     with io_class() as buffer:
         with pytest.raises(TypeError, match=msg):
             expected.to_csv(buffer, mode=f"w{mode}")
+
+
+@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"])
+@pytest.mark.parametrize("format", ["csv", "json"])
+def test_encoding_errors(encoding_errors, format):
+    # GH39450
+    msg = "'utf-8' codec can't decode byte"
+    bad_encoding = b"\xe4"
+
+    if format == "csv":
+        content = bad_encoding + b"\n" + bad_encoding
+        reader = pd.read_csv
+    else:
+        content = (
+            b'{"'
+            + bad_encoding * 2
+            + b'": {"'
+            + bad_encoding
+            + b'":"'
+            + bad_encoding
+            + b'"}}'
+        )
+        reader = partial(pd.read_json, orient="index")
+    with tm.ensure_clean() as path:
+        file = Path(path)
+        file.write_bytes(content)
+
+        if encoding_errors != "replace":
+            with pytest.raises(UnicodeDecodeError, match=msg):
+                reader(path, encoding_errors=encoding_errors)
+        else:
+            df = reader(path, encoding_errors=encoding_errors)
+            decoded = bad_encoding.decode(errors=encoding_errors)
+            expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2])
+            tm.assert_frame_equal(df, expected)
+
+
+def test_bad_encdoing_errors():
+    # GH 39777
+    with tm.ensure_clean() as path:
+        with pytest.raises(ValueError, match="Invalide value for `encoding_errors`"):
+            icom.get_handle(path, "w", errors="bad")

From 029c61c80dbc06ede6d79bd0804104531f5f3996 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Mon, 8 Mar 2021 20:56:46 -0500
Subject: [PATCH 2/2] fix typo in test

---
 pandas/tests/io/test_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index bacec6969b940..e483b5b5a8abb 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -515,5 +515,5 @@ def test_encoding_errors(encoding_errors, format):
 def test_bad_encdoing_errors():
     # GH 39777
     with tm.ensure_clean() as path:
-        with pytest.raises(ValueError, match="Invalide value for `encoding_errors`"):
+        with pytest.raises(ValueError, match="Invalid value for `encoding_errors`"):
             icom.get_handle(path, "w", errors="bad")
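
Usage sketch (not part of the patches): assuming both patches are applied, the
snippet below exercises the new ``encoding_errors`` argument of ``read_csv``.
The temporary file and its byte payload are illustrative; the handler names
come from the whitelist added to ``get_handle`` above.

    from pathlib import Path
    import tempfile

    import pandas as pd

    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "bad.csv"
        # 0xe4 is not valid UTF-8, so the default encoding_errors="strict"
        # raises UnicodeDecodeError when decoding this file.
        path.write_bytes(b"a\n\xe4")

        try:
            pd.read_csv(path)  # encoding_errors="strict" is the default
        except UnicodeDecodeError as err:
            print("strict:", err)

        # "replace" substitutes U+FFFD for the invalid byte ...
        print(pd.read_csv(path, encoding_errors="replace"))
        # ... while "backslashreplace" keeps it as an escape sequence.
        print(pd.read_csv(path, encoding_errors="backslashreplace"))

        # An unknown handler name raises ValueError from get_handle (GH 39777).

Note the design choice: the handler name is validated once, centrally, in
``get_handle``, so ``read_csv`` (both engines) and ``read_json`` share the
same set of accepted values and raise the same ``ValueError`` for anything
else.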