
ENH: 'encoding_errors' argument for read_csv/json #39777


Merged (2 commits) on Mar 9, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -141,6 +141,7 @@ Other enhancements
- Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`)
- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)

.. ---------------------------------------------------------------------------

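A minimal usage sketch of the argument this whatsnew entry describes (assumes pandas 1.3+; the byte 0xe9 is latin-1 'é' and is invalid UTF-8):

```python
import pandas as pd
from io import BytesIO

raw = b"name,value\ncaf\xe9,1\n"  # 0xe9 is not valid UTF-8

# Before this PR the reader raised UnicodeDecodeError here; with
# encoding_errors="replace" the undecodable byte becomes U+FFFD instead.
df = pd.read_csv(BytesIO(raw), encoding="utf-8", encoding_errors="replace")
print(df.loc[0, "name"])  # caf\ufffd
```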
53 changes: 28 additions & 25 deletions pandas/_libs/parsers.pyx
@@ -20,13 +20,19 @@ from libc.string cimport (
import cython
from cython import Py_ssize_t

from cpython.bytes cimport PyBytes_AsString
from cpython.bytes cimport (
PyBytes_AsString,
PyBytes_FromString,
)
from cpython.exc cimport (
PyErr_Fetch,
PyErr_Occurred,
)
from cpython.object cimport PyObject
from cpython.ref cimport Py_XDECREF
from cpython.ref cimport (
Py_INCREF,
Py_XDECREF,
)
from cpython.unicode cimport (
PyUnicode_AsUTF8String,
PyUnicode_Decode,
@@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h":
enum: ERROR_OVERFLOW

ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status)
int *status, const char *encoding_errors)
ctypedef int (*io_cleanup)(void *src)

ctypedef struct parser_t:
@@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h":

int parser_trim_buffers(parser_t *self)

int tokenize_all_rows(parser_t *self) nogil
int tokenize_nrows(parser_t *self, size_t nrows) nogil
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
@@ -293,7 +299,7 @@ cdef extern from "parser/io.h":
size_t *bytes_read, int *status)

void* buffer_rd_bytes(void *source, size_t nbytes,
size_t *bytes_read, int *status)
size_t *bytes_read, int *status, const char *encoding_errors)


cdef class TextReader:
@@ -316,6 +322,7 @@ cdef class TextReader:
uint64_t parser_start
list clocks
char *c_encoding
const char *encoding_errors
kh_str_starts_t *false_set
kh_str_starts_t *true_set

@@ -370,10 +377,15 @@
bint verbose=False,
bint mangle_dupe_cols=True,
float_precision=None,
bint skip_blank_lines=True):
bint skip_blank_lines=True,
encoding_errors=b"strict"):

# set encoding for native Python and C library
self.c_encoding = NULL
if isinstance(encoding_errors, str):
encoding_errors = encoding_errors.encode("utf-8")
Py_INCREF(encoding_errors)
self.encoding_errors = PyBytes_AsString(encoding_errors)
Review comment (Contributor):
Can you assert the valid values that are allowed here (or, if this is not validated at a higher level, raise ValueError on an illegal value)? Do we have tests for the same?

Reply (Member, author):
I will add a check in get_handle. That will fire for both the python and c engines.
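A test along the lines of that reply could look like this (hypothetical sketch; the PR's actual tests may differ):

```python
import pytest
import pandas as pd
from io import StringIO

def test_invalid_encoding_errors_raises():
    # get_handle validates the handler name before either engine runs.
    with pytest.raises(ValueError, match="Invalid value for `encoding_errors`"):
        pd.read_csv(StringIO("a,b\n1,2"), encoding_errors="bad-handler")
```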

self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize
@@ -558,13 +570,7 @@ cdef class TextReader:
pass

def __dealloc__(self):
parser_free(self.parser)
if self.true_set:
kh_destroy_str_starts(self.true_set)
self.true_set = NULL
if self.false_set:
kh_destroy_str_starts(self.false_set)
self.false_set = NULL
self.close()
parser_del(self.parser)

def close(self):
@@ -632,7 +638,6 @@ cdef class TextReader:
char *word
object name, old_name
uint64_t hr, data_line = 0
char *errors = "strict"
StringPath path = _string_path(self.c_encoding)
list header = []
set unnamed_cols = set()
@@ -673,11 +678,8 @@
for i in range(field_count):
word = self.parser.words[start + i]

if path == UTF8:
name = PyUnicode_FromString(word)
elif path == ENCODED:
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, errors)
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, self.encoding_errors)

# We use this later when collecting placeholder names.
old_name = name
@@ -831,7 +833,7 @@ cdef class TextReader:
int status

with nogil:
status = tokenize_nrows(self.parser, nrows)
status = tokenize_nrows(self.parser, nrows, self.encoding_errors)

if self.parser.warn_msg != NULL:
print(self.parser.warn_msg, file=sys.stderr)
@@ -859,7 +861,7 @@
'the whole file')
else:
with nogil:
status = tokenize_all_rows(self.parser)
status = tokenize_all_rows(self.parser, self.encoding_errors)

if self.parser.warn_msg != NULL:
print(self.parser.warn_msg, file=sys.stderr)
@@ -1201,7 +1203,7 @@

if path == UTF8:
return _string_box_utf8(self.parser, i, start, end, na_filter,
na_hashset)
na_hashset, self.encoding_errors)
elif path == ENCODED:
return _string_box_decode(self.parser, i, start, end,
na_filter, na_hashset, self.c_encoding)
@@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding):

cdef _string_box_utf8(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
const char *encoding_errors):
cdef:
int error, na_count = 0
Py_ssize_t i, lines
@@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
pyval = <object>table.vals[k]
else:
# box it. new ref?
pyval = PyUnicode_FromString(word)
pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)

k = kh_put_strbox(table, word, &ret)
table.vals[k] = <PyObject *>pyval
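The key change above is swapping PyUnicode_FromString (which always decodes strictly as UTF-8) for PyUnicode_Decode, so the handler applies while each parsed token is boxed. The Python-level equivalent of the new call:

```python
word = b"caf\xe9"  # a token as the C tokenizer sees it

word.decode("utf-8", "strict")            # raises UnicodeDecodeError
word.decode("utf-8", "replace")           # 'caf\ufffd'
word.decode("utf-8", "backslashreplace")  # 'caf\\xe9'
```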
4 changes: 2 additions & 2 deletions pandas/_libs/src/parser/io.c
@@ -163,7 +163,7 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
}

void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status) {
int *status, const char *encoding_errors) {
PyGILState_STATE state;
PyObject *result, *func, *args, *tmp;

@@ -191,7 +191,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
*status = CALLING_READ_FAILED;
return NULL;
} else if (!PyBytes_Check(result)) {
tmp = PyUnicode_AsUTF8String(result);
tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
Py_DECREF(result);
if (tmp == NULL) {
PyGILState_Release(state);
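In buffer_rd_bytes the data flows the other way: text read from a user-supplied file object is re-encoded to UTF-8 bytes for the C tokenizer. PyUnicode_AsUTF8String is just PyUnicode_AsEncodedString with "strict" hard-coded, so the change only exposes the handler. That matters mainly for lone surrogates, e.g. from a file opened with errors="surrogateescape", which strict UTF-8 encoding rejects:

```python
s = b"caf\xe9".decode("utf-8", "surrogateescape")  # 'caf\udce9'

s.encode("utf-8")                   # raises UnicodeEncodeError
s.encode("utf-8", "surrogatepass")  # b'caf\xed\xb3\xa9'
```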
2 changes: 1 addition & 1 deletion pandas/_libs/src/parser/io.h
@@ -64,6 +64,6 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status);

void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
int *status);
int *status, const char *encoding_errors);

#endif // PANDAS__LIBS_SRC_PARSER_IO_H_
20 changes: 12 additions & 8 deletions pandas/_libs/src/parser/tokenizer.c
@@ -553,13 +553,15 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
return 0;
}

static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
static int parser_buffer_bytes(parser_t *self, size_t nbytes,
const char *encoding_errors) {
int status;
size_t bytes_read;

status = 0;
self->datapos = 0;
self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
self->data = self->cb_io(self->source, nbytes, &bytes_read, &status,
encoding_errors);
TRACE((
"parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
nbytes, bytes_read, status));
@@ -1334,7 +1336,8 @@ int parser_trim_buffers(parser_t *self) {
all : tokenize all the data vs. certain number of rows
*/

int _tokenize_helper(parser_t *self, size_t nrows, int all) {
int _tokenize_helper(parser_t *self, size_t nrows, int all,
const char *encoding_errors) {
int status = 0;
uint64_t start_lines = self->lines;

@@ -1350,7 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
if (!all && self->lines - start_lines >= nrows) break;

if (self->datapos == self->datalen) {
status = parser_buffer_bytes(self, self->chunksize);
status = parser_buffer_bytes(self, self->chunksize,
encoding_errors);

if (status == REACHED_EOF) {
// close out last line
@@ -1383,13 +1387,13 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
return status;
}

int tokenize_nrows(parser_t *self, size_t nrows) {
int status = _tokenize_helper(self, nrows, 0);
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
int status = _tokenize_helper(self, nrows, 0, encoding_errors);
return status;
}

int tokenize_all_rows(parser_t *self) {
int status = _tokenize_helper(self, -1, 1);
int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
int status = _tokenize_helper(self, -1, 1, encoding_errors);
return status;
}

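Note that the tokenizer never inspects encoding_errors; it only threads the string through to the io_callback, where the actual decoding or encoding happens. A rough, self-contained Python sketch of that pattern (hypothetical names, illustration only):

```python
from typing import Callable, Iterable

# Stand-in for buffer_rd_bytes: the callback, not the tokenizer, applies the handler.
IOCallback = Callable[[bytes, str], str]

def decode_chunk(chunk: bytes, encoding_errors: str) -> str:
    return chunk.decode("utf-8", encoding_errors)

def tokenize_all_rows(
    chunks: Iterable[bytes],
    cb_io: IOCallback,
    encoding_errors: str = "strict",
) -> list:
    # Mirrors _tokenize_helper: encoding_errors is forwarded untouched.
    return [cb_io(chunk, encoding_errors) for chunk in chunks]

print(tokenize_all_rows([b"a,b\n", b"caf\xe9,1\n"], decode_chunk, "replace"))
# ['a,b\n', 'caf\ufffd,1\n']
```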
6 changes: 3 additions & 3 deletions pandas/_libs/src/parser/tokenizer.h
@@ -85,7 +85,7 @@ typedef enum {
} QuoteStyle;

typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
int *status);
int *status, const char *encoding_errors);
typedef int (*io_cleanup)(void *src);

typedef struct parser_t {
@@ -196,9 +196,9 @@ void parser_del(parser_t *self);

void parser_set_default_options(parser_t *self);

int tokenize_nrows(parser_t *self, size_t nrows);
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);

int tokenize_all_rows(parser_t *self);
int tokenize_all_rows(parser_t *self, const char *encoding_errors);

// Have parsed / type-converted a chunk of data
// and want to free memory from the token stream
25 changes: 21 additions & 4 deletions pandas/io/common.py
@@ -583,12 +583,32 @@ def get_handle(
Returns the dataclass IOHandles
"""
# Windows does not default to utf-8. Set to utf-8 for a consistent behavior
encoding_passed, encoding = encoding, encoding or "utf-8"
encoding = encoding or "utf-8"

# read_csv does not know whether the buffer is opened in binary/text mode
if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
mode += "b"

# validate errors
if isinstance(errors, str):
errors = errors.lower()
if errors not in (
None,
"strict",
"ignore",
"replace",
"xmlcharrefreplace",
"backslashreplace",
"namereplace",
"surrogateescape",
"surrogatepass",
):
raise ValueError(
f"Invalid value for `encoding_errors` ({errors}). Please see "
+ "https://docs.python.org/3/library/codecs.html#error-handlers "
+ "for valid values."
)

# open URLs
ioargs = _get_filepath_or_buffer(
path_or_buf,
@@ -677,9 +697,6 @@ def get_handle(
# Check whether the filename is to be opened in binary mode.
# Binary mode does not support 'encoding' and 'newline'.
if ioargs.encoding and "b" not in ioargs.mode:
if errors is None and encoding_passed is None:
# ignore errors when no encoding is specified
errors = "replace"
# Encoding
handle = open(
handle,
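The check above hard-codes the built-in handler names. An alternative (not what this PR does) would be codecs.lookup_error, which raises LookupError for unknown names and also accepts custom handlers registered via codecs.register_error:

```python
import codecs

def validate_encoding_errors(errors: str) -> str:
    # Lookup-based sketch; also covers handlers registered at runtime.
    try:
        codecs.lookup_error(errors)
    except LookupError as exc:
        raise ValueError(
            f"Invalid value for `encoding_errors` ({errors}). Please see "
            "https://docs.python.org/3/library/codecs.html#error-handlers "
            "for valid values."
        ) from exc
    return errors
```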
15 changes: 13 additions & 2 deletions pandas/io/json/_json.py
@@ -334,6 +334,7 @@ def read_json(
precise_float: bool = False,
date_unit=None,
encoding=None,
encoding_errors: Optional[str] = "strict",
lines: bool = False,
chunksize: Optional[int] = None,
compression: CompressionOptions = "infer",
@@ -456,6 +457,12 @@ def read_json(
encoding : str, default is 'utf-8'
The encoding to use to decode py3 bytes.

encoding_errors : str, optional, default "strict"
How encoding errors are treated. `List of possible values
<https://docs.python.org/3/library/codecs.html#error-handlers>`_.

.. versionadded:: 1.3

lines : bool, default False
Read the file as a json object per line.

@@ -584,6 +591,7 @@ def read_json(
compression=compression,
nrows=nrows,
storage_options=storage_options,
encoding_errors=encoding_errors,
)

if chunksize:
@@ -620,6 +628,7 @@ def __init__(
compression: CompressionOptions,
nrows: Optional[int],
storage_options: StorageOptions = None,
encoding_errors: Optional[str] = "strict",
):

self.orient = orient
@@ -638,6 +647,7 @@ def __init__(
self.chunksize = chunksize
self.nrows_seen = 0
self.nrows = nrows
self.encoding_errors = encoding_errors
self.handles: Optional[IOHandles] = None

if self.chunksize is not None:
@@ -661,8 +671,8 @@ def _preprocess_data(self, data):
Otherwise, we read it into memory for the `read` method.
"""
if hasattr(data, "read") and not (self.chunksize or self.nrows):
data = data.read()
self.close()
with self:
data = data.read()
if not hasattr(data, "read") and (self.chunksize or self.nrows):
data = StringIO(data)

@@ -692,6 +702,7 @@ def _get_data_from_filepath(self, filepath_or_buffer):
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
errors=self.encoding_errors,
)
filepath_or_buffer = self.handles.handle

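A matching usage sketch for the JSON side (assumes pandas 1.3+; 0xe9 is again invalid UTF-8):

```python
import pandas as pd
from io import BytesIO

raw = b'[{"name": "caf\xe9", "value": 1}]'

# The bad byte is replaced with U+FFFD during decoding, and the result
# is still valid JSON.
df = pd.read_json(BytesIO(raw), encoding="utf-8", encoding_errors="replace")
print(df.loc[0, "name"])  # caf\ufffd
```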
2 changes: 2 additions & 0 deletions pandas/io/parsers/base_parser.py
@@ -109,6 +109,7 @@
"mangle_dupe_cols": True,
"infer_datetime_format": False,
"skip_blank_lines": True,
"encoding_errors": "strict",
}


@@ -212,6 +213,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
compression=kwds.get("compression", None),
memory_map=kwds.get("memory_map", False),
storage_options=kwds.get("storage_options", None),
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: List[str]) -> None:
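Because the default lives in parser_defaults and _open_handles forwards it to get_handle as errors, both read_csv engines honor the argument. A quick parity check (sketch, assumes pandas 1.3+):

```python
import pandas as pd
from io import BytesIO

raw = b"name,value\ncaf\xe9,1\n"

for engine in ("c", "python"):
    df = pd.read_csv(BytesIO(raw), engine=engine,
                     encoding_errors="backslashreplace")
    print(engine, df.loc[0, "name"])  # both print: caf\xe9
```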