Skip to content

Commit d1dd19d

Browse files
committed
ENH: 'encoding_errors' argument for read_csv/json
1 parent e06ad0a commit d1dd19d

File tree

13 files changed

+128
-52
lines changed

13 files changed

+128
-52
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ Other enhancements
138138
- :meth:`pandas.read_stata` and :class:`StataReader` support reading data from compressed files.
139139
- Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`)
140140
- Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
141+
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
141142

142143
.. ---------------------------------------------------------------------------
143144

pandas/_libs/parsers.pyx

+19-23
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ cdef extern from "parser/tokenizer.h":
143143
enum: ERROR_OVERFLOW
144144

145145
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
146-
int *status)
146+
int *status, const char *encoding_errors)
147147
ctypedef int (*io_cleanup)(void *src)
148148

149149
ctypedef struct parser_t:
@@ -255,8 +255,8 @@ cdef extern from "parser/tokenizer.h":
255255

256256
int parser_trim_buffers(parser_t *self)
257257

258-
int tokenize_all_rows(parser_t *self) nogil
259-
int tokenize_nrows(parser_t *self, size_t nrows) nogil
258+
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
259+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
260260

261261
int64_t str_to_int64(char *p_item, int64_t int_min,
262262
int64_t int_max, int *error, char tsep) nogil
@@ -293,7 +293,7 @@ cdef extern from "parser/io.h":
293293
size_t *bytes_read, int *status)
294294

295295
void* buffer_rd_bytes(void *source, size_t nbytes,
296-
size_t *bytes_read, int *status)
296+
size_t *bytes_read, int *status, const char *encoding_errors)
297297

298298

299299
cdef class TextReader:
@@ -316,6 +316,7 @@ cdef class TextReader:
316316
uint64_t parser_start
317317
list clocks
318318
char *c_encoding
319+
const char *encoding_errors
319320
kh_str_starts_t *false_set
320321
kh_str_starts_t *true_set
321322

@@ -370,10 +371,14 @@ cdef class TextReader:
370371
bint verbose=False,
371372
bint mangle_dupe_cols=True,
372373
float_precision=None,
373-
bint skip_blank_lines=True):
374+
bint skip_blank_lines=True,
375+
encoding_errors=b"strict"):
374376

375377
# set encoding for native Python and C library
376378
self.c_encoding = NULL
379+
if not isinstance(encoding_errors, bytes):
380+
encoding_errors = encoding_errors.encode("utf-8")
381+
self.encoding_errors = <const char*>encoding_errors
377382

378383
self.parser = parser_new()
379384
self.parser.chunksize = tokenize_chunksize
@@ -558,13 +563,7 @@ cdef class TextReader:
558563
pass
559564

560565
def __dealloc__(self):
561-
parser_free(self.parser)
562-
if self.true_set:
563-
kh_destroy_str_starts(self.true_set)
564-
self.true_set = NULL
565-
if self.false_set:
566-
kh_destroy_str_starts(self.false_set)
567-
self.false_set = NULL
566+
self.close()
568567
parser_del(self.parser)
569568

570569
def close(self):
@@ -632,7 +631,6 @@ cdef class TextReader:
632631
char *word
633632
object name, old_name
634633
uint64_t hr, data_line = 0
635-
char *errors = "strict"
636634
StringPath path = _string_path(self.c_encoding)
637635
list header = []
638636
set unnamed_cols = set()
@@ -673,11 +671,8 @@ cdef class TextReader:
673671
for i in range(field_count):
674672
word = self.parser.words[start + i]
675673

676-
if path == UTF8:
677-
name = PyUnicode_FromString(word)
678-
elif path == ENCODED:
679-
name = PyUnicode_Decode(word, strlen(word),
680-
self.c_encoding, errors)
674+
name = PyUnicode_Decode(word, strlen(word),
675+
self.c_encoding, self.encoding_errors)
681676

682677
# We use this later when collecting placeholder names.
683678
old_name = name
@@ -831,7 +826,7 @@ cdef class TextReader:
831826
int status
832827

833828
with nogil:
834-
status = tokenize_nrows(self.parser, nrows)
829+
status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
835830

836831
if self.parser.warn_msg != NULL:
837832
print(self.parser.warn_msg, file=sys.stderr)
@@ -859,7 +854,7 @@ cdef class TextReader:
859854
'the whole file')
860855
else:
861856
with nogil:
862-
status = tokenize_all_rows(self.parser)
857+
status = tokenize_all_rows(self.parser, self.encoding_errors)
863858

864859
if self.parser.warn_msg != NULL:
865860
print(self.parser.warn_msg, file=sys.stderr)
@@ -1201,7 +1196,7 @@ cdef class TextReader:
12011196

12021197
if path == UTF8:
12031198
return _string_box_utf8(self.parser, i, start, end, na_filter,
1204-
na_hashset)
1199+
na_hashset, self.encoding_errors)
12051200
elif path == ENCODED:
12061201
return _string_box_decode(self.parser, i, start, end,
12071202
na_filter, na_hashset, self.c_encoding)
@@ -1352,7 +1347,8 @@ cdef inline StringPath _string_path(char *encoding):
13521347

13531348
cdef _string_box_utf8(parser_t *parser, int64_t col,
13541349
int64_t line_start, int64_t line_end,
1355-
bint na_filter, kh_str_starts_t *na_hashset):
1350+
bint na_filter, kh_str_starts_t *na_hashset,
1351+
const char *encoding_errors):
13561352
cdef:
13571353
int error, na_count = 0
13581354
Py_ssize_t i, lines
@@ -1391,7 +1387,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
13911387
pyval = <object>table.vals[k]
13921388
else:
13931389
# box it. new ref?
1394-
pyval = PyUnicode_FromString(word)
1390+
pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)
13951391

13961392
k = kh_put_strbox(table, word, &ret)
13971393
table.vals[k] = <PyObject *>pyval

pandas/_libs/src/parser/io.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
163163
}
164164

165165
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
166-
int *status) {
166+
int *status, const char *encoding_errors) {
167167
PyGILState_STATE state;
168168
PyObject *result, *func, *args, *tmp;
169169

@@ -191,7 +191,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
191191
*status = CALLING_READ_FAILED;
192192
return NULL;
193193
} else if (!PyBytes_Check(result)) {
194-
tmp = PyUnicode_AsUTF8String(result);
194+
tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
195195
Py_DECREF(result);
196196
if (tmp == NULL) {
197197
PyGILState_Release(state);

pandas/_libs/src/parser/io.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,6 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read,
6464
int *status);
6565

6666
void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
67-
int *status);
67+
int *status, const char *encoding_errors);
6868

6969
#endif // PANDAS__LIBS_SRC_PARSER_IO_H_

pandas/_libs/src/parser/tokenizer.c

+14-8
Original file line numberDiff line numberDiff line change
@@ -553,13 +553,16 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
553553
return 0;
554554
}
555555

556-
static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
556+
static int parser_buffer_bytes(parser_t *self, size_t nbytes,
557+
const char *encoding_errors) {
557558
int status;
558559
size_t bytes_read;
559560

560561
status = 0;
561562
self->datapos = 0;
562-
self->data = self->cb_io(self->source, nbytes, &bytes_read, &status);
563+
self->data = self->cb_io(
564+
self->source, nbytes, &bytes_read, &status, encoding_errors
565+
);
563566
TRACE((
564567
"parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
565568
nbytes, bytes_read, status));
@@ -1334,7 +1337,8 @@ int parser_trim_buffers(parser_t *self) {
13341337
all : tokenize all the data vs. certain number of rows
13351338
*/
13361339

1337-
int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1340+
int _tokenize_helper(parser_t *self, size_t nrows, int all,
1341+
const char *encoding_errors) {
13381342
int status = 0;
13391343
uint64_t start_lines = self->lines;
13401344

@@ -1350,7 +1354,9 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
13501354
if (!all && self->lines - start_lines >= nrows) break;
13511355

13521356
if (self->datapos == self->datalen) {
1353-
status = parser_buffer_bytes(self, self->chunksize);
1357+
status = parser_buffer_bytes(
1358+
self, self->chunksize, encoding_errors
1359+
);
13541360

13551361
if (status == REACHED_EOF) {
13561362
// close out last line
@@ -1383,13 +1389,13 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
13831389
return status;
13841390
}
13851391

1386-
int tokenize_nrows(parser_t *self, size_t nrows) {
1387-
int status = _tokenize_helper(self, nrows, 0);
1392+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
1393+
int status = _tokenize_helper(self, nrows, 0, encoding_errors);
13881394
return status;
13891395
}
13901396

1391-
int tokenize_all_rows(parser_t *self) {
1392-
int status = _tokenize_helper(self, -1, 1);
1397+
int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
1398+
int status = _tokenize_helper(self, -1, 1, encoding_errors);
13931399
return status;
13941400
}
13951401

pandas/_libs/src/parser/tokenizer.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ typedef enum {
8585
} QuoteStyle;
8686

8787
typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
88-
int *status);
88+
int *status, const char *encoding_errors);
8989
typedef int (*io_cleanup)(void *src);
9090

9191
typedef struct parser_t {
@@ -196,9 +196,9 @@ void parser_del(parser_t *self);
196196

197197
void parser_set_default_options(parser_t *self);
198198

199-
int tokenize_nrows(parser_t *self, size_t nrows);
199+
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
200200

201-
int tokenize_all_rows(parser_t *self);
201+
int tokenize_all_rows(parser_t *self, const char *encoding_errors);
202202

203203
// Have parsed / type-converted a chunk of data
204204
// and want to free memory from the token stream

pandas/io/common.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ def get_handle(
583583
Returns the dataclass IOHandles
584584
"""
585585
# Windows does not default to utf-8. Set to utf-8 for a consistent behavior
586-
encoding_passed, encoding = encoding, encoding or "utf-8"
586+
encoding = encoding or "utf-8"
587587

588588
# read_csv does not know whether the buffer is opened in binary/text mode
589589
if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
@@ -670,9 +670,6 @@ def get_handle(
670670
# Check whether the filename is to be opened in binary mode.
671671
# Binary mode does not support 'encoding' and 'newline'.
672672
if ioargs.encoding and "b" not in ioargs.mode:
673-
if errors is None and encoding_passed is None:
674-
# ignore errors when no encoding is specified
675-
errors = "replace"
676673
# Encoding
677674
handle = open(
678675
handle,

pandas/io/json/_json.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ def read_json(
334334
precise_float: bool = False,
335335
date_unit=None,
336336
encoding=None,
337+
encoding_errors: Optional[str] = "strict",
337338
lines: bool = False,
338339
chunksize: Optional[int] = None,
339340
compression: CompressionOptions = "infer",
@@ -456,6 +457,12 @@ def read_json(
456457
encoding : str, default is 'utf-8'
457458
The encoding to use to decode py3 bytes.
458459
460+
encoding_errors : str, optional, default "strict"
461+
How encoding errors are treated. `List of possible values
462+
<https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
463+
464+
.. versionadded:: 1.3
465+
459466
lines : bool, default False
460467
Read the file as a json object per line.
461468
@@ -584,6 +591,7 @@ def read_json(
584591
compression=compression,
585592
nrows=nrows,
586593
storage_options=storage_options,
594+
encoding_errors=encoding_errors,
587595
)
588596

589597
if chunksize:
@@ -620,6 +628,7 @@ def __init__(
620628
compression: CompressionOptions,
621629
nrows: Optional[int],
622630
storage_options: StorageOptions = None,
631+
encoding_errors: Optional[str] = "strict",
623632
):
624633

625634
self.orient = orient
@@ -638,6 +647,7 @@ def __init__(
638647
self.chunksize = chunksize
639648
self.nrows_seen = 0
640649
self.nrows = nrows
650+
self.encoding_errors = encoding_errors
641651
self.handles: Optional[IOHandles] = None
642652

643653
if self.chunksize is not None:
@@ -661,8 +671,8 @@ def _preprocess_data(self, data):
661671
Otherwise, we read it into memory for the `read` method.
662672
"""
663673
if hasattr(data, "read") and not (self.chunksize or self.nrows):
664-
data = data.read()
665-
self.close()
674+
with self:
675+
data = data.read()
666676
if not hasattr(data, "read") and (self.chunksize or self.nrows):
667677
data = StringIO(data)
668678

@@ -692,6 +702,7 @@ def _get_data_from_filepath(self, filepath_or_buffer):
692702
encoding=self.encoding,
693703
compression=self.compression,
694704
storage_options=self.storage_options,
705+
errors=self.encoding_errors,
695706
)
696707
filepath_or_buffer = self.handles.handle
697708

pandas/io/parsers/base_parser.py

+2
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@
109109
"mangle_dupe_cols": True,
110110
"infer_datetime_format": False,
111111
"skip_blank_lines": True,
112+
"encoding_errors": "strict",
112113
}
113114

114115

@@ -212,6 +213,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
212213
compression=kwds.get("compression", None),
213214
memory_map=kwds.get("memory_map", False),
214215
storage_options=kwds.get("storage_options", None),
216+
errors=kwds.get("encoding_errors", "strict"),
215217
)
216218

217219
def _validate_parse_dates_presence(self, columns: List[str]) -> None:

pandas/io/parsers/readers.py

+15
Original file line numberDiff line numberDiff line change
@@ -296,11 +296,24 @@
296296
Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
297297
standard encodings
298298
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
299+
299300
.. versionchanged:: 1.2
300301
301302
When ``encoding`` is ``None``, ``errors="replace"`` is passed to
302303
``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
303304
This behavior was previously only the case for ``engine="python"``.
305+
306+
.. versionchanged:: 1.3
307+
308+
``encoding_errors`` is a new argument. ``encoding`` no longer has an
309+
influence on how encoding errors are handled.
310+
311+
encoding_errors : str, optional, default "strict"
312+
How encoding errors are treated. `List of possible values
313+
<https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
314+
315+
.. versionadded:: 1.3
316+
304317
dialect : str or csv.Dialect, optional
305318
If provided, this parameter will override values (default or not) for the
306319
following parameters: `delimiter`, `doublequote`, `escapechar`,
@@ -515,6 +528,7 @@ def read_csv(
515528
escapechar=None,
516529
comment=None,
517530
encoding=None,
531+
encoding_errors: Optional[str] = "strict",
518532
dialect=None,
519533
# Error Handling
520534
error_bad_lines=True,
@@ -599,6 +613,7 @@ def read_table(
599613
# Error Handling
600614
error_bad_lines=True,
601615
warn_bad_lines=True,
616+
encoding_errors: Optional[str] = "strict",
602617
# Internal
603618
delim_whitespace=False,
604619
low_memory=_c_parser_defaults["low_memory"],

0 commit comments

Comments
 (0)