@@ -20,13 +20,19 @@ from libc.string cimport (
20
20
import cython
21
21
from cython import Py_ssize_t
22
22
23
- from cpython.bytes cimport PyBytes_AsString
23
+ from cpython.bytes cimport (
24
+ PyBytes_AsString,
25
+ PyBytes_FromString,
26
+ )
24
27
from cpython.exc cimport (
25
28
PyErr_Fetch,
26
29
PyErr_Occurred,
27
30
)
28
31
from cpython.object cimport PyObject
29
- from cpython.ref cimport Py_XDECREF
32
+ from cpython.ref cimport (
33
+ Py_INCREF,
34
+ Py_XDECREF,
35
+ )
30
36
from cpython.unicode cimport (
31
37
PyUnicode_AsUTF8String,
32
38
PyUnicode_Decode,
@@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h":
143
149
enum : ERROR_OVERFLOW
144
150
145
151
ctypedef void * (* io_callback)(void * src, size_t nbytes, size_t * bytes_read,
146
- int * status)
152
+ int * status, const char * encoding_errors )
147
153
ctypedef int (* io_cleanup)(void * src)
148
154
149
155
ctypedef struct parser_t:
@@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h":
255
261
256
262
int parser_trim_buffers(parser_t * self )
257
263
258
- int tokenize_all_rows(parser_t * self ) nogil
259
- int tokenize_nrows(parser_t * self , size_t nrows) nogil
264
+ int tokenize_all_rows(parser_t * self , const char * encoding_errors ) nogil
265
+ int tokenize_nrows(parser_t * self , size_t nrows, const char * encoding_errors ) nogil
260
266
261
267
int64_t str_to_int64(char * p_item, int64_t int_min,
262
268
int64_t int_max, int * error, char tsep) nogil
@@ -293,7 +299,7 @@ cdef extern from "parser/io.h":
293
299
size_t * bytes_read, int * status)
294
300
295
301
void * buffer_rd_bytes(void * source, size_t nbytes,
296
- size_t * bytes_read, int * status)
302
+ size_t * bytes_read, int * status, const char * encoding_errors )
297
303
298
304
299
305
cdef class TextReader:
@@ -316,6 +322,7 @@ cdef class TextReader:
316
322
uint64_t parser_start
317
323
list clocks
318
324
char * c_encoding
325
+ const char * encoding_errors
319
326
kh_str_starts_t * false_set
320
327
kh_str_starts_t * true_set
321
328
@@ -370,10 +377,15 @@ cdef class TextReader:
370
377
bint verbose = False ,
371
378
bint mangle_dupe_cols = True ,
372
379
float_precision = None ,
373
- bint skip_blank_lines = True ):
380
+ bint skip_blank_lines = True ,
381
+ encoding_errors = b" strict" ):
374
382
375
383
# set encoding for native Python and C library
376
384
self .c_encoding = NULL
385
+ if isinstance (encoding_errors, str ):
386
+ encoding_errors = encoding_errors.encode(" utf-8" )
387
+ Py_INCREF(encoding_errors)
388
+ self .encoding_errors = PyBytes_AsString(encoding_errors)
377
389
378
390
self .parser = parser_new()
379
391
self .parser.chunksize = tokenize_chunksize
@@ -558,13 +570,7 @@ cdef class TextReader:
558
570
pass
559
571
560
572
def __dealloc__ (self ):
561
- parser_free(self .parser)
562
- if self .true_set:
563
- kh_destroy_str_starts(self .true_set)
564
- self .true_set = NULL
565
- if self .false_set:
566
- kh_destroy_str_starts(self .false_set)
567
- self .false_set = NULL
573
+ self .close()
568
574
parser_del(self .parser)
569
575
570
576
def close (self ):
@@ -632,7 +638,6 @@ cdef class TextReader:
632
638
char * word
633
639
object name, old_name
634
640
uint64_t hr, data_line = 0
635
- char * errors = " strict"
636
641
StringPath path = _string_path(self .c_encoding)
637
642
list header = []
638
643
set unnamed_cols = set ()
@@ -673,11 +678,8 @@ cdef class TextReader:
673
678
for i in range (field_count):
674
679
word = self .parser.words[start + i]
675
680
676
- if path == UTF8:
677
- name = PyUnicode_FromString(word)
678
- elif path == ENCODED:
679
- name = PyUnicode_Decode(word, strlen(word),
680
- self .c_encoding, errors)
681
+ name = PyUnicode_Decode(word, strlen(word),
682
+ self .c_encoding, self .encoding_errors)
681
683
682
684
# We use this later when collecting placeholder names.
683
685
old_name = name
@@ -831,7 +833,7 @@ cdef class TextReader:
831
833
int status
832
834
833
835
with nogil:
834
- status = tokenize_nrows(self .parser, nrows)
836
+ status = tokenize_nrows(self .parser, nrows, self .encoding_errors )
835
837
836
838
if self .parser.warn_msg != NULL :
837
839
print (self .parser.warn_msg, file = sys.stderr)
@@ -859,7 +861,7 @@ cdef class TextReader:
859
861
' the whole file' )
860
862
else :
861
863
with nogil:
862
- status = tokenize_all_rows(self .parser)
864
+ status = tokenize_all_rows(self .parser, self .encoding_errors )
863
865
864
866
if self .parser.warn_msg != NULL :
865
867
print (self .parser.warn_msg, file = sys.stderr)
@@ -1201,7 +1203,7 @@ cdef class TextReader:
1201
1203
1202
1204
if path == UTF8:
1203
1205
return _string_box_utf8(self .parser, i, start, end, na_filter,
1204
- na_hashset)
1206
+ na_hashset, self .encoding_errors )
1205
1207
elif path == ENCODED:
1206
1208
return _string_box_decode(self .parser, i, start, end,
1207
1209
na_filter, na_hashset, self .c_encoding)
@@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding):
1352
1354
1353
1355
cdef _string_box_utf8(parser_t * parser, int64_t col,
1354
1356
int64_t line_start, int64_t line_end,
1355
- bint na_filter, kh_str_starts_t * na_hashset):
1357
+ bint na_filter, kh_str_starts_t * na_hashset,
1358
+ const char * encoding_errors):
1356
1359
cdef:
1357
1360
int error, na_count = 0
1358
1361
Py_ssize_t i, lines
@@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
1391
1394
pyval = < object > table.vals[k]
1392
1395
else :
1393
1396
# box it. new ref?
1394
- pyval = PyUnicode_FromString (word)
1397
+ pyval = PyUnicode_Decode (word, strlen(word), " utf-8 " , encoding_errors )
1395
1398
1396
1399
k = kh_put_strbox(table, word, & ret)
1397
1400
table.vals[k] = < PyObject * > pyval
0 commit comments