@@ -143,7 +143,7 @@ cdef extern from "parser/tokenizer.h":
143
143
enum : ERROR_OVERFLOW
144
144
145
145
ctypedef void * (* io_callback)(void * src, size_t nbytes, size_t * bytes_read,
146
- int * status)
146
+ int * status, const char * encoding_errors )
147
147
ctypedef int (* io_cleanup)(void * src)
148
148
149
149
ctypedef struct parser_t:
@@ -255,8 +255,8 @@ cdef extern from "parser/tokenizer.h":
255
255
256
256
int parser_trim_buffers(parser_t * self )
257
257
258
- int tokenize_all_rows(parser_t * self ) nogil
259
- int tokenize_nrows(parser_t * self , size_t nrows) nogil
258
+ int tokenize_all_rows(parser_t * self , const char * encoding_errors ) nogil
259
+ int tokenize_nrows(parser_t * self , size_t nrows, const char * encoding_errors ) nogil
260
260
261
261
int64_t str_to_int64(char * p_item, int64_t int_min,
262
262
int64_t int_max, int * error, char tsep) nogil
@@ -293,7 +293,7 @@ cdef extern from "parser/io.h":
293
293
size_t * bytes_read, int * status)
294
294
295
295
void * buffer_rd_bytes(void * source, size_t nbytes,
296
- size_t * bytes_read, int * status)
296
+ size_t * bytes_read, int * status, const char * encoding_errors )
297
297
298
298
299
299
cdef class TextReader:
@@ -316,6 +316,7 @@ cdef class TextReader:
316
316
uint64_t parser_start
317
317
list clocks
318
318
char * c_encoding
319
+ const char * encoding_errors
319
320
kh_str_starts_t * false_set
320
321
kh_str_starts_t * true_set
321
322
@@ -370,10 +371,14 @@ cdef class TextReader:
370
371
bint verbose = False ,
371
372
bint mangle_dupe_cols = True ,
372
373
float_precision = None ,
373
- bint skip_blank_lines = True ):
374
+ bint skip_blank_lines = True ,
375
+ encoding_errors = b" strict" ):
374
376
375
377
# set encoding for native Python and C library
376
378
self .c_encoding = NULL
379
+ if not isinstance (encoding_errors, bytes):
380
+ encoding_errors = encoding_errors.encode(" utf-8" )
381
+ self .encoding_errors = < const char * > encoding_errors
377
382
378
383
self .parser = parser_new()
379
384
self .parser.chunksize = tokenize_chunksize
@@ -558,13 +563,7 @@ cdef class TextReader:
558
563
pass
559
564
560
565
def __dealloc__ (self ):
561
- parser_free(self .parser)
562
- if self .true_set:
563
- kh_destroy_str_starts(self .true_set)
564
- self .true_set = NULL
565
- if self .false_set:
566
- kh_destroy_str_starts(self .false_set)
567
- self .false_set = NULL
566
+ self .close()
568
567
parser_del(self .parser)
569
568
570
569
def close (self ):
@@ -632,7 +631,6 @@ cdef class TextReader:
632
631
char * word
633
632
object name, old_name
634
633
uint64_t hr, data_line = 0
635
- char * errors = " strict"
636
634
StringPath path = _string_path(self .c_encoding)
637
635
list header = []
638
636
set unnamed_cols = set ()
@@ -673,11 +671,8 @@ cdef class TextReader:
673
671
for i in range (field_count):
674
672
word = self .parser.words[start + i]
675
673
676
- if path == UTF8:
677
- name = PyUnicode_FromString(word)
678
- elif path == ENCODED:
679
- name = PyUnicode_Decode(word, strlen(word),
680
- self .c_encoding, errors)
674
+ name = PyUnicode_Decode(word, strlen(word),
675
+ self .c_encoding, self .encoding_errors)
681
676
682
677
# We use this later when collecting placeholder names.
683
678
old_name = name
@@ -831,7 +826,7 @@ cdef class TextReader:
831
826
int status
832
827
833
828
with nogil:
834
- status = tokenize_nrows(self .parser, nrows)
829
+ status = tokenize_nrows(self .parser, nrows, self .encoding_errors )
835
830
836
831
if self .parser.warn_msg != NULL :
837
832
print (self .parser.warn_msg, file = sys.stderr)
@@ -859,7 +854,7 @@ cdef class TextReader:
859
854
' the whole file' )
860
855
else :
861
856
with nogil:
862
- status = tokenize_all_rows(self .parser)
857
+ status = tokenize_all_rows(self .parser, self .encoding_errors )
863
858
864
859
if self .parser.warn_msg != NULL :
865
860
print (self .parser.warn_msg, file = sys.stderr)
@@ -1201,7 +1196,7 @@ cdef class TextReader:
1201
1196
1202
1197
if path == UTF8:
1203
1198
return _string_box_utf8(self .parser, i, start, end, na_filter,
1204
- na_hashset)
1199
+ na_hashset, self .encoding_errors )
1205
1200
elif path == ENCODED:
1206
1201
return _string_box_decode(self .parser, i, start, end,
1207
1202
na_filter, na_hashset, self .c_encoding)
@@ -1352,7 +1347,8 @@ cdef inline StringPath _string_path(char *encoding):
1352
1347
1353
1348
cdef _string_box_utf8(parser_t * parser, int64_t col,
1354
1349
int64_t line_start, int64_t line_end,
1355
- bint na_filter, kh_str_starts_t * na_hashset):
1350
+ bint na_filter, kh_str_starts_t * na_hashset,
1351
+ const char * encoding_errors):
1356
1352
cdef:
1357
1353
int error, na_count = 0
1358
1354
Py_ssize_t i, lines
@@ -1391,7 +1387,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
1391
1387
pyval = < object > table.vals[k]
1392
1388
else :
1393
1389
# box it. new ref?
1394
- pyval = PyUnicode_FromString (word)
1390
+ pyval = PyUnicode_Decode (word, strlen(word), " utf-8 " , encoding_errors )
1395
1391
1396
1392
k = kh_put_strbox(table, word, & ret)
1397
1393
table.vals[k] = < PyObject * > pyval
0 commit comments