@@ -374,6 +374,17 @@ cdef class TextReader:
374
374
float_precision = None ,
375
375
skip_blank_lines = True ):
376
376
377
+ # encoding
378
+ if encoding is not None :
379
+ if not isinstance (encoding, bytes):
380
+ encoding = encoding.encode(' utf-8' )
381
+ encoding = encoding.lower()
382
+ self .c_encoding = < char * > encoding
383
+ else :
384
+ self .c_encoding = NULL
385
+
386
+ self .encoding = encoding
387
+
377
388
self .parser = parser_new()
378
389
self .parser.chunksize = tokenize_chunksize
379
390
@@ -495,17 +506,6 @@ cdef class TextReader:
495
506
self .parser.double_converter_nogil = NULL
496
507
self .parser.double_converter_withgil = round_trip
497
508
498
- # encoding
499
- if encoding is not None :
500
- if not isinstance (encoding, bytes):
501
- encoding = encoding.encode(' utf-8' )
502
- encoding = encoding.lower()
503
- self .c_encoding = < char * > encoding
504
- else :
505
- self .c_encoding = NULL
506
-
507
- self .encoding = encoding
508
-
509
509
if isinstance (dtype, dict ):
510
510
dtype = {k: pandas_dtype(dtype[k])
511
511
for k in dtype}
@@ -684,6 +684,12 @@ cdef class TextReader:
684
684
else :
685
685
raise ValueError (' Unrecognized compression type: %s ' %
686
686
self .compression)
687
+
688
+ if b' utf-16' in (self .encoding or b' ' ):
689
+ source = com.UTF8Recoder(source, self .encoding.decode(' utf-8' ))
690
+ self .encoding = b' utf-8'
691
+ self .c_encoding = < char * > self .encoding
692
+
687
693
self .handle = source
688
694
689
695
if isinstance (source, basestring ):
0 commit comments