Skip to content

Commit 973a2d8

Browse files
committed
BUG: Fix the error raised when reading a compressed UTF-16 file
1 parent b4375bd commit 973a2d8

File tree

2 files changed

+19
-12
lines changed

2 files changed

+19
-12
lines changed

pandas/_libs/parsers.pyx

+17 -11
Original file line number | Diff line number | Diff line change
@@ -374,6 +374,17 @@ cdef class TextReader:
374374
float_precision=None,
375375
skip_blank_lines=True):
376376

377+
# encoding
378+
if encoding is not None:
379+
if not isinstance(encoding, bytes):
380+
encoding = encoding.encode('utf-8')
381+
encoding = encoding.lower()
382+
self.c_encoding = <char*> encoding
383+
else:
384+
self.c_encoding = NULL
385+
386+
self.encoding = encoding
387+
377388
self.parser = parser_new()
378389
self.parser.chunksize = tokenize_chunksize
379390

@@ -495,17 +506,6 @@ cdef class TextReader:
495506
self.parser.double_converter_nogil = NULL
496507
self.parser.double_converter_withgil = round_trip
497508

498-
# encoding
499-
if encoding is not None:
500-
if not isinstance(encoding, bytes):
501-
encoding = encoding.encode('utf-8')
502-
encoding = encoding.lower()
503-
self.c_encoding = <char*> encoding
504-
else:
505-
self.c_encoding = NULL
506-
507-
self.encoding = encoding
508-
509509
if isinstance(dtype, dict):
510510
dtype = {k: pandas_dtype(dtype[k])
511511
for k in dtype}
@@ -684,6 +684,12 @@ cdef class TextReader:
684684
else:
685685
raise ValueError('Unrecognized compression type: %s' %
686686
self.compression)
687+
688+
if b'utf-16' in (self.encoding or b''):
689+
source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
690+
self.encoding = b'utf-8'
691+
self.c_encoding = <char*> self.encoding
692+
687693
self.handle = source
688694

689695
if isinstance(source, basestring):

pandas/io/parsers.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -1671,7 +1671,8 @@ def __init__(self, src, **kwds):
16711671

16721672
ParserBase.__init__(self, kwds)
16731673

1674-
if 'utf-16' in (kwds.get('encoding') or ''):
1674+
if kwds.get('compression') is None \
1675+
and 'utf-16' in (kwds.get('encoding') or ''):
16751676
if isinstance(src, compat.string_types):
16761677
src = open(src, 'rb')
16771678
self.handles.append(src)

0 commit comments

Comments
 (0)