Skip to content

Commit 0c4cc0d

Browse files
Licht-T authored and gfyoung committed
BUG: Fix the error when reading the compressed UTF-16 file (#18091)
(cherry picked from commit e0c9c6)
1 parent 8137209 commit 0c4cc0d

File tree

5 files changed

+39
-12
lines changed

5 files changed

+39
-12
lines changed

doc/source/whatsnew/v0.21.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ Indexing
7474
I/O
7575
^^^
7676

77+
- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
78+
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
79+
7780
Plotting
7881
^^^^^^^^
7982

pandas/_libs/parsers.pyx

+19-11
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,17 @@ cdef class TextReader:
374374
float_precision=None,
375375
skip_blank_lines=True):
376376

377+
# set encoding for native Python and C library
378+
if encoding is not None:
379+
if not isinstance(encoding, bytes):
380+
encoding = encoding.encode('utf-8')
381+
encoding = encoding.lower()
382+
self.c_encoding = <char*> encoding
383+
else:
384+
self.c_encoding = NULL
385+
386+
self.encoding = encoding
387+
377388
self.parser = parser_new()
378389
self.parser.chunksize = tokenize_chunksize
379390

@@ -495,17 +506,6 @@ cdef class TextReader:
495506
self.parser.double_converter_nogil = NULL
496507
self.parser.double_converter_withgil = round_trip
497508

498-
# encoding
499-
if encoding is not None:
500-
if not isinstance(encoding, bytes):
501-
encoding = encoding.encode('utf-8')
502-
encoding = encoding.lower()
503-
self.c_encoding = <char*> encoding
504-
else:
505-
self.c_encoding = NULL
506-
507-
self.encoding = encoding
508-
509509
if isinstance(dtype, dict):
510510
dtype = {k: pandas_dtype(dtype[k])
511511
for k in dtype}
@@ -684,6 +684,14 @@ cdef class TextReader:
684684
else:
685685
raise ValueError('Unrecognized compression type: %s' %
686686
self.compression)
687+
688+
if b'utf-16' in (self.encoding or b''):
689+
# we need to read utf-16 through UTF8Recoder.
690+
# if source is utf-16, convert source to utf-8 by UTF8Recoder.
691+
source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
692+
self.encoding = b'utf-8'
693+
self.c_encoding = <char*> self.encoding
694+
687695
self.handle = source
688696

689697
if isinstance(source, basestring):

pandas/io/parsers.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1671,7 +1671,9 @@ def __init__(self, src, **kwds):
16711671

16721672
ParserBase.__init__(self, kwds)
16731673

1674-
if 'utf-16' in (kwds.get('encoding') or ''):
1674+
if (kwds.get('compression') is None
1675+
and 'utf-16' in (kwds.get('encoding') or '')):
1676+
# if source is utf-16 plain text, convert source to utf-8
16751677
if isinstance(src, compat.string_types):
16761678
src = open(src, 'rb')
16771679
self.handles.append(src)

pandas/tests/io/parser/compression.py

+14
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pytest
99

10+
import pandas as pd
1011
import pandas.util.testing as tm
1112

1213

@@ -157,6 +158,19 @@ def test_read_csv_infer_compression(self):
157158

158159
inputs[3].close()
159160

161+
def test_read_csv_compressed_utf16_example(self):
162+
# GH18071
163+
path = tm.get_data_path('utf16_ex_small.zip')
164+
165+
result = self.read_csv(path, encoding='utf-16',
166+
compression='zip', sep='\t')
167+
expected = pd.DataFrame({
168+
u'Country': [u'Venezuela', u'Venezuela'],
169+
u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
170+
})
171+
172+
tm.assert_frame_equal(result, expected)
173+
160174
def test_invalid_compression(self):
161175
msg = 'Unrecognized compression type: sfark'
162176
with tm.assert_raises_regex(ValueError, msg):
285 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)