diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4adafe7c06450..270d81dda6b45 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -76,7 +76,7 @@ I/O ^^^ - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - +- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a5ce6c560d844..85857c158f96e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -374,6 +374,17 @@ cdef class TextReader: float_precision=None, skip_blank_lines=True): + # set encoding for native Python and C library + if encoding is not None: + if not isinstance(encoding, bytes): + encoding = encoding.encode('utf-8') + encoding = encoding.lower() + self.c_encoding = encoding + else: + self.c_encoding = NULL + + self.encoding = encoding + self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -495,17 +506,6 @@ cdef class TextReader: self.parser.double_converter_nogil = NULL self.parser.double_converter_withgil = round_trip - # encoding - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding - if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) for k in dtype} @@ -684,6 +684,14 @@ cdef class TextReader: else: raise ValueError('Unrecognized compression type: %s' % self.compression) + + if b'utf-16' in (self.encoding or b''): + # we need to read utf-16 through UTF8Recoder. + # if source is utf-16, convert source to utf-8 by UTF8Recoder. + source = com.UTF8Recoder(source, self.encoding.decode('utf-8')) + self.encoding = b'utf-8' + self.c_encoding = self.encoding + self.handle = source if isinstance(source, basestring): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1b6414ea974fa..7f3f5630e49f9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1671,7 +1671,9 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if 'utf-16' in (kwds.get('encoding') or ''): + if (kwds.get('compression') is None + and 'utf-16' in (kwds.get('encoding') or '')): + # if source is utf-16 plain text, convert source to utf-8 if isinstance(src, compat.string_types): src = open(src, 'rb') self.handles.append(src) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 797c12139656d..84db9d14eee07 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -7,6 +7,7 @@ import pytest +import pandas as pd import pandas.util.testing as tm @@ -157,6 +158,19 @@ def test_read_csv_infer_compression(self): inputs[3].close() + def test_read_csv_compressed_utf16_example(self): + # GH18071 + path = tm.get_data_path('utf16_ex_small.zip') + + result = self.read_csv(path, encoding='utf-16', + compression='zip', sep='\t') + expected = pd.DataFrame({ + u'Country': [u'Venezuela', u'Venezuela'], + u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.'] + }) + + tm.assert_frame_equal(result, expected) + def test_invalid_compression(self): msg = 'Unrecognized compression type: sfark' with tm.assert_raises_regex(ValueError, msg): diff --git a/pandas/tests/io/parser/data/utf16_ex_small.zip b/pandas/tests/io/parser/data/utf16_ex_small.zip new file mode 100644 index 0000000000000..b0560c1b1f6c4 Binary files /dev/null and b/pandas/tests/io/parser/data/utf16_ex_small.zip differ