BUG: Fix the error when reading the compressed UTF-16 file (#18091)

Licht-T · gfyoung · commit 0c4cc0d03e91 · 2017-11-04T13:41:15.000-07:00
(cherry picked from commit e0c9c6)
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
@@ -74,6 +74,9 @@ Indexing
 I/O
 ^^^
 
+- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
+- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
+
 Plotting
 ^^^^^^^^
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -374,6 +374,17 @@ cdef class TextReader:
                   float_precision=None,
                   skip_blank_lines=True):
 
+        # set encoding for native Python and C library
+        if encoding is not None:
+            if not isinstance(encoding, bytes):
+                encoding = encoding.encode('utf-8')
+            encoding = encoding.lower()
+            self.c_encoding = <char*> encoding
+        else:
+            self.c_encoding = NULL
+
+        self.encoding = encoding
+
         self.parser = parser_new()
         self.parser.chunksize = tokenize_chunksize
 
@@ -495,17 +506,6 @@ cdef class TextReader:
             self.parser.double_converter_nogil = NULL
             self.parser.double_converter_withgil = round_trip
 
-        # encoding
-        if encoding is not None:
-            if not isinstance(encoding, bytes):
-                encoding = encoding.encode('utf-8')
-            encoding = encoding.lower()
-            self.c_encoding = <char*> encoding
-        else:
-            self.c_encoding = NULL
-
-        self.encoding = encoding
-
         if isinstance(dtype, dict):
             dtype = {k: pandas_dtype(dtype[k])
                      for k in dtype}
@@ -684,6 +684,14 @@ cdef class TextReader:
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)
+
+            if b'utf-16' in (self.encoding or b''):
+                # we need to read utf-16 through UTF8Recoder.
+                # if source is utf-16, convert source to utf-8 by UTF8Recoder.
+                source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
+                self.encoding = b'utf-8'
+                self.c_encoding = <char*> self.encoding
+
             self.handle = source
 
         if isinstance(source, basestring):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1671,7 +1671,9 @@ def __init__(self, src, **kwds):
 
         ParserBase.__init__(self, kwds)
 
-        if 'utf-16' in (kwds.get('encoding') or ''):
+        if (kwds.get('compression') is None
+           and 'utf-16' in (kwds.get('encoding') or '')):
+            # if source is utf-16 plain text, convert source to utf-8
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
                 self.handles.append(src)
diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py
@@ -7,6 +7,7 @@
 
 import pytest
 
+import pandas as pd
 import pandas.util.testing as tm
 
 
@@ -157,6 +158,19 @@ def test_read_csv_infer_compression(self):
 
         inputs[3].close()
 
+    def test_read_csv_compressed_utf16_example(self):
+        # GH18071
+        path = tm.get_data_path('utf16_ex_small.zip')
+
+        result = self.read_csv(path, encoding='utf-16',
+                               compression='zip', sep='\t')
+        expected = pd.DataFrame({
+            u'Country': [u'Venezuela', u'Venezuela'],
+            u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
+        })
+
+        tm.assert_frame_equal(result, expected)
+
     def test_invalid_compression(self):
         msg = 'Unrecognized compression type: sfark'
         with tm.assert_raises_regex(ValueError, msg):
diff --git a/pandas/tests/io/parser/data/utf16_ex_small.zip b/pandas/tests/io/parser/data/utf16_ex_small.zip