BUG: Expand encoding for C engine beyond utf-16

gfyoung · gfyoung · commit c0210604ebbb · 2020-01-06T22:38:51.000-08:00
And by utf-16, we mean the literal string "utf-16". Closes pandas-dev#24130
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -941,6 +941,7 @@ I/O
 - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`)
 - Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`)
 - :func:`read_excel` now accepts binary data (:issue:`15914`)
+- Bug in :func:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -2,6 +2,7 @@
 # See LICENSE for the license
 import bz2
 import gzip
+import io
 import os
 import sys
 import time
@@ -637,11 +638,10 @@ cdef class TextReader:
                 raise ValueError(f'Unrecognized compression type: '
                                  f'{self.compression}')
 
-            if b'utf-16' in (self.encoding or b''):
-                # we need to read utf-16 through UTF8Recoder.
-                # if source is utf-16, convert source to utf-8 by UTF8Recoder.
-                source = icom.UTF8Recoder(source,
-                                          self.encoding.decode('utf-8'))
+            if self.encoding and isinstance(source, io.BufferedIOBase):
+                source = io.TextIOWrapper(
+                    source, self.encoding.decode('utf-8'), newline='')
+
                 self.encoding = b'utf-8'
                 self.c_encoding = <char*>self.encoding
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -1,7 +1,6 @@
 """Common IO api utilities"""
 
 import bz2
-import codecs
 from collections import abc
 import gzip
 from io import BufferedIOBase, BytesIO
@@ -12,7 +11,6 @@
     IO,
     Any,
     AnyStr,
-    BinaryIO,
     Dict,
     List,
     Mapping,
@@ -538,24 +536,3 @@ def __next__(self) -> str:
         if newline == "":
             raise StopIteration
         return newline
-
-
-class UTF8Recoder(abc.Iterator):
-    """
-    Iterator that reads an encoded stream and re-encodes the input to UTF-8
-    """
-
-    def __init__(self, f: BinaryIO, encoding: str):
-        self.reader = codecs.getreader(encoding)(f)
-
-    def read(self, bytes: int = -1) -> bytes:
-        return self.reader.read(bytes).encode("utf-8")
-
-    def readline(self) -> bytes:
-        return self.reader.readline().encode("utf-8")
-
-    def __next__(self) -> bytes:
-        return next(self.reader).encode("utf-8")
-
-    def close(self):
-        self.reader.close()
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -5,7 +5,7 @@
 from collections import abc, defaultdict
 import csv
 import datetime
-from io import StringIO
+from io import StringIO, BufferedIOBase, TextIOWrapper
 import re
 import sys
 from textwrap import fill
@@ -62,7 +62,6 @@
 from pandas.core.tools import datetimes as tools
 
 from pandas.io.common import (
-    UTF8Recoder,
     get_filepath_or_buffer,
     get_handle,
     infer_compression,
@@ -1868,12 +1867,18 @@ def __init__(self, src, **kwds):
 
         ParserBase.__init__(self, kwds)
 
-        if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""):
-            # if source is utf-16 plain text, convert source to utf-8
+        encoding = kwds.get("encoding")
+
+        if kwds.get("compression") is None and encoding:
             if isinstance(src, str):
                 src = open(src, "rb")
                 self.handles.append(src)
-            src = UTF8Recoder(src, kwds["encoding"])
+
+            # Wrap the binary handle with universal newlines mode enabled;
+            # newline="" leaves line endings untranslated so the parser handles them.
+            if isinstance(src, BufferedIOBase):
+                src = TextIOWrapper(src, encoding=encoding, newline="")
+
             kwds["encoding"] = "utf-8"
 
         # #2442
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
@@ -80,3 +80,22 @@ def c_parser_only(request):
 @pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
 def python_parser_only(request):
     return request.param
+
+
+_utf_values = [8, 16, 32]
+
+_encoding_seps = ["", "-", "_"]
+_encoding_prefixes = ["utf", "UTF"]
+
+_encoding_fmts = [f"{prefix}{sep}" + "{0}" for sep in _encoding_seps
+                  for prefix in _encoding_prefixes]
+
+
+@pytest.fixture(params=_utf_values)
+def utf_value(request):
+    return request.param
+
+
+@pytest.fixture(params=_encoding_fmts)
+def encoding_fmt(request):
+    return request.param
diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip
diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
@@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
     tm.assert_frame_equal(result, expected)
 
 
-def test_compression_utf16_encoding(all_parsers, csv_dir_path):
-    # see gh-18071
+def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
+    # see gh-18071, gh-24130
     parser = all_parsers
-    path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
+    encoding = encoding_fmt.format(utf_value)
+    path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
 
-    result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t")
+    result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
     expected = pd.DataFrame(
         {
             "Country": ["Venezuela", "Venezuela"],
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -5,6 +5,7 @@
 
 from io import BytesIO
 import os
+import tempfile
 
 import numpy as np
 import pytest
@@ -119,14 +120,12 @@ def _encode_data_with_bom(_data):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("byte", [8, 16])
-@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
-def test_read_csv_utf_aliases(all_parsers, byte, fmt):
+def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
     # see gh-13549
     expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
     parser = all_parsers
 
-    encoding = fmt.format(byte)
+    encoding = encoding_fmt.format(utf_value)
     data = "mb_num,multibyte\n4.8,test".encode(encoding)
 
     result = parser.read_csv(BytesIO(data), encoding=encoding)
@@ -155,3 +154,19 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
     with open(fpath, mode="rb") as fb:
         result = parser.read_csv(fb, encoding=encoding)
     tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("pass_encoding", [True, False])
+def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
+    # see gh-24130
+    parser = all_parsers
+    encoding = encoding_fmt.format(utf_value)
+
+    expected = DataFrame({"foo": ["bar"]})
+
+    with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f:
+        f.write("foo\nbar")
+        f.seek(0)
+
+        result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
+        tm.assert_frame_equal(result, expected)