diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b4761c3bc6c5..c9815ae63baaa 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -941,6 +941,7 @@ I/O - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) - Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) - :func:`read_excel` now accepts binary data (:issue:`15914`) +- Bug in :func:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5122bb3d4e75b..3df2362f41f0f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,6 +2,7 @@ # See LICENSE for the license import bz2 import gzip +import io import os import sys import time @@ -637,11 +638,10 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if b'utf-16' in (self.encoding or b''): - # we need to read utf-16 through UTF8Recoder. - # if source is utf-16, convert source to utf-8 by UTF8Recoder. 
- source = icom.UTF8Recoder(source, - self.encoding.decode('utf-8')) + if self.encoding and isinstance(source, io.BufferedIOBase): + source = io.TextIOWrapper( + source, self.encoding.decode('utf-8'), newline='') + self.encoding = b'utf-8' self.c_encoding = self.encoding diff --git a/pandas/io/common.py b/pandas/io/common.py index 43cd7d81ae4cd..771a302d647ec 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,25 +1,13 @@ """Common IO api utilities""" import bz2 -import codecs from collections import abc import gzip from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import ( - IO, - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - Tuple, - Union, -) +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union from urllib.parse import ( # noqa urlencode, urljoin, @@ -538,24 +526,3 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline - - -class UTF8Recoder(abc.Iterator): - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ - - def __init__(self, f: BinaryIO, encoding: str): - self.reader = codecs.getreader(encoding)(f) - - def read(self, bytes: int = -1) -> bytes: - return self.reader.read(bytes).encode("utf-8") - - def readline(self) -> bytes: - return self.reader.readline().encode("utf-8") - - def __next__(self) -> bytes: - return next(self.reader).encode("utf-8") - - def close(self): - self.reader.close() diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 21e1ef98fc55c..b4eb2fb1411d0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,7 +5,7 @@ from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import BufferedIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -62,7 +62,6 @@ from pandas.core.tools import datetimes as tools from pandas.io.common import ( - UTF8Recoder, get_filepath_or_buffer, get_handle, 
infer_compression, @@ -1868,12 +1867,18 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): - # if source is utf-16 plain text, convert source to utf-8 + encoding = kwds.get("encoding") + + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds["encoding"]) + + # Handle the file object with universal line mode enabled. + # We will handle the newline character ourselves later on. + if isinstance(src, BufferedIOBase): + src = TextIOWrapper(src, encoding=encoding, newline="") + kwds["encoding"] = "utf-8" # #2442 diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index a87e1e796c194..1fe8951a2d5f0 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -80,3 +80,29 @@ def c_parser_only(request): @pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. 
+ """ + return request.param diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip new file mode 100644 index 0000000000000..9a6d5c08da9db Binary files /dev/null and b/pandas/tests/io/parser/data/utf32_ex_small.zip differ diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip new file mode 100644 index 0000000000000..a4c5440bdffa7 Binary files /dev/null and b/pandas/tests/io/parser/data/utf8_ex_small.zip differ diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index e2b6fdd3af2ff..dc03370daa1e2 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf16_encoding(all_parsers, csv_dir_path): - # see gh-18071 +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 parser = all_parsers - path = os.path.join(csv_dir_path, "utf16_ex_small.zip") + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") expected = pd.DataFrame( { "Country": ["Venezuela", "Venezuela"], diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 2540dd9b19fce..33abf4bb7d9ee 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -5,6 +5,7 @@ from io import BytesIO import os +import tempfile import numpy as np import pytest @@ -119,14 +120,12 @@ def _encode_data_with_bom(_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("byte", [8, 16]) 
-@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) -def test_read_csv_utf_aliases(all_parsers, byte, fmt): +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) parser = all_parsers - encoding = fmt.format(byte) + encoding = encoding_fmt.format(utf_value) data = "mb_num,multibyte\n4.8,test".encode(encoding) result = parser.read_csv(BytesIO(data), encoding=encoding) @@ -155,3 +154,19 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): with open(fpath, mode="rb") as fb: result = parser.read_csv(fb, encoding=encoding) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + expected = DataFrame({"foo": ["bar"]}) + + with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected)