Skip to content

Commit c021060

Browse files
committed
BUG: Expand encoding for C engine beyond utf-16
And by utf-16, we mean the literal string "utf-16". Closes pandas-dev#24130.
1 parent 40bff2f commit c021060

File tree

9 files changed

+59
-41
lines changed

9 files changed

+59
-41
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -941,6 +941,7 @@ I/O
941941
- :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`)
942942
- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by ``record_path`` would raise a ``TypeError`` (:issue:`30148`)
943943
- :func:`read_excel` now accepts binary data (:issue:`15914`)
944+
- Bug in :func:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`)
944945

945946
Plotting
946947
^^^^^^^^

pandas/_libs/parsers.pyx

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# See LICENSE for the license
33
import bz2
44
import gzip
5+
import io
56
import os
67
import sys
78
import time
@@ -637,11 +638,10 @@ cdef class TextReader:
637638
raise ValueError(f'Unrecognized compression type: '
638639
f'{self.compression}')
639640

640-
if b'utf-16' in (self.encoding or b''):
641-
# we need to read utf-16 through UTF8Recoder.
642-
# if source is utf-16, convert source to utf-8 by UTF8Recoder.
643-
source = icom.UTF8Recoder(source,
644-
self.encoding.decode('utf-8'))
641+
if self.encoding and isinstance(source, io.BufferedIOBase):
642+
source = io.TextIOWrapper(
643+
source, self.encoding.decode('utf-8'), newline='')
644+
645645
self.encoding = b'utf-8'
646646
self.c_encoding = <char*>self.encoding
647647

pandas/io/common.py

-23
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""Common IO api utilities"""
22

33
import bz2
4-
import codecs
54
from collections import abc
65
import gzip
76
from io import BufferedIOBase, BytesIO
@@ -12,7 +11,6 @@
1211
IO,
1312
Any,
1413
AnyStr,
15-
BinaryIO,
1614
Dict,
1715
List,
1816
Mapping,
@@ -538,24 +536,3 @@ def __next__(self) -> str:
538536
if newline == "":
539537
raise StopIteration
540538
return newline
541-
542-
543-
class UTF8Recoder(abc.Iterator):
544-
"""
545-
Iterator that reads an encoded stream and re-encodes the input to UTF-8
546-
"""
547-
548-
def __init__(self, f: BinaryIO, encoding: str):
549-
self.reader = codecs.getreader(encoding)(f)
550-
551-
def read(self, bytes: int = -1) -> bytes:
552-
return self.reader.read(bytes).encode("utf-8")
553-
554-
def readline(self) -> bytes:
555-
return self.reader.readline().encode("utf-8")
556-
557-
def __next__(self) -> bytes:
558-
return next(self.reader).encode("utf-8")
559-
560-
def close(self):
561-
self.reader.close()

pandas/io/parsers.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from collections import abc, defaultdict
66
import csv
77
import datetime
8-
from io import StringIO
8+
from io import StringIO, BufferedIOBase, TextIOWrapper
99
import re
1010
import sys
1111
from textwrap import fill
@@ -62,7 +62,6 @@
6262
from pandas.core.tools import datetimes as tools
6363

6464
from pandas.io.common import (
65-
UTF8Recoder,
6665
get_filepath_or_buffer,
6766
get_handle,
6867
infer_compression,
@@ -1868,12 +1867,18 @@ def __init__(self, src, **kwds):
18681867

18691868
ParserBase.__init__(self, kwds)
18701869

1871-
if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""):
1872-
# if source is utf-16 plain text, convert source to utf-8
1870+
encoding = kwds.get("encoding")
1871+
1872+
if kwds.get("compression") is None and encoding:
18731873
if isinstance(src, str):
18741874
src = open(src, "rb")
18751875
self.handles.append(src)
1876-
src = UTF8Recoder(src, kwds["encoding"])
1876+
1877+
# Handle the file object with universal line mode enabled.
1878+
# We will handle the newline character ourselves later on.
1879+
if isinstance(src, BufferedIOBase):
1880+
src = TextIOWrapper(src, encoding=encoding, newline="")
1881+
18771882
kwds["encoding"] = "utf-8"
18781883

18791884
# #2442

pandas/tests/io/parser/conftest.py

+19
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,22 @@ def c_parser_only(request):
8080
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
8181
def python_parser_only(request):
8282
return request.param
83+
84+
85+
_utf_values = [8, 16, 32]
86+
87+
_encoding_seps = ["", "-", "_"]
88+
_encoding_prefixes = ["utf", "UTF"]
89+
90+
_encoding_fmts = [f"{prefix}{sep}" + "{0}" for sep in _encoding_seps
91+
for prefix in _encoding_prefixes]
92+
93+
94+
@pytest.fixture(params=_utf_values)
95+
def utf_value(request):
96+
return request.param
97+
98+
99+
@pytest.fixture(params=_encoding_fmts)
100+
def encoding_fmt(request):
101+
return request.param
251 Bytes
Binary file not shown.
201 Bytes
Binary file not shown.

pandas/tests/io/parser/test_compression.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
123123
tm.assert_frame_equal(result, expected)
124124

125125

126-
def test_compression_utf16_encoding(all_parsers, csv_dir_path):
127-
# see gh-18071
126+
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
127+
# see gh-18071, gh-24130
128128
parser = all_parsers
129-
path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
129+
encoding = encoding_fmt.format(utf_value)
130+
path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
130131

131-
result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t")
132+
result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
132133
expected = pd.DataFrame(
133134
{
134135
"Country": ["Venezuela", "Venezuela"],

pandas/tests/io/parser/test_encoding.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from io import BytesIO
77
import os
8+
import tempfile
89

910
import numpy as np
1011
import pytest
@@ -119,14 +120,12 @@ def _encode_data_with_bom(_data):
119120
tm.assert_frame_equal(result, expected)
120121

121122

122-
@pytest.mark.parametrize("byte", [8, 16])
123-
@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
124-
def test_read_csv_utf_aliases(all_parsers, byte, fmt):
123+
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
125124
# see gh-13549
126125
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
127126
parser = all_parsers
128127

129-
encoding = fmt.format(byte)
128+
encoding = encoding_fmt.format(utf_value)
130129
data = "mb_num,multibyte\n4.8,test".encode(encoding)
131130

132131
result = parser.read_csv(BytesIO(data), encoding=encoding)
@@ -155,3 +154,19 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
155154
with open(fpath, mode="rb") as fb:
156155
result = parser.read_csv(fb, encoding=encoding)
157156
tm.assert_frame_equal(expected, result)
157+
158+
159+
@pytest.mark.parametrize("pass_encoding", [True, False])
160+
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
161+
# see gh-24130
162+
parser = all_parsers
163+
encoding = encoding_fmt.format(utf_value)
164+
165+
expected = DataFrame({"foo": ["bar"]})
166+
167+
with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f:
168+
f.write("foo\nbar")
169+
f.seek(0)
170+
171+
result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
172+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)