BUG: Expand encoding for C engine beyond utf-16 #30771


Merged
merged 1 commit on Jan 7, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -941,6 +941,7 @@ I/O
- :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`)
- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`)
- :func:`read_excel` now accepts binary data (:issue:`15914`)
- Bug in :meth:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`)

Plotting
^^^^^^^^
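A minimal usage sketch of what this whatsnew entry describes (the file names below are hypothetical): after this change the C engine accepts any UTF codec Python knows, not just the literal string "utf-16", and the same applies when the source is compressed.

```python
import pandas as pd

# Hypothetical files, shown only to illustrate the widened encoding support.
df_plain = pd.read_csv("data_utf32.csv", encoding="utf-32", engine="c")

# Compressed sources work too: the archive is decompressed first and the
# decoded text is then handed to the C tokenizer.
df_zipped = pd.read_csv(
    "data_utf32.zip", encoding="utf-32", compression="zip", engine="c", sep="\t"
)
```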
10 changes: 5 additions & 5 deletions pandas/_libs/parsers.pyx
@@ -2,6 +2,7 @@
# See LICENSE for the license
import bz2
import gzip
import io
import os
import sys
import time
@@ -637,11 +638,10 @@ cdef class TextReader:
raise ValueError(f'Unrecognized compression type: '
f'{self.compression}')

if b'utf-16' in (self.encoding or b''):
# we need to read utf-16 through UTF8Recoder.
# if source is utf-16, convert source to utf-8 by UTF8Recoder.
source = icom.UTF8Recoder(source,
self.encoding.decode('utf-8'))
if self.encoding and isinstance(source, io.BufferedIOBase):
source = io.TextIOWrapper(
source, self.encoding.decode('utf-8'), newline='')

self.encoding = b'utf-8'
self.c_encoding = <char*>self.encoding

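The change above swaps the utf-16-only UTF8Recoder path for a generic io.TextIOWrapper. A stand-alone sketch of what that wrapping does, using only the standard library (the variable names are illustrative, not the TextReader internals):

```python
import io

# Any buffered binary source in any codec...
raw = "Country\tVenezuela\n".encode("utf-32")
source = io.BytesIO(raw)

# ...is decoded on the fly. newline="" leaves line endings untranslated so the
# tokenizer can handle them itself, matching the newline='' argument above.
text = io.TextIOWrapper(source, encoding="utf-32", newline="")
print(text.read())
```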
35 changes: 1 addition & 34 deletions pandas/io/common.py
@@ -1,25 +1,13 @@
"""Common IO api utilities"""

import bz2
import codecs
from collections import abc
import gzip
from io import BufferedIOBase, BytesIO
import mmap
import os
import pathlib
from typing import (
IO,
Any,
AnyStr,
BinaryIO,
Dict,
List,
Mapping,
Optional,
Tuple,
Union,
)
from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union
from urllib.parse import ( # noqa
urlencode,
urljoin,
@@ -538,24 +526,3 @@ def __next__(self) -> str:
if newline == "":
raise StopIteration
return newline


class UTF8Recoder(abc.Iterator):
"""
Iterator that reads an encoded stream and re-encodes the input to UTF-8
"""

def __init__(self, f: BinaryIO, encoding: str):
self.reader = codecs.getreader(encoding)(f)

def read(self, bytes: int = -1) -> bytes:
return self.reader.read(bytes).encode("utf-8")

def readline(self) -> bytes:
return self.reader.readline().encode("utf-8")

def __next__(self) -> bytes:
return next(self.reader).encode("utf-8")

def close(self):
self.reader.close()
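For contrast, a minimal stdlib sketch of what the removed UTF8Recoder did: decode the stream with the caller's codec and hand UTF-8 bytes back to the C parser, one read at a time.

```python
import codecs
import io

binary = io.BytesIO("mb_num,multibyte\n4.8,test".encode("utf-16"))

# codecs.getreader returns a StreamReader class for the codec; wrapping the
# binary stream yields str, which the recoder re-encoded as UTF-8 bytes.
reader = codecs.getreader("utf-16")(binary)
print(reader.readline().encode("utf-8"))  # b'mb_num,multibyte\n'
```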
15 changes: 10 additions & 5 deletions pandas/io/parsers.py
@@ -5,7 +5,7 @@
from collections import abc, defaultdict
import csv
import datetime
from io import StringIO
from io import BufferedIOBase, StringIO, TextIOWrapper
import re
import sys
from textwrap import fill
@@ -62,7 +62,6 @@
from pandas.core.tools import datetimes as tools

from pandas.io.common import (
UTF8Recoder,
get_filepath_or_buffer,
get_handle,
infer_compression,
@@ -1868,12 +1867,18 @@ def __init__(self, src, **kwds):

ParserBase.__init__(self, kwds)

if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""):
# if source is utf-16 plain text, convert source to utf-8
encoding = kwds.get("encoding")

if kwds.get("compression") is None and encoding:
if isinstance(src, str):
src = open(src, "rb")
self.handles.append(src)
src = UTF8Recoder(src, kwds["encoding"])

# Handle the file object with universal line mode enabled.
# We will handle the newline character ourselves later on.
if isinstance(src, BufferedIOBase):
src = TextIOWrapper(src, encoding=encoding, newline="")

kwds["encoding"] = "utf-8"

# #2442
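An illustrative sketch of the branching added above (a simplified stand-in, not the pandas internals): a path is first opened in binary mode and tracked so it can be closed later, and any buffered binary source is then wrapped so the C engine always receives decoded text.

```python
import io

def _prepare_source(src, encoding):
    handles = []
    if isinstance(src, str):
        src = open(src, "rb")
        handles.append(src)  # closed by the caller once parsing is done
    if isinstance(src, io.BufferedIOBase):
        # newline="" defers newline handling to the parser itself.
        src = io.TextIOWrapper(src, encoding=encoding, newline="")
    return src, handles
```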
26 changes: 26 additions & 0 deletions pandas/tests/io/parser/conftest.py
@@ -80,3 +80,29 @@ def c_parser_only(request):
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
return request.param


_utf_values = [8, 16, 32]

_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

_encoding_fmts = [
f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes
]


@pytest.fixture(params=_utf_values)
def utf_value(request):
"""
Fixture for all possible integer values for a UTF encoding.
"""
return request.param


@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
"""
Fixture for all possible string formats of a UTF encoding.
"""
return request.param
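The two fixtures combine to cover every common spelling of a UTF codec name. Reproducing the parametrization outside pytest shows the strings the tests will receive:

```python
_utf_values = [8, 16, 32]
_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

_encoding_fmts = [
    f"{prefix}{sep}" + "{0}"
    for sep in _encoding_seps
    for prefix in _encoding_prefixes
]

# 6 formats x 3 values = 18 spellings: utf8, UTF8, utf-8, UTF-8, utf_8, UTF_8, ...
for fmt in _encoding_fmts:
    for value in _utf_values:
        print(fmt.format(value))
```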
Binary file added pandas/tests/io/parser/data/utf32_ex_small.zip
Binary file added pandas/tests/io/parser/data/utf8_ex_small.zip
9 changes: 5 additions & 4 deletions pandas/tests/io/parser/test_compression.py
@@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
tm.assert_frame_equal(result, expected)


def test_compression_utf16_encoding(all_parsers, csv_dir_path):
# see gh-18071
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
# see gh-18071, gh-24130
parser = all_parsers
path = os.path.join(csv_dir_path, "utf16_ex_small.zip")
encoding = encoding_fmt.format(utf_value)
path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")

result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t")
result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t")
expected = pd.DataFrame(
{
"Country": ["Venezuela", "Venezuela"],
23 changes: 19 additions & 4 deletions pandas/tests/io/parser/test_encoding.py
@@ -5,6 +5,7 @@

from io import BytesIO
import os
import tempfile
Member:
can the pytest builtin fixture or ensure_clean be used instead?

Member (Author):
I need the encoding to be parameterized in this test, which can't be done with ensure_clean at the moment (though it could be a useful enhancement as a follow-up).

The pytest builtin fixture has a similar issue (as an aside, if I had to choose between ensure_clean and the pytest fixture, I would generally go with our in-house one since it's a little more flexible to use).


import numpy as np
import pytest
@@ -119,14 +120,12 @@ def _encode_data_with_bom(_data):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("byte", [8, 16])
@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"])
def test_read_csv_utf_aliases(all_parsers, byte, fmt):
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
# see gh-13549
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
parser = all_parsers

encoding = fmt.format(byte)
encoding = encoding_fmt.format(utf_value)
data = "mb_num,multibyte\n4.8,test".encode(encoding)

result = parser.read_csv(BytesIO(data), encoding=encoding)
@@ -155,3 +154,19 @@ def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding):
with open(fpath, mode="rb") as fb:
result = parser.read_csv(fb, encoding=encoding)
tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
# see gh-24130
parser = all_parsers
encoding = encoding_fmt.format(utf_value)

expected = DataFrame({"foo": ["bar"]})

with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f:
f.write("foo\nbar")
f.seek(0)

result = parser.read_csv(f, encoding=encoding if pass_encoding else None)
tm.assert_frame_equal(result, expected)
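A sketch of the end-user pattern these tests exercise, with inline data instead of an on-disk file: a binary buffer in any UTF encoding can now be fed to the C engine as long as `encoding` is given.

```python
import io

import pandas as pd

payload = "foo\nbar".encode("utf-32")
result = pd.read_csv(io.BytesIO(payload), encoding="utf-32", engine="c")
print(result)
#    foo
# 0  bar
```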