Skip to content

ENH: Enable more Arrow CSV tests/features #51302

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ Other enhancements
- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`)
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
- Added support for the ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
- :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
- Added new escape mode "latex-math" to avoid escaping "$" in formatter (:issue:`50040`)

Expand Down Expand Up @@ -1287,6 +1288,7 @@ I/O
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
- Bug in :meth:`DataFrame.to_html` with ``na_rep`` set when the :class:`DataFrame` contains non-scalar data (:issue:`47103`)
- Bug in :func:`read_xml` where file-like objects failed when ``iterparse`` is used (:issue:`50641`)
- Bug in :func:`read_csv` when ``engine="pyarrow"`` where the ``encoding`` parameter was not handled correctly (:issue:`51302`)
- Bug in :func:`read_xml` where repeated elements were ignored when ``iterparse`` is used (:issue:`51183`)
- Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`)

Expand Down
10 changes: 9 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def _get_pyarrow_options(self) -> None:
"na_values": "null_values",
"escapechar": "escape_char",
"skip_blank_lines": "ignore_empty_lines",
"decimal": "decimal_point",
}
for pandas_name, pyarrow_name in mapping.items():
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
Expand All @@ -71,13 +72,20 @@ def _get_pyarrow_options(self) -> None:
for option_name, option_value in self.kwds.items()
if option_value is not None
and option_name
in ("include_columns", "null_values", "true_values", "false_values")
in (
"include_columns",
"null_values",
"true_values",
"false_values",
"decimal_point",
)
}
self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
if self.header is not None
else self.kwds["skiprows"],
"encoding": self.encoding,
}

def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
Expand Down
1 change: 0 additions & 1 deletion pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,6 @@ class _Fwf_Defaults(TypedDict):
"quoting",
"lineterminator",
"converters",
"decimal",
"iterator",
"dayfirst",
"verbose",
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/common/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow
@pytest.mark.parametrize(
"data,thousands,decimal",
[
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/io/parser/dtypes/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers):
tm.assert_frame_equal(actual, expected)


@xfail_pyarrow
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
Expand Down
27 changes: 16 additions & 11 deletions pandas/tests/io/parser/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
import pandas._testing as tm

skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@skip_pyarrow
def test_bytes_io_input(all_parsers):
encoding = "cp1255"
parser = all_parsers
Expand All @@ -44,7 +44,7 @@ def test_read_csv_unicode(all_parsers):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@xfail_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
Expand Down Expand Up @@ -73,15 +73,13 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_utf16_example(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
result = parser.read_csv(path, encoding="utf-16", sep="\t")
assert len(result) == 50


@skip_pyarrow
def test_unicode_encoding(all_parsers, csv_dir_path):
path = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
Expand All @@ -94,7 +92,6 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
assert got == expected


@skip_pyarrow
@pytest.mark.parametrize(
"data,kwargs,expected",
[
Expand All @@ -114,7 +111,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
),
],
)
def test_utf8_bom(all_parsers, data, kwargs, expected):
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
# see gh-4793
parser = all_parsers
bom = "\ufeff"
Expand All @@ -124,11 +121,20 @@ def _encode_data_with_bom(_data):
bom_data = (bom + _data).encode(utf8)
return BytesIO(bom_data)

if (
parser.engine == "pyarrow"
and data == "\n1"
and kwargs.get("skip_blank_lines", True)
):
# Manually xfail, since we don't have mechanism to xfail specific version
request.node.add_marker(
pytest.mark.xfail(reason="Pyarrow can't read blank lines")
)

result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
# see gh-13549
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
Expand All @@ -141,7 +147,6 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
"file_path,encoding",
[
Expand Down Expand Up @@ -226,7 +231,7 @@ def test_parse_encoded_special_characters(encoding):
tm.assert_frame_equal(result, expected)


@skip_pyarrow
@xfail_pyarrow
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
# GH40986
Expand All @@ -244,7 +249,7 @@ def test_encoding_memory_map(all_parsers, encoding):
tm.assert_frame_equal(df, expected)


@skip_pyarrow
@xfail_pyarrow
def test_chunk_splits_multibyte_char(all_parsers):
"""
Chunk splits a multibyte character with memory_map=True
Expand All @@ -264,7 +269,7 @@ def test_chunk_splits_multibyte_char(all_parsers):
tm.assert_frame_equal(dfr, df)


@skip_pyarrow
@xfail_pyarrow
def test_readcsv_memmap_utf8(all_parsers):
"""
GH 43787
Expand Down