Skip to content

Commit 430d038

Browse files
Backport PR #51302 on branch 2.0.x (ENH: Enable more Arrow CSV tests/features) (#51998)
Backport PR #51302: ENH: Enable more Arrow CSV tests/features Co-authored-by: Thomas Li <[email protected]>
1 parent b9dd4fa commit 430d038

File tree

6 files changed

+29
-15
lines changed

6 files changed

+29
-15
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ Other enhancements
270270
- :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`)
271271
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
272272
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
273+
- Added support for ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`)
273274
- :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`)
274275

275276
.. ---------------------------------------------------------------------------
@@ -1284,6 +1285,7 @@ I/O
12841285
- Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`)
12851286
- Bug in :meth:`DataFrame.to_html` with ``na_rep`` set when the :class:`DataFrame` contains non-scalar data (:issue:`47103`)
12861287
- Bug in :func:`read_xml` where file-like objects failed when iterparse is used (:issue:`50641`)
1288+
- Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`)
12871289
- Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`)
12881290
- Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`)
12891291

pandas/io/parsers/arrow_parser_wrapper.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def _get_pyarrow_options(self) -> None:
5050
"na_values": "null_values",
5151
"escapechar": "escape_char",
5252
"skip_blank_lines": "ignore_empty_lines",
53+
"decimal": "decimal_point",
5354
}
5455
for pandas_name, pyarrow_name in mapping.items():
5556
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
@@ -67,13 +68,20 @@ def _get_pyarrow_options(self) -> None:
6768
for option_name, option_value in self.kwds.items()
6869
if option_value is not None
6970
and option_name
70-
in ("include_columns", "null_values", "true_values", "false_values")
71+
in (
72+
"include_columns",
73+
"null_values",
74+
"true_values",
75+
"false_values",
76+
"decimal_point",
77+
)
7178
}
7279
self.read_options = {
7380
"autogenerate_column_names": self.header is None,
7481
"skip_rows": self.header
7582
if self.header is not None
7683
else self.kwds["skiprows"],
84+
"encoding": self.encoding,
7785
}
7886

7987
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:

pandas/io/parsers/readers.py

-1
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,6 @@
455455
"quoting",
456456
"lineterminator",
457457
"converters",
458-
"decimal",
459458
"iterator",
460459
"dayfirst",
461460
"verbose",

pandas/tests/io/parser/common/test_decimal.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
from pandas import DataFrame
1010
import pandas._testing as tm
1111

12-
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
12+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
1313

1414

15+
@xfail_pyarrow
1516
@pytest.mark.parametrize(
1617
"data,thousands,decimal",
1718
[

pandas/tests/io/parser/dtypes/test_categorical.py

-1
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers):
118118
tm.assert_frame_equal(actual, expected)
119119

120120

121-
@xfail_pyarrow
122121
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
123122
# see gh-10153
124123
pth = os.path.join(csv_dir_path, "utf16_ex.txt")

pandas/tests/io/parser/test_encoding.py

+16-11
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
import pandas._testing as tm
2121

2222
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
23+
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2324

2425

25-
@skip_pyarrow
2626
def test_bytes_io_input(all_parsers):
2727
encoding = "cp1255"
2828
parser = all_parsers
@@ -44,7 +44,7 @@ def test_read_csv_unicode(all_parsers):
4444
tm.assert_frame_equal(result, expected)
4545

4646

47-
@skip_pyarrow
47+
@xfail_pyarrow
4848
@pytest.mark.parametrize("sep", [",", "\t"])
4949
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
5050
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
@@ -73,15 +73,13 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding):
7373
tm.assert_frame_equal(result, expected)
7474

7575

76-
@skip_pyarrow
7776
def test_utf16_example(all_parsers, csv_dir_path):
7877
path = os.path.join(csv_dir_path, "utf16_ex.txt")
7978
parser = all_parsers
8079
result = parser.read_csv(path, encoding="utf-16", sep="\t")
8180
assert len(result) == 50
8281

8382

84-
@skip_pyarrow
8583
def test_unicode_encoding(all_parsers, csv_dir_path):
8684
path = os.path.join(csv_dir_path, "unicode_series.csv")
8785
parser = all_parsers
@@ -94,7 +92,6 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
9492
assert got == expected
9593

9694

97-
@skip_pyarrow
9895
@pytest.mark.parametrize(
9996
"data,kwargs,expected",
10097
[
@@ -114,7 +111,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path):
114111
),
115112
],
116113
)
117-
def test_utf8_bom(all_parsers, data, kwargs, expected):
114+
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
118115
# see gh-4793
119116
parser = all_parsers
120117
bom = "\ufeff"
@@ -124,11 +121,20 @@ def _encode_data_with_bom(_data):
124121
bom_data = (bom + _data).encode(utf8)
125122
return BytesIO(bom_data)
126123

124+
if (
125+
parser.engine == "pyarrow"
126+
and data == "\n1"
127+
and kwargs.get("skip_blank_lines", True)
128+
):
129+
# Manually xfail, since we don't have mechanism to xfail specific version
130+
request.node.add_marker(
131+
pytest.mark.xfail(reason="Pyarrow can't read blank lines")
132+
)
133+
127134
result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
128135
tm.assert_frame_equal(result, expected)
129136

130137

131-
@skip_pyarrow
132138
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
133139
# see gh-13549
134140
expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})
@@ -141,7 +147,6 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
141147
tm.assert_frame_equal(result, expected)
142148

143149

144-
@skip_pyarrow
145150
@pytest.mark.parametrize(
146151
"file_path,encoding",
147152
[
@@ -226,7 +231,7 @@ def test_parse_encoded_special_characters(encoding):
226231
tm.assert_frame_equal(result, expected)
227232

228233

229-
@skip_pyarrow
234+
@xfail_pyarrow
230235
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
231236
def test_encoding_memory_map(all_parsers, encoding):
232237
# GH40986
@@ -244,7 +249,7 @@ def test_encoding_memory_map(all_parsers, encoding):
244249
tm.assert_frame_equal(df, expected)
245250

246251

247-
@skip_pyarrow
252+
@xfail_pyarrow
248253
def test_chunk_splits_multibyte_char(all_parsers):
249254
"""
250255
Chunk splits a multibyte character with memory_map=True
@@ -264,7 +269,7 @@ def test_chunk_splits_multibyte_char(all_parsers):
264269
tm.assert_frame_equal(dfr, df)
265270

266271

267-
@skip_pyarrow
272+
@xfail_pyarrow
268273
def test_readcsv_memmap_utf8(all_parsers):
269274
"""
270275
GH 43787

0 commit comments

Comments (0)