Skip to content

Commit 4ac5cf6

Browse files
authored
TST: de-xfail pyarrow usecols tests (#56045)
1 parent 50171fc commit 4ac5cf6

File tree

3 files changed

+82
-20
lines changed

3 files changed

+82
-20
lines changed

pandas/io/parsers/arrow_parser_wrapper.py

+20-1
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,17 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
218218
raise ValueError(e)
219219
return frame
220220

221+
def _validate_usecols(self, usecols):
222+
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
223+
raise ValueError(
224+
"The pyarrow engine does not allow 'usecols' to be integer "
225+
"column positions. Pass a list of string column names instead."
226+
)
227+
elif callable(usecols):
228+
raise ValueError(
229+
"The pyarrow engine does not allow 'usecols' to be a callable."
230+
)
231+
221232
def read(self) -> DataFrame:
222233
"""
223234
Reads the contents of a CSV file into a DataFrame and
@@ -233,12 +244,20 @@ def read(self) -> DataFrame:
233244
pyarrow_csv = import_optional_dependency("pyarrow.csv")
234245
self._get_pyarrow_options()
235246

247+
try:
248+
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
249+
except TypeError:
250+
include = self.convert_options.get("include_columns", None)
251+
if include is not None:
252+
self._validate_usecols(include)
253+
raise
254+
236255
try:
237256
table = pyarrow_csv.read_csv(
238257
self.src,
239258
read_options=pyarrow_csv.ReadOptions(**self.read_options),
240259
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
241-
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
260+
convert_options=convert_options,
242261
)
243262
except pa.ArrowInvalid as e:
244263
raise ParserError(e) from e

pandas/tests/io/parser/usecols/test_parse_dates.py

+9-1
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,12 @@
1919
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2020
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
2121

22+
_msg_pyarrow_requires_names = (
23+
"The pyarrow engine does not allow 'usecols' to be integer column "
24+
"positions. Pass a list of string column names instead."
25+
)
26+
2227

23-
@xfail_pyarrow # TypeError: expected bytes, int found
2428
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
2529
def test_usecols_with_parse_dates(all_parsers, usecols):
2630
# see gh-9755
@@ -35,6 +39,10 @@ def test_usecols_with_parse_dates(all_parsers, usecols):
3539
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
3640
}
3741
expected = DataFrame(cols, columns=["c_d", "a"])
42+
if parser.engine == "pyarrow":
43+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
44+
parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
45+
return
3846
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
3947
tm.assert_frame_equal(result, expected)
4048

pandas/tests/io/parser/usecols/test_usecols_basic.py

+53-18
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,10 @@
2828
_msg_validate_usecols_names = (
2929
"Usecols do not match columns, columns expected but not found: {0}"
3030
)
31+
_msg_pyarrow_requires_names = (
32+
"The pyarrow engine does not allow 'usecols' to be integer column "
33+
"positions. Pass a list of string column names instead."
34+
)
3135

3236
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
3337
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@@ -60,15 +64,16 @@ def test_usecols(all_parsers, usecols, request):
6064
10,11,12"""
6165
parser = all_parsers
6266
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
63-
mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found")
64-
request.applymarker(mark)
67+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
68+
parser.read_csv(StringIO(data), usecols=usecols)
69+
return
70+
6571
result = parser.read_csv(StringIO(data), usecols=usecols)
6672

6773
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
6874
tm.assert_frame_equal(result, expected)
6975

7076

71-
@xfail_pyarrow # TypeError: expected bytes, int found
7277
def test_usecols_with_names(all_parsers):
7378
data = """\
7479
a,b,c
@@ -78,6 +83,12 @@ def test_usecols_with_names(all_parsers):
7883
10,11,12"""
7984
parser = all_parsers
8085
names = ["foo", "bar"]
86+
87+
if parser.engine == "pyarrow":
88+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
89+
parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
90+
return
91+
8192
result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
8293

8394
expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
@@ -131,7 +142,6 @@ def test_usecols_name_length_conflict(all_parsers):
131142
10,11,12"""
132143
parser = all_parsers
133144
msg = "Number of passed names did not match number of header fields in the file"
134-
135145
with pytest.raises(ValueError, match=msg):
136146
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])
137147

@@ -166,10 +176,13 @@ def test_usecols_index_col_false(all_parsers, data):
166176
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
167177
# see gh-4201: test that index_col as integer reflects usecols
168178
parser = all_parsers
169-
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
170-
mark = pytest.mark.xfail(raises=TypeError, match="expected bytes, int found")
171-
request.applymarker(mark)
172179
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"
180+
181+
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
182+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
183+
parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
184+
return
185+
173186
expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))
174187

175188
result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
@@ -274,8 +287,9 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected, reques
274287
4000,5000,6000"""
275288

276289
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
277-
mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found")
278-
request.applymarker(mark)
290+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
291+
parser.read_csv(StringIO(data), usecols=usecols)
292+
return
279293

280294
result = parser.read_csv(StringIO(data), usecols=usecols)
281295
tm.assert_frame_equal(result, expected)
@@ -302,7 +316,6 @@ def test_np_array_usecols(all_parsers):
302316
tm.assert_frame_equal(result, expected)
303317

304318

305-
@xfail_pyarrow # TypeError: 'function' object is not iterable
306319
@pytest.mark.parametrize(
307320
"usecols,expected",
308321
[
@@ -331,6 +344,12 @@ def test_callable_usecols(all_parsers, usecols, expected):
331344
3.568935038,7,False,a"""
332345
parser = all_parsers
333346

347+
if parser.engine == "pyarrow":
348+
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
349+
with pytest.raises(ValueError, match=msg):
350+
parser.read_csv(StringIO(data), usecols=usecols)
351+
return
352+
334353
result = parser.read_csv(StringIO(data), usecols=usecols)
335354
tm.assert_frame_equal(result, expected)
336355

@@ -447,19 +466,28 @@ def test_raises_on_usecols_names_mismatch(
447466
tm.assert_frame_equal(result, expected)
448467

449468

450-
@xfail_pyarrow # TypeError: expected bytes, int found
451469
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
452-
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
470+
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
453471
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
454472
names = ["A", "B", "C", "D"]
455473
parser = all_parsers
456474

475+
if parser.engine == "pyarrow":
476+
if isinstance(usecols[0], int):
477+
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
478+
parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
479+
return
480+
mark = pytest.mark.xfail(
481+
reason="pyarrow.lib.ArrowKeyError: Column 'A' in include_columns "
482+
"does not exist"
483+
)
484+
request.applymarker(mark)
485+
457486
result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
458487
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
459488
tm.assert_frame_equal(result, expected)
460489

461490

462-
@xfail_pyarrow # TypeError: expected bytes, int found
463491
@pytest.mark.parametrize("names", [None, ["a", "b"]])
464492
def test_usecols_indices_out_of_bounds(all_parsers, names):
465493
# GH#25623 & GH 41130; enforced in 2.0
@@ -468,7 +496,14 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
468496
a,b
469497
1,2
470498
"""
471-
with pytest.raises(ParserError, match="Defining usecols with out-of-bounds"):
499+
500+
err = ParserError
501+
msg = "Defining usecols with out-of-bounds"
502+
if parser.engine == "pyarrow":
503+
err = ValueError
504+
msg = _msg_pyarrow_requires_names
505+
506+
with pytest.raises(err, match=msg):
472507
parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
473508

474509

@@ -478,8 +513,8 @@ def test_usecols_additional_columns(all_parsers):
478513
usecols = lambda header: header.strip() in ["a", "b", "c"]
479514

480515
if parser.engine == "pyarrow":
481-
msg = "'function' object is not iterable"
482-
with pytest.raises(TypeError, match=msg):
516+
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
517+
with pytest.raises(ValueError, match=msg):
483518
parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
484519
return
485520
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
@@ -492,8 +527,8 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
492527
parser = all_parsers
493528
usecols = lambda header: header.strip() in ["0", "1"]
494529
if parser.engine == "pyarrow":
495-
msg = "'function' object is not iterable"
496-
with pytest.raises(TypeError, match=msg):
530+
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
531+
with pytest.raises(ValueError, match=msg):
497532
parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
498533
return
499534
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)

0 commit comments

Comments
 (0)