Skip to content

TST: de-xfail pyarrow usecols tests #56045

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,17 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
raise ValueError(e)
return frame

def _validate_usecols(self, usecols):
if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols):
raise ValueError(
"The pyarrow engine does not allow 'usecols' to be integer "
"column positions. Pass a list of string column names instead."
)
elif callable(usecols):
raise ValueError(
"The pyarrow engine does not allow 'usecols' to be a callable."
)

def read(self) -> DataFrame:
"""
Reads the contents of a CSV file into a DataFrame and
Expand All @@ -233,12 +244,20 @@ def read(self) -> DataFrame:
pyarrow_csv = import_optional_dependency("pyarrow.csv")
self._get_pyarrow_options()

try:
convert_options = pyarrow_csv.ConvertOptions(**self.convert_options)
except TypeError:
include = self.convert_options.get("include_columns", None)
if include is not None:
self._validate_usecols(include)
raise

try:
table = pyarrow_csv.read_csv(
self.src,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
convert_options=convert_options,
)
except pa.ArrowInvalid as e:
raise ParserError(e) from e
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/io/parser/usecols/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")

# Expected error text when the pyarrow engine is given non-string ``usecols``.
# It is passed as the ``match=`` pattern to ``pytest.raises`` in the tests
# below, and mirrors the ValueError message raised by ``_validate_usecols`` in
# pandas/io/parsers/arrow_parser_wrapper.py, so the two must be kept in sync.
_msg_pyarrow_requires_names = (
"The pyarrow engine does not allow 'usecols' to be integer column "
"positions. Pass a list of string column names instead."
)


@xfail_pyarrow # TypeError: expected bytes, int found
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
# see gh-9755
Expand All @@ -35,6 +39,10 @@ def test_usecols_with_parse_dates(all_parsers, usecols):
"c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
}
expected = DataFrame(cols, columns=["c_d", "a"])
if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
return
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)

Expand Down
71 changes: 53 additions & 18 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
_msg_validate_usecols_names = (
"Usecols do not match columns, columns expected but not found: {0}"
)
# Expected error text when the pyarrow engine is given non-string ``usecols``.
# It is passed as the ``match=`` pattern to ``pytest.raises`` in the
# pyarrow-specific branches of the tests below, and mirrors the ValueError
# message raised by ``_validate_usecols`` in
# pandas/io/parsers/arrow_parser_wrapper.py, so the two must be kept in sync.
_msg_pyarrow_requires_names = (
"The pyarrow engine does not allow 'usecols' to be integer column "
"positions. Pass a list of string column names instead."
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
Expand Down Expand Up @@ -60,15 +64,16 @@ def test_usecols(all_parsers, usecols, request):
10,11,12"""
parser = all_parsers
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found")
request.applymarker(mark)
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return

result = parser.read_csv(StringIO(data), usecols=usecols)

expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"])
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # TypeError: expected bytes, int found
def test_usecols_with_names(all_parsers):
data = """\
a,b,c
Expand All @@ -78,6 +83,12 @@ def test_usecols_with_names(all_parsers):
10,11,12"""
parser = all_parsers
names = ["foo", "bar"]

if parser.engine == "pyarrow":
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)
return

result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0)

expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names)
Expand Down Expand Up @@ -131,7 +142,6 @@ def test_usecols_name_length_conflict(all_parsers):
10,11,12"""
parser = all_parsers
msg = "Number of passed names did not match number of header fields in the file"

with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1])

Expand Down Expand Up @@ -166,10 +176,13 @@ def test_usecols_index_col_false(all_parsers, data):
def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request):
# see gh-4201: test that index_col as integer reflects usecols
parser = all_parsers
if parser.engine == "pyarrow" and isinstance(usecols[0], int):
mark = pytest.mark.xfail(raises=TypeError, match="expected bytes, int found")
request.applymarker(mark)
data = "a,b,c,d\nA,a,1,one\nB,b,2,two"

if parser.engine == "pyarrow" and isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
return

expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b"))

result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col)
Expand Down Expand Up @@ -274,8 +287,9 @@ def test_usecols_with_integer_like_header(all_parsers, usecols, expected, reques
4000,5000,6000"""

if parser.engine == "pyarrow" and isinstance(usecols[0], int):
mark = pytest.mark.xfail(raises=TypeError, reason="expected bytes, int found")
request.applymarker(mark)
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), usecols=usecols)
return

result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)
Expand All @@ -302,7 +316,6 @@ def test_np_array_usecols(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # TypeError: 'function' object is not iterable
@pytest.mark.parametrize(
"usecols,expected",
[
Expand Down Expand Up @@ -331,6 +344,12 @@ def test_callable_usecols(all_parsers, usecols, expected):
3.568935038,7,False,a"""
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), usecols=usecols)
return

result = parser.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -447,19 +466,28 @@ def test_raises_on_usecols_names_mismatch(
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # TypeError: expected bytes, int found
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
names = ["A", "B", "C", "D"]
parser = all_parsers

if parser.engine == "pyarrow":
if isinstance(usecols[0], int):
with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
return
mark = pytest.mark.xfail(
reason="pyarrow.lib.ArrowKeyError: Column 'A' in include_columns "
"does not exist"
)
request.applymarker(mark)

result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols)
expected = DataFrame({"A": [1, 5], "C": [3, 7]})
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # TypeError: expected bytes, int found
@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
# GH#25623 & GH 41130; enforced in 2.0
Expand All @@ -468,7 +496,14 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
a,b
1,2
"""
with pytest.raises(ParserError, match="Defining usecols with out-of-bounds"):

err = ParserError
msg = "Defining usecols with out-of-bounds"
if parser.engine == "pyarrow":
err = ValueError
msg = _msg_pyarrow_requires_names

with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)


Expand All @@ -478,8 +513,8 @@ def test_usecols_additional_columns(all_parsers):
usecols = lambda header: header.strip() in ["a", "b", "c"]

if parser.engine == "pyarrow":
msg = "'function' object is not iterable"
with pytest.raises(TypeError, match=msg):
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
Expand All @@ -492,8 +527,8 @@ def test_usecols_additional_columns_integer_columns(all_parsers):
parser = all_parsers
usecols = lambda header: header.strip() in ["0", "1"]
if parser.engine == "pyarrow":
msg = "'function' object is not iterable"
with pytest.raises(TypeError, match=msg):
msg = "The pyarrow engine does not allow 'usecols' to be a callable"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
return
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
Expand Down