
TST: de-xfail chunksize pyarrow tests #56041

Merged
merged 1 commit on Nov 18, 2023
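This PR drops the module-level `@xfail_pyarrow` markers from the chunksize tests and instead has each test assert the ValueError that the pyarrow engine raises for unsupported options. A minimal sketch of the pattern repeated throughout the diff below, assuming the pandas `all_parsers` fixture; the CSV payload and the non-pyarrow assertion are illustrative only:

    from io import StringIO

    import pytest


    def test_chunksize_pattern(all_parsers):
        parser = all_parsers
        data = "a,b\n1,2\n3,4\n5,6\n"  # illustrative payload

        if parser.engine == "pyarrow":
            # pyarrow rejects chunksize outright, so assert the error rather
            # than xfailing the whole test.
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                with parser.read_csv(StringIO(data), chunksize=2) as reader:
                    list(reader)
            return

        # The other engines keep exercising the original chunked-read checks.
        with parser.read_csv(StringIO(data), chunksize=2) as reader:
            assert len(list(reader)) == 2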
120 changes: 100 additions & 20 deletions pandas/tests/io/parser/common/test_chunksize.py
@@ -16,13 +16,11 @@
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
parser = all_parsers
@@ -48,14 +46,20 @@ def test_read_chunksize_with_index(all_parsers, index_col):
)
expected = expected.set_index("index")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
list(reader)
return

with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])


@xfail_pyarrow # AssertionError: Regex pattern did not match
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
data = """index,A,B,C,D
@@ -68,13 +72,14 @@ def test_read_chunksize_bad(all_parsers, chunksize):
"""
parser = all_parsers
msg = r"'chunksize' must be an integer >=1"
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
pass


@xfail_pyarrow # The 'nrows' option is not supported
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
# see gh-15755
@@ -89,12 +94,17 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), expected)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_and_nrows_changing_size(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
@@ -107,6 +117,12 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
@@ -116,7 +132,6 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
reader.get_chunk(size=3)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_get_chunk_passed_chunksize(all_parsers):
parser = all_parsers
data = """A,B,C
@@ -125,14 +140,20 @@ def test_get_chunk_passed_chunksize(all_parsers):
7,8,9
1,2,3"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2) as reader:
reader.get_chunk()
return

with parser.read_csv(StringIO(data), chunksize=2) as reader:
result = reader.get_chunk()

expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
# see gh-12185
@@ -146,17 +167,35 @@ def test_read_chunksize_compat(all_parsers, kwargs):
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), result)
via_reader = concat(reader)
tm.assert_frame_equal(via_reader, result)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_jagged_names(all_parsers):
# see gh-23509
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(
StringIO(data), names=range(10), chunksize=4
) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
@@ -194,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_warn_if_chunks_have_mismatched_type(all_parsers):
warning_type = None
parser = all_parsers
@@ -212,17 +250,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):

buf = StringIO(data)

df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)
if parser.engine == "pyarrow":
df = parser.read_csv_check_warnings(
DeprecationWarning,
"Passing a BlockManager to DataFrame is deprecated",
buf,
check_stacklevel=False,
)
else:
df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
@@ -232,6 +277,18 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
nrows = 10
data = StringIO("foo,bar\n")

if parser.engine == "pyarrow":
msg = (
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
next(iter(reader))
else:
parser.read_csv(data, nrows=nrows)
return

if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
result = next(iter(reader))
@@ -241,7 +298,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_read_csv_memory_growth_chunksize(all_parsers):
# see gh-24805
#
@@ -254,12 +310,19 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
for i in range(1000):
f.write(str(i) + "\n")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
return

with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -268,6 +331,18 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
return

result_chunks = parser.read_csv(
StringIO(data),
names=["a", "b"],
@@ -285,7 +360,6 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
tm.assert_frame_equal(result, expected_frames[i])


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -295,6 +369,12 @@ def test_chunksize_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=2)
return

result_chunks = parser.read_csv(StringIO(data), chunksize=2)

expected_frames = [
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/conftest.py
@@ -34,14 +34,18 @@ def read_csv_check_warnings(
warn_msg: str,
*args,
raise_on_extra_warnings=True,
check_stacklevel: bool = True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
warn_type,
match=warn_msg,
raise_on_extra_warnings=raise_on_extra_warnings,
check_stacklevel=check_stacklevel,
):
return read_csv(*args, **kwargs)

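The conftest.py change above adds a `check_stacklevel` keyword to `read_csv_check_warnings` and forwards it to `tm.assert_produces_warning`, so tests can relax the stacklevel check for warnings emitted from deeper in the call stack than `read_csv` itself, as in the `test_warn_if_chunks_have_mismatched_type` hunk. A hedged usage sketch; the helper name and buffer contents are illustrative, while the warning class and message are taken from the diff:

    from io import StringIO


    def _check_block_manager_warning(parser):
        # Illustrative helper, not part of the PR: with the pyarrow engine the
        # BlockManager DeprecationWarning is raised inside DataFrame construction
        # rather than at the read_csv call site, so the stacklevel check is
        # switched off.
        buf = StringIO("a,b\n1,2\n")
        return parser.read_csv_check_warnings(
            DeprecationWarning,
            "Passing a BlockManager to DataFrame is deprecated",
            buf,
            check_stacklevel=False,
        )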