Skip to content

Commit 47a596e

Browse files
authored
TST: de-xfail chunksize pyarrow tests (#56041)
1 parent 4ac5cf6 commit 47a596e

File tree

2 files changed

+105
-21
lines changed

2 files changed

+105
-21
lines changed

pandas/tests/io/parser/common/test_chunksize.py

+100-20
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,11 @@
1616
)
1717
import pandas._testing as tm
1818

19-
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
2019
pytestmark = pytest.mark.filterwarnings(
2120
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
2221
)
2322

2423

25-
@xfail_pyarrow # The 'chunksize' option is not supported
2624
@pytest.mark.parametrize("index_col", [0, "index"])
2725
def test_read_chunksize_with_index(all_parsers, index_col):
2826
parser = all_parsers
@@ -48,14 +46,20 @@ def test_read_chunksize_with_index(all_parsers, index_col):
4846
)
4947
expected = expected.set_index("index")
5048

49+
if parser.engine == "pyarrow":
50+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
51+
with pytest.raises(ValueError, match=msg):
52+
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
53+
list(reader)
54+
return
55+
5156
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
5257
chunks = list(reader)
5358
tm.assert_frame_equal(chunks[0], expected[:2])
5459
tm.assert_frame_equal(chunks[1], expected[2:4])
5560
tm.assert_frame_equal(chunks[2], expected[4:])
5661

5762

58-
@xfail_pyarrow # AssertionError: Regex pattern did not match
5963
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
6064
def test_read_chunksize_bad(all_parsers, chunksize):
6165
data = """index,A,B,C,D
@@ -68,13 +72,14 @@ def test_read_chunksize_bad(all_parsers, chunksize):
6872
"""
6973
parser = all_parsers
7074
msg = r"'chunksize' must be an integer >=1"
75+
if parser.engine == "pyarrow":
76+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
7177

7278
with pytest.raises(ValueError, match=msg):
7379
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
7480
pass
7581

7682

77-
@xfail_pyarrow # The 'nrows' option is not supported
7883
@pytest.mark.parametrize("chunksize", [2, 8])
7984
def test_read_chunksize_and_nrows(all_parsers, chunksize):
8085
# see gh-15755
@@ -89,12 +94,17 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize):
8994
parser = all_parsers
9095
kwargs = {"index_col": 0, "nrows": 5}
9196

97+
if parser.engine == "pyarrow":
98+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
99+
with pytest.raises(ValueError, match=msg):
100+
parser.read_csv(StringIO(data), **kwargs)
101+
return
102+
92103
expected = parser.read_csv(StringIO(data), **kwargs)
93104
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
94105
tm.assert_frame_equal(concat(reader), expected)
95106

96107

97-
@xfail_pyarrow # The 'chunksize' option is not supported
98108
def test_read_chunksize_and_nrows_changing_size(all_parsers):
99109
data = """index,A,B,C,D
100110
foo,2,3,4,5
@@ -107,6 +117,12 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
107117
parser = all_parsers
108118
kwargs = {"index_col": 0, "nrows": 5}
109119

120+
if parser.engine == "pyarrow":
121+
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
122+
with pytest.raises(ValueError, match=msg):
123+
parser.read_csv(StringIO(data), **kwargs)
124+
return
125+
110126
expected = parser.read_csv(StringIO(data), **kwargs)
111127
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
112128
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
@@ -116,7 +132,6 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
116132
reader.get_chunk(size=3)
117133

118134

119-
@xfail_pyarrow # The 'chunksize' option is not supported
120135
def test_get_chunk_passed_chunksize(all_parsers):
121136
parser = all_parsers
122137
data = """A,B,C
@@ -125,14 +140,20 @@ def test_get_chunk_passed_chunksize(all_parsers):
125140
7,8,9
126141
1,2,3"""
127142

143+
if parser.engine == "pyarrow":
144+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
145+
with pytest.raises(ValueError, match=msg):
146+
with parser.read_csv(StringIO(data), chunksize=2) as reader:
147+
reader.get_chunk()
148+
return
149+
128150
with parser.read_csv(StringIO(data), chunksize=2) as reader:
129151
result = reader.get_chunk()
130152

131153
expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
132154
tm.assert_frame_equal(result, expected)
133155

134156

135-
@xfail_pyarrow # The 'chunksize' option is not supported
136157
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
137158
def test_read_chunksize_compat(all_parsers, kwargs):
138159
# see gh-12185
@@ -146,17 +167,35 @@ def test_read_chunksize_compat(all_parsers, kwargs):
146167
"""
147168
parser = all_parsers
148169
result = parser.read_csv(StringIO(data), **kwargs)
170+
171+
if parser.engine == "pyarrow":
172+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
173+
with pytest.raises(ValueError, match=msg):
174+
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
175+
concat(reader)
176+
return
177+
149178
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
150-
tm.assert_frame_equal(concat(reader), result)
179+
via_reader = concat(reader)
180+
tm.assert_frame_equal(via_reader, result)
151181

152182

153-
@xfail_pyarrow # The 'chunksize' option is not supported
154183
def test_read_chunksize_jagged_names(all_parsers):
155184
# see gh-23509
156185
parser = all_parsers
157186
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])
158187

159188
expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
189+
190+
if parser.engine == "pyarrow":
191+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
192+
with pytest.raises(ValueError, match=msg):
193+
with parser.read_csv(
194+
StringIO(data), names=range(10), chunksize=4
195+
) as reader:
196+
concat(reader)
197+
return
198+
160199
with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
161200
result = concat(reader)
162201
tm.assert_frame_equal(result, expected)
@@ -194,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
194233
assert result.a.dtype == float
195234

196235

197-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
198236
def test_warn_if_chunks_have_mismatched_type(all_parsers):
199237
warning_type = None
200238
parser = all_parsers
@@ -212,17 +250,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
212250

213251
buf = StringIO(data)
214252

215-
df = parser.read_csv_check_warnings(
216-
warning_type,
217-
r"Columns \(0\) have mixed types. "
218-
"Specify dtype option on import or set low_memory=False.",
219-
buf,
220-
)
253+
if parser.engine == "pyarrow":
254+
df = parser.read_csv_check_warnings(
255+
DeprecationWarning,
256+
"Passing a BlockManager to DataFrame is deprecated",
257+
buf,
258+
check_stacklevel=False,
259+
)
260+
else:
261+
df = parser.read_csv_check_warnings(
262+
warning_type,
263+
r"Columns \(0\) have mixed types. "
264+
"Specify dtype option on import or set low_memory=False.",
265+
buf,
266+
)
221267

222268
assert df.a.dtype == object
223269

224270

225-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
226271
@pytest.mark.parametrize("iterator", [True, False])
227272
def test_empty_with_nrows_chunksize(all_parsers, iterator):
228273
# see gh-9535
@@ -232,6 +277,18 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
232277
nrows = 10
233278
data = StringIO("foo,bar\n")
234279

280+
if parser.engine == "pyarrow":
281+
msg = (
282+
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
283+
)
284+
with pytest.raises(ValueError, match=msg):
285+
if iterator:
286+
with parser.read_csv(data, chunksize=nrows) as reader:
287+
next(iter(reader))
288+
else:
289+
parser.read_csv(data, nrows=nrows)
290+
return
291+
235292
if iterator:
236293
with parser.read_csv(data, chunksize=nrows) as reader:
237294
result = next(iter(reader))
@@ -241,7 +298,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
241298
tm.assert_frame_equal(result, expected)
242299

243300

244-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
245301
def test_read_csv_memory_growth_chunksize(all_parsers):
246302
# see gh-24805
247303
#
@@ -254,12 +310,19 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
254310
for i in range(1000):
255311
f.write(str(i) + "\n")
256312

313+
if parser.engine == "pyarrow":
314+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
315+
with pytest.raises(ValueError, match=msg):
316+
with parser.read_csv(path, chunksize=20) as result:
317+
for _ in result:
318+
pass
319+
return
320+
257321
with parser.read_csv(path, chunksize=20) as result:
258322
for _ in result:
259323
pass
260324

261325

262-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
263326
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
264327
# GH#21211
265328
parser = all_parsers
@@ -268,6 +331,18 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
268331
9,10,11
269332
"""
270333

334+
if parser.engine == "pyarrow":
335+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
336+
with pytest.raises(ValueError, match=msg):
337+
parser.read_csv(
338+
StringIO(data),
339+
names=["a", "b"],
340+
chunksize=2,
341+
usecols=[0, 1],
342+
header=None,
343+
)
344+
return
345+
271346
result_chunks = parser.read_csv(
272347
StringIO(data),
273348
names=["a", "b"],
@@ -285,7 +360,6 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
285360
tm.assert_frame_equal(result, expected_frames[i])
286361

287362

288-
@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
289363
def test_chunksize_second_block_shorter(all_parsers):
290364
# GH#21211
291365
parser = all_parsers
@@ -295,6 +369,12 @@ def test_chunksize_second_block_shorter(all_parsers):
295369
9,10,11
296370
"""
297371

372+
if parser.engine == "pyarrow":
373+
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
374+
with pytest.raises(ValueError, match=msg):
375+
parser.read_csv(StringIO(data), chunksize=2)
376+
return
377+
298378
result_chunks = parser.read_csv(StringIO(data), chunksize=2)
299379

300380
expected_frames = [

pandas/tests/io/parser/conftest.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,18 @@ def read_csv_check_warnings(
3434
warn_msg: str,
3535
*args,
3636
raise_on_extra_warnings=True,
37+
check_stacklevel: bool = True,
3738
**kwargs,
3839
):
3940
# We need to check the stacklevel here instead of in the tests
4041
# since this is where read_csv is called and where the warning
4142
# should point to.
4243
kwargs = self.update_kwargs(kwargs)
4344
with tm.assert_produces_warning(
44-
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
45+
warn_type,
46+
match=warn_msg,
47+
raise_on_extra_warnings=raise_on_extra_warnings,
48+
check_stacklevel=check_stacklevel,
4549
):
4650
return read_csv(*args, **kwargs)
4751

0 commit comments

Comments
 (0)