
Commit f6aec03

TST/REF: split test_common into multiple files (#38897)
1 parent 586b490

13 files changed, +2500 −2374 lines
@@ -0,0 +1,221 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO

import numpy as np
import pytest

from pandas.errors import DtypeWarning

from pandas import DataFrame, concat
import pandas._testing as tm


@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])


@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass


@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    # see gh-15755
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), expected)


def test_read_chunksize_and_nrows_changing_size(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0, "nrows": 5}

    expected = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
        tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
        tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            reader.get_chunk(size=3)


def test_get_chunk_passed_chunksize(all_parsers):
    parser = all_parsers
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""

    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result = reader.get_chunk()

    expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    # see gh-12185
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data), **kwargs)
    with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
        tm.assert_frame_equal(concat(reader), result)


def test_read_chunksize_jagged_names(all_parsers):
    # see gh-23509
    parser = all_parsers
    data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
    with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
        result = concat(reader)
    tm.assert_frame_equal(result, expected)


def test_chunk_begins_with_newline_whitespace(all_parsers):
    # see gh-10022
    parser = all_parsers
    data = "\n hello\nworld\n"

    result = parser.read_csv(StringIO(data), header=None)
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="GH38630, sometimes gives ResourceWarning", strict=False)
def test_chunks_have_consistent_numerical_type(all_parsers):
    parser = all_parsers
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    # Coercions should work without warnings.
    with tm.assert_produces_warning(None):
        result = parser.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers):
    warning_type = None
    parser = all_parsers
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["a", "b"] + integers)

    # see gh-3866: if chunks are different types and can't
    # be coerced using numerical types, then issue warning.
    if parser.engine == "c" and parser.low_memory:
        warning_type = DtypeWarning

    with tm.assert_produces_warning(warning_type):
        df = parser.read_csv(StringIO(data))
    assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    # see gh-9535
    parser = all_parsers
    expected = DataFrame(columns=["foo", "bar"])

    nrows = 10
    data = StringIO("foo,bar\n")

    if iterator:
        with parser.read_csv(data, chunksize=nrows) as reader:
            result = next(iter(reader))
    else:
        result = parser.read_csv(data, nrows=nrows)

    tm.assert_frame_equal(result, expected)


def test_read_csv_memory_growth_chunksize(all_parsers):
    # see gh-24805
    #
    # Let's just make sure that we don't crash
    # as we iteratively process all chunks.
    parser = all_parsers

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            for i in range(1000):
                f.write(str(i) + "\n")

        with parser.read_csv(path, chunksize=20) as result:
            for _ in result:
                pass
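
For context, the behavior this new test module exercises comes down to a small read_csv surface: passing chunksize returns a TextFileReader that can be used as a context manager, iterated over, or queried with get_chunk(). A minimal sketch follows; it uses plain pandas.read_csv with throwaway data rather than the test suite's all_parsers fixture, so the column names and sizes are illustrative only, not part of this commit.

from io import StringIO

import pandas as pd

data = "a,b\n1,2\n3,4\n5,6\n"

# chunksize turns the return value into a TextFileReader that yields
# DataFrames; using it as a context manager closes the underlying handle.
with pd.read_csv(StringIO(data), chunksize=2) as reader:
    chunks = list(reader)  # two frames: 2 rows, then 1 row

# get_chunk(size=...) pulls rows on demand, overriding the chunksize for
# that call; it raises StopIteration once the input is exhausted.
with pd.read_csv(StringIO(data), chunksize=2) as reader:
    first = reader.get_chunk(size=1)

Reassembling the pieces with pandas.concat(reader), as several tests above do, should round-trip to the same frame as a single read_csv call.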
