Skip to content

Commit e99d20e

Browse files
committed
TST: clean up some parser test infrastructure
1 parent f32b44f commit e99d20e

File tree

1 file changed

+82
-113
lines changed

1 file changed

+82
-113
lines changed

pandas/io/tests/test_parsers.py

+82-113
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,51 @@ def setUp(self):
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
 
+    def construct_dataframe(self, num_rows):
+
+        df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
+        df['foo'] = 'foo'
+        df['bar'] = 'bar'
+        df['baz'] = 'baz'
+        df['date'] = pd.date_range('20000101 09:00:00',
+                                   periods=num_rows,
+                                   freq='s')
+        df['int'] = np.arange(num_rows, dtype='int64')
+        return df
+
+    def generate_multithread_dataframe(self, path, num_rows, num_tasks):
+
+        def reader(arg):
+            start, nrows = arg
+
+            if not start:
+                return pd.read_csv(path, index_col=0, header=0, nrows=nrows,
+                                   parse_dates=['date'])
+
+            return pd.read_csv(path,
+                               index_col=0,
+                               header=None,
+                               skiprows=int(start) + 1,
+                               nrows=nrows,
+                               parse_dates=[9])
+
+        tasks = [
+            (num_rows * i / num_tasks,
+             num_rows / num_tasks) for i in range(num_tasks)
+        ]
+
+        pool = ThreadPool(processes=num_tasks)
+
+        results = pool.map(reader, tasks)
+
+        header = results[0].columns
+        for r in results[1:]:
+            r.columns = header
+
+        final_dataframe = pd.concat(results)
+
+        return final_dataframe
+
     def test_converters_type_must_be_dict(self):
         with tm.assertRaisesRegexp(TypeError, 'Type converters.+'):
             self.read_csv(StringIO(self.data1), converters=0)
@@ -3361,8 +3406,43 @@ def test_variable_width_unicode(self):
         tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')),
                                                  header=None, encoding='utf8'))
 
+class CParserTests(ParserTests):
+    """ base class for CParser Testsing """
+
+    def test_buffer_overflow(self):
+        # GH9205
+        # test certain malformed input files that cause buffer overflows in
+        # tokenizer.c
+        malfw = "1\r1\r1\r 1\r 1\r"  # buffer overflow in words pointer
+        malfs = "1\r1\r1\r 1\r 1\r11\r"  # buffer overflow in stream pointer
+        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer
+        for malf in (malfw, malfs, malfl):
+            try:
+                df = self.read_table(StringIO(malf))
+            except Exception as cperr:
+                self.assertIn(
+                    'Buffer overflow caught - possible malformed input file.', str(cperr))
+
+    def test_buffer_rd_bytes(self):
+        # GH 12098
+        # src->buffer can be freed twice leading to a segfault if a corrupt
+        # gzip file is read with read_csv and the buffer is filled more than
+        # once before gzip throws an exception
+
+        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
+               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
+               '\xA6\x4D' + '\x55' * 267 + \
+               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
+               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
+        for i in range(100):
+            try:
+                _ = self.read_csv(StringIO(data),
+                                  compression='gzip',
+                                  delim_whitespace=True)
+            except Exception as e:
+                pass
 
-class TestCParserHighMemory(ParserTests, tm.TestCase):
+class TestCParserHighMemory(CParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
@@ -3653,39 +3733,6 @@ def test_fallback_to_python(self):
         with tm.assertRaisesRegexp(ValueError, 'does not support'):
             self.read_table(StringIO(data), engine='c', skip_footer=1)
 
-    def test_buffer_overflow(self):
-        # GH9205
-        # test certain malformed input files that cause buffer overflows in
-        # tokenizer.c
-        malfw = "1\r1\r1\r 1\r 1\r"  # buffer overflow in words pointer
-        malfs = "1\r1\r1\r 1\r 1\r11\r"  # buffer overflow in stream pointer
-        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer
-        for malf in (malfw, malfs, malfl):
-            try:
-                df = self.read_table(StringIO(malf))
-            except Exception as cperr:
-                self.assertIn(
-                    'Buffer overflow caught - possible malformed input file.', str(cperr))
-
-    def test_buffer_rd_bytes(self):
-        # GH 12098
-        # src->buffer can be freed twice leading to a segfault if a corrupt
-        # gzip file is read with read_csv and the buffer is filled more than
-        # once before gzip throws an exception
-
-        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
-               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
-               '\xA6\x4D' + '\x55' * 267 + \
-               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
-               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
-        for i in range(100):
-            try:
-                _ = self.read_csv(StringIO(data),
-                                  compression='gzip',
-                                  delim_whitespace=True)
-            except Exception as e:
-                pass
-
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
@@ -3706,7 +3753,7 @@ def test_single_char_leading_whitespace(self):
         tm.assert_frame_equal(result, expected)
 
 
-class TestCParserLowMemory(ParserTests, tm.TestCase):
+class TestCParserLowMemory(CParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
@@ -4213,39 +4260,6 @@ def test_raise_on_sep_with_delim_whitespace(self):
         with tm.assertRaisesRegexp(ValueError, 'you can only specify one'):
             self.read_table(StringIO(data), sep='\s', delim_whitespace=True)
 
-    def test_buffer_overflow(self):
-        # GH9205
-        # test certain malformed input files that cause buffer overflows in
-        # tokenizer.c
-        malfw = "1\r1\r1\r 1\r 1\r"  # buffer overflow in words pointer
-        malfs = "1\r1\r1\r 1\r 1\r11\r"  # buffer overflow in stream pointer
-        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer
-        for malf in (malfw, malfs, malfl):
-            try:
-                df = self.read_table(StringIO(malf))
-            except Exception as cperr:
-                self.assertIn(
-                    'Buffer overflow caught - possible malformed input file.', str(cperr))
-
-    def test_buffer_rd_bytes(self):
-        # GH 12098
-        # src->buffer can be freed twice leading to a segfault if a corrupt
-        # gzip file is read with read_csv and the buffer is filled more than
-        # once before gzip throws an exception
-
-        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
-               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
-               '\xA6\x4D' + '\x55' * 267 + \
-               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
-               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
-        for i in range(100):
-            try:
-                _ = self.read_csv(StringIO(data),
-                                  compression='gzip',
-                                  delim_whitespace=True)
-            except Exception as e:
-                pass
-
     def test_single_char_leading_whitespace(self):
         # GH 9710
         data = """\
@@ -4300,51 +4314,6 @@ def test_multithread_stringio_read_csv(self):
         for result in results:
             tm.assert_frame_equal(first_result, result)
 
-    def construct_dataframe(self, num_rows):
-
-        df = DataFrame(np.random.rand(num_rows, 5), columns=list('abcde'))
-        df['foo'] = 'foo'
-        df['bar'] = 'bar'
-        df['baz'] = 'baz'
-        df['date'] = pd.date_range('20000101 09:00:00',
-                                   periods=num_rows,
-                                   freq='s')
-        df['int'] = np.arange(num_rows, dtype='int64')
-        return df
-
-    def generate_multithread_dataframe(self, path, num_rows, num_tasks):
-
-        def reader(arg):
-            start, nrows = arg
-
-            if not start:
-                return pd.read_csv(path, index_col=0, header=0, nrows=nrows,
-                                   parse_dates=['date'])
-
-            return pd.read_csv(path,
-                               index_col=0,
-                               header=None,
-                               skiprows=int(start) + 1,
-                               nrows=nrows,
-                               parse_dates=[9])
-
-        tasks = [
-            (num_rows * i / num_tasks,
-             num_rows / num_tasks) for i in range(num_tasks)
-        ]
-
-        pool = ThreadPool(processes=num_tasks)
-
-        results = pool.map(reader, tasks)
-
-        header = results[0].columns
-        for r in results[1:]:
-            r.columns = header
-
-        final_dataframe = pd.concat(results)
-
-        return final_dataframe
-
     def test_multithread_path_multipart_read_csv(self):
         # GH 11786
         num_tasks = 4

0 commit comments

Comments
 (0)