
Commit d59624e

Moved the test to 'c_parser_only'

1 parent 9b521f6

File tree

2 files changed: +70, -80 lines

pandas/io/tests/parser/c_parser_only.py

+70
@@ -381,3 +381,73 @@ def test_empty_header_read(count):
 
         for count in range(1, 101):
             test_empty_header_read(count)
+
+    def test_parse_trim_buffers(self):
+        # This test is part of a bugfix for issue #13703. It attempts
+        # to stress the system memory allocator, to cause it to move
+        # the stream buffer and either let the OS reclaim the region,
+        # or let other memory requests of the parser otherwise modify
+        # the contents of the memory space where it was formerly
+        # located. This test is designed to cause a `segfault` with
+        # unpatched `tokenizer.c`. Sometimes the test fails on a
+        # `segfault`, other times due to memory corruption, which
+        # makes the loaded DataFrame differ from the expected one.
+
+        # Generate a large mixed-type CSV file on-the-fly (one record
+        # is approx. 1.5KiB).
+        record_ = \
+            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
+            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
+            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
+            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
+            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
+            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
+            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
+            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
+            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
+            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
+            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
+            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
+            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
+            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
+            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
+            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
+            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
+            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
+            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
+            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
+            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
+            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
+            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
+            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
+            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
+            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
+            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
+            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
+            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+
+        # Set the number of lines so that a call to
+        # `parser_trim_buffers` is triggered: after a couple of full
+        # chunks are consumed, a relatively small 'residual' chunk
+        # causes reallocation within the parser.
+        chunksize, n_lines = 128, 2 * 128 + 15
+        csv_data = "\n".join([record_] * n_lines) + "\n"
+
+        # We will use StringIO to load the CSV from this text buffer.
+        # pd.read_csv() will iterate over the file in chunks and will
+        # finally read a residual chunk of very small size.
+
+        # Generate the expected output: manually create the DataFrame
+        # by splitting on commas and repeating the record `n_lines`
+        # times.
+        row = tuple(val_ if val_ else float("nan")
+                    for val_ in record_.split(","))
+        expected = pd.DataFrame([row for _ in range(n_lines)],
+                                dtype=object, columns=None, index=None)
+
+        # Iterate over the CSV file in chunks of `chunksize` lines.
+        chunks_ = self.read_csv(StringIO(csv_data), header=None,
+                                dtype=object, chunksize=chunksize)
+        result = pd.concat(chunks_, axis=0, ignore_index=True)
+
+        # Check for data corruption if there was no segfault.
+        tm.assert_frame_equal(result, expected)
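
For readers reproducing the failure outside the test suite, here is a minimal standalone sketch of the pattern the test exercises: a chunked read whose final chunk is a short 'residual' one, compared against a single-shot read of the same data. The one-line CSV record here is illustrative, not the 1.5KiB record from the commit, and the `tm` import path matches the pandas version of this era.

    from io import StringIO

    import pandas as pd
    import pandas.util.testing as tm  # pandas.testing in newer versions

    # Line count deliberately NOT a multiple of the chunk size, so the
    # parser finishes on a small residual chunk (the trim_buffers path).
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join(["AAA,9.99,,-9,ZZ-ZZZZ"] * n_lines) + "\n"

    # Chunked read with the C engine: the iterator yields DataFrames of
    # `chunksize` rows each, then one final 15-row chunk.
    chunks = pd.read_csv(StringIO(csv_data), header=None, dtype=object,
                         engine="c", chunksize=chunksize)
    result = pd.concat(chunks, axis=0, ignore_index=True)

    # A single-shot read must match row-for-row; any divergence would
    # point at buffer corruption in the chunked path.
    expected = pd.read_csv(StringIO(csv_data), header=None, dtype=object,
                           engine="c")
    tm.assert_frame_equal(result, expected)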

pandas/io/tests/parser/common.py

-80
@@ -1491,83 +1491,3 @@ def test_memory_map(self):
 
         out = self.read_csv(mmap_file, memory_map=True)
         tm.assert_frame_equal(out, expected)
-
-    def test_parse_trim_buffers(self):
-        # This test is part of a bugfix for issue #13703. It attempts
-        # to stress the system memory allocator, to cause it to move
-        # the stream buffer and either let the OS reclaim the region,
-        # or let other memory requests of the parser otherwise modify
-        # the contents of the memory space where it was formerly
-        # located. This test is designed to cause a `segfault` with
-        # unpatched `tokenizer.c`. Sometimes the test fails on a
-        # `segfault`, other times due to memory corruption, which
-        # makes the loaded DataFrame differ from the expected one.
-
-        # Generate a large mixed-type CSV file on-the-fly (one record
-        # is approx. 1.5KiB).
-        record_ = \
-            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
-            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
-            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
-            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
-            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
-            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
-            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
-            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
-            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
-            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
-            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
-            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
-            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
-            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
-            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
-            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
-            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
-            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
-            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
-            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
-            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
-            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
-            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
-            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
-            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
-            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
-            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
-            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
-            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
-
-        # Set the number of lines so that a call to
-        # `parser_trim_buffers` is triggered: a couple of full chunks
-        # and a relatively small 'residual' chunk.
-        chunksize, n_lines = 128, 2 * 128 + 15
-        csv_data = "\n".join([record_] * n_lines) + "\n"
-
-        # We will use StringIO to load the CSV from this text buffer.
-        # pd.read_csv() will iterate over the file in chunks and will
-        # finally read a residual chunk of very small size.
-
-        # Create the expected output: manually create the DataFrame
-        # by splitting on commas and repeating the record `n_lines`
-        # times.
-        row = tuple(val_ if val_ else float("nan")
-                    for val_ in record_.split(","))
-        expected_ = [row for _ in range(n_lines)]
-        expected = pd.DataFrame(expected_, dtype=object,
-                                columns=None, index=None)
-
-        # Iterate over the CSV file in chunks of `chunksize` lines.
-        output_ = []
-        try:
-            iterator_ = self.read_csv(StringIO(csv_data), header=None,
-                                      dtype=object, chunksize=chunksize)
-            for chunk_ in iterator_:
-                output_.append(chunk_)
-        except ValueError:
-            # Ignore the unsupported dtype=object with engine=python;
-            # in this case the output_ list stays empty.
-            pass
-
-        # Check for data corruption if there is any output.
-        if output_:
-            df = pd.concat(output_, axis=0, ignore_index=True)
-            tm.assert_frame_equal(df, expected)
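
The motivation for the move is visible in the deleted guard: under common.py the test ran for every engine, and because the python engine of this pandas version rejects dtype=object with a ValueError (as the removed comment notes), the test could end with an empty output list and assert nothing. In c_parser_only.py the C engine always accepts dtype=object, so the assertion runs unconditionally. A hedged sketch of the behavior difference the guard papered over (newer pandas supports dtype in both engines, so the except branch may never fire there):

    from io import StringIO

    import pandas as pd

    csv_data = "1,2,3\n4,5,6\n"

    for engine in ("c", "python"):
        try:
            # In the pandas version this commit targets, only the C
            # engine supported the dtype option; the python engine
            # raised ValueError.
            df = pd.read_csv(StringIO(csv_data), header=None,
                             dtype=object, engine=engine)
            print(engine, "->", list(df.dtypes))
        except ValueError as err:
            # With the old guard, this branch made the test a silent
            # no-op for the python engine.
            print(engine, "-> skipped:", err)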
