
Commit 9b521f6

Improved the clarity and logic of the test
1 parent 629198d commit 9b521f6

1 file changed (+56, -50 lines)

pandas/io/tests/parser/common.py

@@ -1502,66 +1502,72 @@ def test_parse_trim_buffers(self):
         # `tokenizer.c`. Sometimes the test fails on `segfault`, other
         # times it fails due to memory corruption, which causes the
         # loaded DataFrame to differ from the expected one.
-        n_lines, chunksizes = 173, range(57, 90)
 
-        # Create the expected output
-        expected_ = [(chunksize_, "9999-9", "9999-9")
-                     for chunksize_ in chunksizes
-                     for _ in range((n_lines + chunksize_ - 1) // chunksize_)]
-        expected = pd.DataFrame(expected_, columns=None, index=None)
-
-        # Generate a large mixed-type CSV file on-the-fly (approx 272 KiB)
+        # Generate a large mixed-type CSV file on-the-fly (one record is
+        # approx 1.5 KiB).
         record_ = \
-            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.""" \
-            """99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-""" \
-            """ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-""" \
-            """ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-""" \
-            """ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-""" \
-            """ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-""" \
-            """ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,""" \
-            """ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-""" \
-            """ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-""" \
-            """ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.""" \
-            """99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.""" \
-            """99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.""" \
-            """99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-""" \
-            """ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-""" \
-            """ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.""" \
-            """999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
-            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-""" \
-            """ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-""" \
-            """ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-""" \
-            """ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-""" \
-            """ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-""" \
-            """ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.""" \
-            """99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.""" \
-            """99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9""" \
-            """.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
-            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-""" \
-            """ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.""" \
-            """99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.""" \
-            """99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-""" \
-            """ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.""" \
-            """99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.""" \
-            """99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.""" \
-            """99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
+            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
+            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
+            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
+            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
+            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
+            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
+            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
+            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
+            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
+            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
+            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
+            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
+            """,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.""" \
+            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
+            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
+            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
+            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
+            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
+            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
+            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
+            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
+            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
+            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
+            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
+            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
+            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
+            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
+            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+
+        # Set the number of lines so that a call to `parser_trim_buffers`
+        # is triggered: a couple of full chunks and a relatively small
+        # 'residual' chunk.
+        chunksize, n_lines = 128, 2 * 128 + 15
         csv_data = "\n".join([record_] * n_lines) + "\n"
 
+        # We will use StringIO to load the CSV from this text buffer.
+        # pd.read_csv() will iterate over the file in chunks and will
+        # finally read a residual chunk of really small size.
+
+        # Create the expected output: manually create the dataframe
+        # by splitting by comma and repeating the `n_lines` number
+        # of times.
+        row = tuple(val_ if val_ else float("nan")
+                    for val_ in record_.split(","))
+        expected_ = [row for _ in range(n_lines)]
+        expected = pd.DataFrame(expected_, dtype=object,
+                                columns=None, index=None)
+
+        # Iterate over the CSV file in chunks of `chunksize` lines
         output_ = []
         try:
-            for chunksize_ in chunksizes:
-                iterator_ = self.read_csv(StringIO(csv_data), header=None,
-                                          dtype=object, chunksize=chunksize_,
-                                          na_filter=True)
-                for chunk_ in iterator_:
-                    output_.append((chunksize_,
-                                    chunk_.iloc[0, 0],
-                                    chunk_.iloc[-1, 0]))
+            iterator_ = self.read_csv(StringIO(csv_data), header=None,
+                                      dtype=object, chunksize=chunksize)
+            for chunk_ in iterator_:
+                output_.append(chunk_)
         except ValueError:
             # Ignore unsupported dtype=object by engine=python
             # in this case output_ list is empty
             pass
 
+        # Check for data corruption if there is any output.
         if output_:
-            df = pd.DataFrame(output_, columns=None, index=None)
+            df = pd.concat(output_, axis=0, ignore_index=True)
             tm.assert_frame_equal(df, expected)
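For context, the pattern the revised test exercises can be reproduced outside the test suite with plain pandas. The snippet below is a minimal sketch, not the test itself: the record string and the chunk sizes are made-up, much smaller stand-ins, and it calls pd.read_csv directly instead of the suite's self.read_csv wrapper. The flow is the same, though: repeat one record n_lines times, read the buffer back in chunks of chunksize, stitch the chunks together, and compare against an expected frame built by splitting the record on commas.

from io import StringIO

import pandas as pd

# Short, made-up stand-in for the ~1.5 KiB record used by the real test.
record = "9999-9,99:99,,ZZ,-9.99,,9.99"

# A couple of full chunks plus a small residual chunk, mirroring the
# test's choice of `chunksize, n_lines = 128, 2 * 128 + 15`.
chunksize, n_lines = 4, 2 * 4 + 3
csv_data = "\n".join([record] * n_lines) + "\n"

# Expected frame: split the record on commas, turn empty fields into NaN,
# and repeat the resulting row `n_lines` times.
row = tuple(val if val else float("nan") for val in record.split(","))
expected = pd.DataFrame([row] * n_lines, dtype=object)

# Read the buffer back in chunks and stitch the pieces together.
chunks = [chunk for chunk in pd.read_csv(StringIO(csv_data), header=None,
                                         dtype=object, chunksize=chunksize)]
result = pd.concat(chunks, axis=0, ignore_index=True)

assert result.equals(expected)

Comparing the full concatenated frame, rather than only the first and last cell of each chunk as the old version did, means corruption anywhere in the re-assembled DataFrame will fail the check.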

0 commit comments
