@@ -1502,66 +1502,72 @@ def test_parse_trim_buffers(self):
1502
1502
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
1503
1503
# times it fails due to memory corruption, which causes the
1504
1504
# loaded DataFrame to differ from the expected one.
1505
- n_lines , chunksizes = 173 , range (57 , 90 )
1506
1505
1507
- # Create the expected output
1508
- expected_ = [(chunksize_ , "9999-9" , "9999-9" )
1509
- for chunksize_ in chunksizes
1510
- for _ in range ((n_lines + chunksize_ - 1 ) // chunksize_ )]
1511
- expected = pd .DataFrame (expected_ , columns = None , index = None )
1512
-
1513
- # Generate a large mixed-type CSV file on-the-fly (approx 272 KiB)
1506
+ # Generate a large mixed-type CSV file on-the-fly (one record is
1507
+ # approx 1.5KiB).
1514
1508
record_ = \
1515
- """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.""" \
1516
- """99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-""" \
1517
- """ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-""" \
1518
- """ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-""" \
1519
- """ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-""" \
1520
- """ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-""" \
1521
- """ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,""" \
1522
- """ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-""" \
1523
- """ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-""" \
1524
- """ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.""" \
1525
- """99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.""" \
1526
- """99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.""" \
1527
- """99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-""" \
1528
- """ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-""" \
1529
- """ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.""" \
1530
- """999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
1531
- """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-""" \
1532
- """ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-""" \
1533
- """ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-""" \
1534
- """ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-""" \
1535
- """ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-""" \
1536
- """ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.""" \
1537
- """99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.""" \
1538
- """99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9""" \
1539
- """.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
1540
- """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-""" \
1541
- """ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.""" \
1542
- """99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.""" \
1543
- """99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-""" \
1544
- """ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.""" \
1545
- """99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.""" \
1546
- """99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.""" \
1547
- """99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
1509
+ """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
1510
+ """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
1511
+ """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
1512
+ """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
1513
+ """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
1514
+ """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
1515
+ """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
1516
+ """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
1517
+ """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
1518
+ """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
1519
+ """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
1520
+ """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
1521
+ """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
1522
+ """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
1523
+ """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
1524
+ """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
1525
+ """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
1526
+ """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
1527
+ """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
1528
+ """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
1529
+ """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
1530
+ """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
1531
+ """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
1532
+ """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
1533
+ """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
1534
+ """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
1535
+ """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
1536
+ """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
1537
+ """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
1538
+
1539
+ # Set the number of lines so that a call to `parser_trim_buffers`
1540
+ # is triggered: a couple of full chunks and a relatively small
1541
+ # 'residual' chunk.
1542
+ chunksize , n_lines = 128 , 2 * 128 + 15
1548
1543
csv_data = "\n " .join ([record_ ] * n_lines ) + "\n "
1549
1544
1545
+ # We will use StringIO to load the CSV from this text buffer.
1546
+ # pd.read_csv() will iterate over the file in chunks and will
1547
+ # finally read a residual chunk of really small size.
1548
+
1549
+ # Create the expected output: manually create the dataframe
1550
+ # by splitting by comma and repeating the `n_lines` number
1551
+ # of times.
1552
+ row = tuple (val_ if val_ else float ("nan" )
1553
+ for val_ in record_ .split ("," ))
1554
+ expected_ = [row for _ in range (n_lines )]
1555
+ expected = pd .DataFrame (expected_ , dtype = object ,
1556
+ columns = None , index = None )
1557
+
1558
+ # Iterate over the CSV file in chunks of `chunksize` lines
1550
1559
output_ = []
1551
1560
try :
1552
- for chunksize_ in chunksizes :
1553
- iterator_ = self .read_csv (StringIO (csv_data ), header = None ,
1554
- dtype = object , chunksize = chunksize_ ,
1555
- na_filter = True )
1556
- for chunk_ in iterator_ :
1557
- output_ .append ((chunksize_ ,
1558
- chunk_ .iloc [0 , 0 ],
1559
- chunk_ .iloc [- 1 , 0 ]))
1561
+ iterator_ = self .read_csv (StringIO (csv_data ), header = None ,
1562
+ dtype = object , chunksize = chunksize )
1563
+ for chunk_ in iterator_ :
1564
+ output_ .append (chunk_ )
1560
1565
except ValueError :
1561
1566
# Ignore unsupported dtype=object by engine=python
1562
1567
# in this case output_ list is empty
1563
1568
pass
1564
1569
1570
+ # Check for data corruption if there is any output.
1565
1571
if output_ :
1566
- df = pd .DataFrame (output_ , columns = None , index = None )
1572
+ df = pd .concat (output_ , axis = 0 , ignore_index = True )
1567
1573
tm .assert_frame_equal (df , expected )
0 commit comments