263
263
Lines with too many fields (e.g. a csv line with too many commas) will by
264
264
default cause an exception to be raised, and no DataFrame will be returned.
265
265
If False, then these "bad lines" will be dropped from the DataFrame that is
266
- returned. (Only valid with C parser)
266
+ returned.
267
267
warn_bad_lines : boolean, default True
268
268
If error_bad_lines is False, and warn_bad_lines is True, a warning for each
269
- "bad line" will be output. (Only valid with C parser).
269
+ "bad line" will be output.
270
270
low_memory : boolean, default True
271
271
Internally process the file in chunks, resulting in lower memory use
272
272
while parsing, but possibly mixed type inference. To ensure no mixed
@@ -485,8 +485,6 @@ def _read(filepath_or_buffer, kwds):
485
485
_python_unsupported = set ([
486
486
'low_memory' ,
487
487
'buffer_lines' ,
488
- 'error_bad_lines' ,
489
- 'warn_bad_lines' ,
490
488
'float_precision' ,
491
489
])
492
490
_deprecated_args = set ([
@@ -1897,6 +1895,9 @@ def __init__(self, f, **kwds):
1897
1895
self .usecols , _ = _validate_usecols_arg (kwds ['usecols' ])
1898
1896
self .skip_blank_lines = kwds ['skip_blank_lines' ]
1899
1897
1898
+ self .warn_bad_lines = kwds ['warn_bad_lines' ]
1899
+ self .error_bad_lines = kwds ['error_bad_lines' ]
1900
+
1900
1901
self .names_passed = kwds ['names' ] or None
1901
1902
1902
1903
self .na_filter = kwds ['na_filter' ]
@@ -2469,16 +2470,19 @@ def _next_line(self):
2469
2470
next (self .data )
2470
2471
2471
2472
while True :
2472
- orig_line = self ._next_iter_line ()
2473
- line = self ._check_comments ([orig_line ])[0 ]
2473
+ orig_line = self ._next_iter_line (row_num = self .pos + 1 )
2474
2474
self .pos += 1
2475
- if (not self .skip_blank_lines and
2476
- (self ._empty (orig_line ) or line )):
2477
- break
2478
- elif self .skip_blank_lines :
2479
- ret = self ._check_empty ([line ])
2480
- if ret :
2481
- line = ret [0 ]
2475
+
2476
+ if orig_line is not None :
2477
+ line = self ._check_comments ([orig_line ])[0 ]
2478
+
2479
+ if self .skip_blank_lines :
2480
+ ret = self ._check_empty ([line ])
2481
+
2482
+ if ret :
2483
+ line = ret [0 ]
2484
+ break
2485
+ elif self ._empty (orig_line ) or line :
2482
2486
break
2483
2487
2484
2488
# This was the first line of the file,
@@ -2491,7 +2495,28 @@ def _next_line(self):
2491
2495
self .buf .append (line )
2492
2496
return line
2493
2497
2494
- def _next_iter_line (self , ** kwargs ):
2498
+ def _alert_malformed (self , msg , row_num ):
2499
+ """
2500
+ Alert a user about a malformed row.
2501
+
2502
+ If `self.error_bad_lines` is True, the alert will be `ParserError`.
2503
+ Otherwise, if `self.warn_bad_lines` is True, the alert will be written to stderr.
2504
+
2505
+ Parameters
2506
+ ----------
2507
+ msg : The error message to display.
2508
+ row_num : The row number where the parsing error occurred.
2509
+ Because this row number is displayed, we 1-index,
2510
+ even though we 0-index internally.
2511
+ """
2512
+
2513
+ if self .error_bad_lines :
2514
+ raise ParserError (msg )
2515
+ elif self .warn_bad_lines :
2516
+ base = 'Skipping line {row_num}: ' .format (row_num = row_num )
2517
+ sys .stderr .write (base + msg + '\n ' )
2518
+
2519
+ def _next_iter_line (self , row_num ):
2495
2520
"""
2496
2521
Wrapper around iterating through `self.data` (CSV source).
2497
2522
@@ -2501,32 +2526,34 @@ def _next_iter_line(self, **kwargs):
2501
2526
2502
2527
Parameters
2503
2528
----------
2504
- kwargs : Keyword arguments used to customize the error message .
2529
+ row_num : The row number of the line being parsed .
2505
2530
"""
2506
2531
2507
2532
try :
2508
2533
return next (self .data )
2509
2534
except csv .Error as e :
2510
- msg = str (e )
2511
-
2512
- if 'NULL byte' in msg :
2513
- msg = ('NULL byte detected. This byte '
2514
- 'cannot be processed in Python\' s '
2515
- 'native csv library at the moment, '
2516
- 'so please pass in engine=\' c\' instead' )
2517
- elif 'newline inside string' in msg :
2518
- msg = ('EOF inside string starting with '
2519
- 'line ' + str (kwargs ['row_num' ]))
2520
-
2521
- if self .skipfooter > 0 :
2522
- reason = ('Error could possibly be due to '
2523
- 'parsing errors in the skipped footer rows '
2524
- '(the skipfooter keyword is only applied '
2525
- 'after Python\' s csv library has parsed '
2526
- 'all rows).' )
2527
- msg += '. ' + reason
2528
-
2529
- raise csv .Error (msg )
2535
+ if self .warn_bad_lines or self .error_bad_lines :
2536
+ msg = str (e )
2537
+
2538
+ if 'NULL byte' in msg :
2539
+ msg = ('NULL byte detected. This byte '
2540
+ 'cannot be processed in Python\' s '
2541
+ 'native csv library at the moment, '
2542
+ 'so please pass in engine=\' c\' instead' )
2543
+ elif 'newline inside string' in msg :
2544
+ msg = ('EOF inside string starting with '
2545
+ 'line ' + str (row_num ))
2546
+
2547
+ if self .skipfooter > 0 :
2548
+ reason = ('Error could possibly be due to '
2549
+ 'parsing errors in the skipped footer rows '
2550
+ '(the skipfooter keyword is only applied '
2551
+ 'after Python\' s csv library has parsed '
2552
+ 'all rows).' )
2553
+ msg += '. ' + reason
2554
+
2555
+ self ._alert_malformed (msg , row_num )
2556
+ return None
2530
2557
2531
2558
def _check_comments (self , lines ):
2532
2559
if self .comment is None :
@@ -2657,42 +2684,57 @@ def _get_index_name(self, columns):
2657
2684
return index_name , orig_names , columns
2658
2685
2659
2686
def _rows_to_cols (self , content ):
2687
+ if self .skipfooter < 0 :
2688
+ raise ValueError ('skip footer cannot be negative' )
2689
+
2660
2690
col_len = self .num_original_columns
2661
2691
2662
2692
if self ._implicit_index :
2663
2693
col_len += len (self .index_col )
2664
2694
2665
- # see gh-13320
2666
- zipped_content = list (lib .to_object_array (
2667
- content , min_width = col_len ).T )
2668
- zip_len = len (zipped_content )
2669
-
2670
- if self .skipfooter < 0 :
2671
- raise ValueError ('skip footer cannot be negative' )
2695
+ max_len = max ([len (row ) for row in content ])
2672
2696
2673
- # Loop through rows to verify lengths are correct.
2674
- if (col_len != zip_len and
2697
+ # Check that there are no rows with too many
2698
+ # elements in their row (rows with too few
2699
+ # elements are padded with NaN).
2700
+ if (max_len > col_len and
2675
2701
self .index_col is not False and
2676
2702
self .usecols is None ):
2677
- i = 0
2678
- for (i , l ) in enumerate (content ):
2679
- if len (l ) != col_len :
2680
- break
2681
2703
2682
- footers = 0
2683
- if self .skipfooter :
2684
- footers = self .skipfooter
2704
+ footers = self .skipfooter if self .skipfooter else 0
2705
+ bad_lines = []
2685
2706
2686
- row_num = self .pos - (len (content ) - i + footers )
2707
+ iter_content = enumerate (content )
2708
+ content_len = len (content )
2709
+ content = []
2687
2710
2688
- msg = ('Expected %d fields in line %d, saw %d' %
2689
- (col_len , row_num + 1 , zip_len ))
2690
- if len (self .delimiter ) > 1 and self .quoting != csv .QUOTE_NONE :
2691
- # see gh-13374
2692
- reason = ('Error could possibly be due to quotes being '
2693
- 'ignored when a multi-char delimiter is used.' )
2694
- msg += '. ' + reason
2695
- raise ValueError (msg )
2711
+ for (i , l ) in iter_content :
2712
+ actual_len = len (l )
2713
+
2714
+ if actual_len > col_len :
2715
+ if self .error_bad_lines or self .warn_bad_lines :
2716
+ row_num = self .pos - (content_len - i + footers )
2717
+ bad_lines .append ((row_num , actual_len ))
2718
+
2719
+ if self .error_bad_lines :
2720
+ break
2721
+ else :
2722
+ content .append (l )
2723
+
2724
+ for row_num , actual_len in bad_lines :
2725
+ msg = ('Expected %d fields in line %d, saw %d' %
2726
+ (col_len , row_num + 1 , actual_len ))
2727
+ if len (self .delimiter ) > 1 and self .quoting != csv .QUOTE_NONE :
2728
+ # see gh-13374
2729
+ reason = ('Error could possibly be due to quotes being '
2730
+ 'ignored when a multi-char delimiter is used.' )
2731
+ msg += '. ' + reason
2732
+
2733
+ self ._alert_malformed (msg , row_num + 1 )
2734
+
2735
+ # see gh-13320
2736
+ zipped_content = list (lib .to_object_array (
2737
+ content , min_width = col_len ).T )
2696
2738
2697
2739
if self .usecols :
2698
2740
if self ._implicit_index :
@@ -2750,10 +2792,12 @@ def _get_lines(self, rows=None):
2750
2792
2751
2793
while True :
2752
2794
new_row = self ._next_iter_line (
2753
- row_num = self .pos + rows )
2754
- new_rows .append (new_row )
2795
+ row_num = self .pos + rows + 1 )
2755
2796
rows += 1
2756
2797
2798
+ if new_row is not None :
2799
+ new_rows .append (new_row )
2800
+
2757
2801
except StopIteration :
2758
2802
if self .skiprows :
2759
2803
new_rows = [row for i , row in enumerate (new_rows )
0 commit comments