Skip to content

Commit a1ed0de

Browse files
author
Nicholas J Riasanovsky
committed
BUG: Support for checking the first row for errors with index_col=False (#40333)
1 parent c8493e3 commit a1ed0de

File tree

6 files changed

+28
-2
lines changed

6 files changed

+28
-2
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,7 @@ I/O
595595
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
596596
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
597597
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
598+
- Bug in :func:`pandas.read_csv` failing to raise ``ParserError`` when the first row had too many columns and ``index_col=False`` (:issue:`40333`)
598599

599600
Period
600601
^^^^^^

pandas/_libs/parsers.pyx

+3
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,8 @@ cdef extern from "parser/tokenizer.h":
215215
int64_t header_start # header row start
216216
uint64_t header_end # header row end
217217

218+
int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col
219+
218220
void *skipset
219221
PyObject *skipfunc
220222
int64_t skip_first_N_rows
@@ -376,6 +378,7 @@ cdef class TextReader:
376378
self.encoding_errors = PyBytes_AsString(encoding_errors)
377379

378380
self.parser = parser_new()
381+
self.parser.allow_leading_cols = allow_leading_cols
379382
self.parser.chunksize = tokenize_chunksize
380383

381384
self.mangle_dupe_cols = mangle_dupe_cols

pandas/_libs/src/parser/tokenizer.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ static int end_line(parser_t *self) {
445445
return 0;
446446
}
447447

448-
if (!(self->lines <= self->header_end + 1) &&
448+
if (!(self->lines <= self->header_end + self->allow_leading_cols) &&
449449
(self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
450450
// increment file line count
451451
self->file_lines++;

pandas/_libs/src/parser/tokenizer.h

+3
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,9 @@ typedef struct parser_t {
150150
int64_t header_start; // header row start
151151
uint64_t header_end; // header row end
152152

153+
int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col
154+
155+
153156
void *skipset;
154157
PyObject *skipfunc;
155158
int64_t skip_first_N_rows;

pandas/io/parsers/python_parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -883,7 +883,7 @@ def _rows_to_cols(self, content):
883883
# Check that there are no rows with too many
884884
# elements in their row (rows with too few
885885
# elements are padded with NaN).
886-
if max_len > col_len and self.index_col is not False and self.usecols is None:
886+
if max_len > col_len and self.usecols is None:
887887

888888
footers = self.skipfooter if self.skipfooter else 0
889889
bad_lines = []

pandas/tests/io/parser/test_index_col.py

+19
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import numpy as np
99
import pytest
1010

11+
from pandas.errors import ParserError
12+
1113
from pandas import (
1214
DataFrame,
1315
Index,
@@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers):
283285
index=Index(["data"]),
284286
)
285287
tm.assert_frame_equal(result, expected)
288+
289+
290+
def test_index_col_false_error(all_parsers):
291+
# GH#40333
292+
parser = all_parsers
293+
with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"):
294+
parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False)
295+
296+
297+
def test_index_col_false_error_ignore(all_parsers):
298+
# GH#40333
299+
parser = all_parsers
300+
result = parser.read_csv(
301+
StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False
302+
)
303+
expected = DataFrame({"a": [1], "b": [2], "c": [3]})
304+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)