Detect Parsing errors in read_csv first row with index_col=False #40629

Closed
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -782,7 +782,10 @@ I/O
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
- Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
- Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
- Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`)
- Bug in :func:`read_csv` failing to raise ParserError when ``names is not None`` and ``header=None`` (:issue:`22144`)

Period
^^^^^^
12 changes: 11 additions & 1 deletion pandas/_libs/parsers.pyx
@@ -215,6 +215,13 @@ cdef extern from "parser/tokenizer.h":
int64_t header_start # header row start
uint64_t header_end # header row end

bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col
bint skip_header_end # Boolean: 1: header=None,
# 0: header is not None.
# This is used because header_end is
# uint64_t so there is no valid NULL

Member

What do you mean by header_end being uint64_t? Based on this diff, it doesn't look like it used to be uint64_t (since it held -1), and I don't see anything in this PR changing the type of header_end.


Author

@njriasan Apr 18, 2021

The type of header_end is still uint64_t (see the struct parser_t definition in parsers.pyx or the definition in tokenizer.h). There was previously a section of code that initialized the value to -1, which is clearly incorrect because the type is unsigned (I believe it is uint64_t to be consistent with lines and handle very large files).

This led to incorrect logic for determining the header, because -1 would be interpreted as the max UINT64 value. I added an extra field here to effectively check whether the value is unset, since we can't check for -1.
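
For readers less familiar with unsigned wrap-around, a small illustrative sketch of the point above (plain Python, not pandas code):

# Illustration only: what happens when -1 is used as a sentinel in an
# unsigned 64-bit slot such as header_end.
import ctypes

sentinel = ctypes.c_uint64(-1).value
print(sentinel)                # 18446744073709551615, i.e. UINT64_MAX
print(sentinel == 2**64 - 1)   # True: "-1" silently becomes the maximum value
print((sentinel + 1) % 2**64)  # 0: arithmetic such as header_end + 1 wraps around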


Member

Thanks for explaining, good to fix that!

# value (i.e. header_end == -1).

void *skipset
PyObject *skipfunc
int64_t skip_first_N_rows
@@ -378,6 +385,7 @@ cdef class TextReader:
        self.encoding_errors = PyBytes_AsString(encoding_errors)

        self.parser = parser_new()
        self.parser.allow_leading_cols = allow_leading_cols
        self.parser.chunksize = tokenize_chunksize

        self.mangle_dupe_cols = mangle_dupe_cols
@@ -517,11 +525,13 @@ cdef class TextReader:
        if header is None:
            # sentinel value
            self.parser.header_start = -1
            self.parser.header_end = -1
            self.parser.skip_header_end = True
            self.parser.header_end = 0
            self.parser.header = -1
            self.parser_start = 0
            prelim_header = []
        else:
            self.parser.skip_header_end = False
            if isinstance(header, list):
                if len(header) > 1:
                    # need to artificially skip the final line
25 changes: 22 additions & 3 deletions pandas/_libs/src/parser/tokenizer.c
@@ -444,9 +444,28 @@ static int end_line(parser_t *self) {
self->line_fields[self->lines] = 0;
return 0;
}

if (!(self->lines <= self->header_end + 1) &&
(self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
// Explanation of each condition:
// Cond1: (self->skip_header_end ||
//         !(self->lines <= (self->header_end + self->allow_leading_cols)))
//        The expected number of fields is not checked within the header
//        lines, nor on a line whose extra leading field may still become
//        an inferred index. self->skip_header_end flags that header=None
//        was specified.
// Cond2: (ex_fields > 0) && (fields > ex_fields)
//        Only raise an error if we know how many fields to expect and
//        have encountered too many.
// Cond3: !(self->usecols)
//        Ignore field parsing errors if we will use a subset of the columns.
// Cond4: !(((fields - 1) == ex_fields)
//         && !self->stream[self->stream_len - 2])
//        Ignore a trailing delimiter (see gh-2442) by checking whether the
//        last field is empty: the next-to-last character in the stream is
//        NUL (the last character is always NUL).
if ((self->skip_header_end
|| !(self->lines <= (self->header_end + self->allow_leading_cols)))
&& (ex_fields > 0 && fields > ex_fields)
&& !(self->usecols)
&& !(((fields - 1) == ex_fields) &&
!self->stream[self->stream_len - 2])) {
// increment file line count
self->file_lines++;

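
Taken together, the conditions documented in the comment above should produce roughly the following user-visible behaviour (a sketch only, assuming a pandas build that includes this fix):

# Sketch of the intended behaviour; assumes a pandas build with this fix.
from io import StringIO

import pandas as pd

bad = "a,b,c\n0,1,2,3\n1,2,3"  # second line has 4 fields, the header has 3

# Cond1/Cond2: with index_col=False the extra field cannot become an inferred
# index, so the tokenizer should now raise instead of parsing silently.
try:
    pd.read_csv(StringIO(bad), index_col=False)
except pd.errors.ParserError as err:
    print(err)  # e.g. "Expected 3 fields in line 2, saw 4"

# Cond3: passing usecols would bypass the field-count check entirely.
# Cond4: a trailing delimiter (gh-2442) is still not counted as an extra field.
print(pd.read_csv(StringIO("a,b,c\n1,2,3,\n"), index_col=False))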
6 changes: 6 additions & 0 deletions pandas/_libs/src/parser/tokenizer.h
@@ -150,6 +150,12 @@ typedef struct parser_t {
int64_t header_start; // header row start
uint64_t header_end; // header row end

int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col
int skip_header_end; // Boolean: 1: header=None, 0: header is not None
// This is used because header_end
// is uint64_t so there is no valid NULL value
// (i.e. header_end == -1).

void *skipset;
PyObject *skipfunc;
int64_t skip_first_N_rows;
6 changes: 5 additions & 1 deletion pandas/io/parsers/python_parser.py
@@ -883,7 +883,7 @@ def _rows_to_cols(self, content):
        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        if max_len > col_len and self.index_col is not False and self.usecols is None:
        if max_len > col_len and self.usecols is None:

            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []
@@ -894,6 +894,10 @@ def _rows_to_cols(self, content):

            for (i, l) in iter_content:
                actual_len = len(l)
                # Check for and remove a trailing delimiter (see gh-2442)
                if actual_len == (col_len + 1) and l[-1] == "":
                    l.pop()
                    actual_len -= 1

                if actual_len > col_len:
                    if self.error_bad_lines or self.warn_bad_lines:
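
The trailing-delimiter check added above mirrors the exception the C tokenizer makes for gh-2442; a rough sketch of the intended effect for the python engine (again assuming a build that contains this change):

# Rough sketch; assumes a pandas build containing this change.
from io import StringIO

import pandas as pd

data = "a,b,c\n1,2,3,\n4,5,6,\n"  # every data row ends with a delimiter
df = pd.read_csv(StringIO(data), index_col=False, engine="python")
print(df)  # expected: columns a, b, c and no ParserError for the empty trailing field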
5 changes: 2 additions & 3 deletions pandas/tests/io/parser/common/test_common_basic.py
@@ -667,11 +667,10 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
def test_no_header_two_extra_columns(all_parsers):
    # GH 26218
    column_names = ["one", "two", "three"]
    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
    stream = StringIO("foo,bar,baz,bam,blah")
    parser = all_parsers
    df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
    tm.assert_frame_equal(df, ref)
    with pytest.raises(ParserError, match="Expected 3 fields in line 1, saw 5"):

Member

In principle I am not against changing this, but doing it only for this case would cause this to fail and

data="""
a,b,c
1,2,3,4
5,6,7,8
"""

df = pd.read_csv(StringIO(data), header=None,
                 index_col=False,
                 engine="python"
                 )

to work. Also not sure if we can do this without deprecating.


Author

Can you elaborate on what you mean by "work"? Testing this example on my PR I get "Expected 3 fields in line 3, saw 4" with all parsers, which I believe should be the intended behavior.

I understand the concern about deprecating. Do you have any advice on how I should modify the code to address that concern? I'm a first-time pandas contributor, so I'm not familiar with that process.
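
For reference, one way to check the behaviour described above on both engines could look like this (a hypothetical session; the quoted message assumes the branch behaves as the author reports):

# Hypothetical reproduction of the example from the review comment above.
from io import StringIO

import pandas as pd

data = """
a,b,c
1,2,3,4
5,6,7,8
"""

for engine in ("c", "python"):
    try:
        pd.read_csv(StringIO(data), header=None, index_col=False, engine=engine)
    except pd.errors.ParserError as err:
        print(engine, "->", err)  # reported: "Expected 3 fields in line 3, saw 4"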


Member

Haven't tested on your branch, sorry. Though we have tests for this which would have been changed too then. @gfyoung Could you help here? Do you think we should deprecate this before changing?


Member

Since we had tests for this behavior (whether intentional or not), I think I would lean towards deprecation.

cc @pandas-dev/pandas-core - this is a bit of an odd case. While the behavior does look buggy, the fact that we have been testing it suggests there could have been something deliberate behind it.

        parser.read_csv(stream, header=None, names=column_names, index_col=False)


def test_read_csv_names_not_accepting_sets(all_parsers):
19 changes: 19 additions & 0 deletions pandas/tests/io/parser/test_index_col.py
@@ -8,6 +8,8 @@
import numpy as np
import pytest

from pandas.errors import ParserError

from pandas import (
    DataFrame,
    Index,
@@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers):
index=Index(["data"]),
)
tm.assert_frame_equal(result, expected)


def test_index_col_false_error(all_parsers):
    # GH#40333
    parser = all_parsers
    with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"):
        parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False)


def test_index_col_false_error_ignore(all_parsers):
    # GH#40333
    parser = all_parsers
    result = parser.read_csv(
        StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False
    )
    expected = DataFrame({"a": [1], "b": [2], "c": [3]})
    tm.assert_frame_equal(result, expected)