From 478bbc7134947b4a07dba9eaf45c9182adad8bf1 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 24 Mar 2021 21:48:26 -0700 Subject: [PATCH 1/9] BUG: Support for checking the first row for errors with index_col=False (#40333) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 3 + pandas/_libs/src/parser/tokenizer.c | 145 +++++++++++------------ pandas/_libs/src/parser/tokenizer.h | 38 +++--- pandas/io/parsers/python_parser.py | 2 +- pandas/tests/io/parser/test_index_col.py | 19 +++ 6 files changed, 112 insertions(+), 96 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 512e6e6cbb391..b609ce37a3996 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -595,6 +595,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) +- Bug in "func:`pandas.read_csv` failed to raise ParserError when first row had too many columns and index_col=False (:issue:`40333`) Period ^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a11bf370412d2..045ac2a7db689 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,6 +215,8 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end + int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + void *skipset PyObject *skipfunc int64_t skip_first_N_rows @@ -376,6 +378,7 @@ cdef class TextReader: self.encoding_errors = PyBytes_AsString(encoding_errors) self.parser = parser_new() + self.parser.allow_leading_cols = allow_leading_cols self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols = mangle_dupe_cols diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e7855098..cb452a8c97c1d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -217,9 +217,7 @@ void parser_free(parser_t *self) { parser_cleanup(self); } -void parser_del(parser_t *self) { - free(self); -} +void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { uint64_t i, cap, length; @@ -278,9 +276,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } self->words = - (char **)grow_buffer((void *)self->words, length, - &self->words_cap, nbytes, - sizeof(char *), &status); + (char **)grow_buffer((void *)self->words, length, &self->words_cap, + nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", @@ -308,10 +305,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) { LINE VECTORS */ cap = self->lines_cap; - self->line_start = - (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int64_t), &status); + self->line_start = (int64_t *)grow_buffer((void *)self->line_start, + self->lines + 1, &self->lines_cap, + nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -445,7 +441,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) && + if (!(self->lines <= 
self->header_end + self->allow_leading_cols) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -460,8 +456,9 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", - ex_fields, self->file_lines, fields); + "Expected %d fields in line %" PRIu64 ", saw %" PRId64 + "\n", + ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -472,16 +469,16 @@ static int end_line(parser_t *self) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %d fields, saw %" - PRId64 "\n", self->file_lines, ex_fields, fields); + "Skipping line %" PRIu64 + ": expected %d fields, saw %" PRId64 "\n", + self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && - fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; @@ -592,20 +589,20 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n");\ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + int64_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now @@ -647,8 +644,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) \ - (c == line_terminator) +#define IS_TERMINATOR(c) (c == line_terminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -708,8 +704,7 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, - size_t line_limit, uint64_t start_lines) { +int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { int64_t i; uint64_t slen; int should_skip; @@ -717,16 +712,16 @@ int tokenize_bytes(parser_t *self, char *stream; char *buf = self->data + self->datapos; - const char line_terminator = (self->lineterminator == '\0') ? - '\n' : self->lineterminator; + const char line_terminator = + (self->lineterminator == '\0') ? '\n' : self->lineterminator; // 1000 is something that couldn't fit in "char" // thus comparing a char to it would always be "false" const int carriage_symbol = (self->lineterminator == '\0') ? 
'\r' : 1000; - const int comment_symbol = (self->commentchar != '\0') ? - self->commentchar : 1000; - const int escape_symbol = (self->escapechar != '\0') ? - self->escapechar : 1000; + const int comment_symbol = + (self->commentchar != '\0') ? self->commentchar : 1000; + const int escape_symbol = + (self->escapechar != '\0') ? self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; @@ -833,7 +828,7 @@ int tokenize_bytes(parser_t *self, } break; } - // fall through + // fall through case EAT_WHITESPACE: if (IS_TERMINATOR(c)) { @@ -1061,10 +1056,10 @@ int tokenize_bytes(parser_t *self, } else { if (self->delim_whitespace) { /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ + * first character of a new record--need to back up and + * reread + * to handle properly... + */ i--; buf--; // back up one character (HACK!) END_LINE_STATE(START_RECORD); @@ -1144,8 +1139,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); + "EOF inside string starting at row %" PRIu64, + self->file_lines); return -1; case ESCAPED_CHAR: @@ -1267,8 +1262,8 @@ int parser_trim_buffers(parser_t *self) { if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; } - self->word_starts = realloc(self->word_starts, - new_cap * sizeof(int64_t)); + self->word_starts = + realloc(self->word_starts, new_cap * sizeof(int64_t)); if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; } @@ -1311,15 +1306,13 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, - new_cap * sizeof(int64_t)); + newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = newptr; } - newptr = realloc(self->line_fields, - new_cap * sizeof(int64_t)); + newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1353,8 +1346,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all, if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { - status = parser_buffer_bytes(self, self->chunksize, - encoding_errors); + status = + parser_buffer_bytes(self, self->chunksize, encoding_errors); if (status == REACHED_EOF) { // close out last line @@ -1413,11 +1406,11 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) { */ int to_boolean(const char *item, uint8_t *val) { if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; + *val = 1; + return 0; } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; + *val = 0; + return 0; } return -1; @@ -1611,9 +1604,9 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int) { +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int) { double number; int exponent; int negative; @@ -1751,7 +1744,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } else if (exponent > 0) { number *= 
e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; } else { number /= e[-308 - exponent]; @@ -1779,7 +1772,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, with a call to `free`. */ -char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, +char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *p = s; size_t length = strlen(s); @@ -1796,17 +1789,15 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, } // Replace `decimal` with '.' if (*p == decimal) { - *dst++ = '.'; - p++; + *dst++ = '.'; + p++; } // Copy the remainder of the string as is. strncpy(dst, p, length + 1 - (p - s)); - if (endpos != NULL) - *endpos = (char *)(s + length); + if (endpos != NULL) *endpos = (char *)(s + length); return s_copy; } - double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and @@ -1822,20 +1813,22 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { - // report endptr from source string (p) + // report endptr from source string (p) *q = endptr; } } else { *error = -1; if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. 
+ *q = (char *)p; // TODO(willayd): this could be undefined behavior } } if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; + if (PyErr_Occurred() != NULL) + *error = -1; + else if (r == Py_HUGE_VAL) + *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d34..81a8c8936c2a9 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -19,10 +19,9 @@ See LICENSE for the license #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 +#include "../headers/portable.h" #include "../headers/stdint.h" #include "../inline_helper.h" -#include "../headers/portable.h" - #include "khash.h" #define STREAM_INIT_SIZE 32 @@ -30,7 +29,6 @@ See LICENSE for the license #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 - /* C flat file parsing low level code for pandas / NumPy @@ -93,9 +91,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -105,19 +103,19 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream uint64_t words_len; uint64_t words_cap; uint64_t max_words_cap; // maximum word cap encountered - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -150,12 +148,14 @@ typedef struct parser_t { int64_t header_start; // header row start uint64_t header_end; // header row end + int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col + void *skipset; PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - double (*double_converter)(const char *, char **, - char, char, char, int, int *, int *); + double (*double_converter)(const char *, char **, char, char, char, int, + int *, int *); // error handling char *warn_msg; @@ -219,9 +219,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int); +double 
precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37f553c724c9e..d30af554b08f7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -883,7 +883,7 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - if max_len > col_len and self.index_col is not False and self.usecols is None: + if max_len > col_len and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = [] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 2f876a28c56cd..ede51ea1b6631 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.errors import ParserError + from pandas import ( DataFrame, Index, @@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers): index=Index(["data"]), ) tm.assert_frame_equal(result, expected) + + +def test_index_col_false_error(all_parsers): + # GH#40333 + parser = all_parsers + with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"): + parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False) + + +def test_index_col_false_error_ignore(all_parsers): + # GH#40333 + parser = all_parsers + result = parser.read_csv( + StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False + ) + expected = DataFrame({"a": [1], "b": [2], "c": [3]}) + tm.assert_frame_equal(result, expected) From a1b2e5c922c48c87b972fed2dba0ab9b8a3c5225 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Tue, 6 Apr 2021 01:05:56 -0700 Subject: [PATCH 2/9] Update PR changes --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/src/parser/tokenizer.c | 143 +++++++++++++++------------- pandas/_libs/src/parser/tokenizer.h | 36 +++---- 4 files changed, 96 insertions(+), 87 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b609ce37a3996..4b7efdce43861 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -595,7 +595,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) -- Bug in "func:`pandas.read_csv` failed to raise ParserError when first row had too many columns and index_col=False (:issue:`40333`) +- Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) Period ^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 045ac2a7db689..7492b13593435 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,7 +215,7 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end - int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint 
allow_leading_cols # Boolean: 1: can infer index col, 0: no index col void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index cb452a8c97c1d..104686c34a0b7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -217,7 +217,9 @@ void parser_free(parser_t *self) { parser_cleanup(self); } -void parser_del(parser_t *self) { free(self); } +void parser_del(parser_t *self) { + free(self); +} static int make_stream_space(parser_t *self, size_t nbytes) { uint64_t i, cap, length; @@ -276,8 +278,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } self->words = - (char **)grow_buffer((void *)self->words, length, &self->words_cap, - nbytes, sizeof(char *), &status); + (char **)grow_buffer((void *)self->words, length, + &self->words_cap, nbytes, + sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", @@ -305,9 +308,10 @@ static int make_stream_space(parser_t *self, size_t nbytes) { LINE VECTORS */ cap = self->lines_cap; - self->line_start = (int64_t *)grow_buffer((void *)self->line_start, - self->lines + 1, &self->lines_cap, - nbytes, sizeof(int64_t), &status); + self->line_start = + (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, + sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -456,9 +460,8 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %" PRIu64 ", saw %" PRId64 - "\n", - ex_fields, self->file_lines, fields); + "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", + ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -469,16 +472,16 @@ static int end_line(parser_t *self) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %" PRIu64 - ": expected %d fields, saw %" PRId64 "\n", - self->file_lines, ex_fields, fields); + "Skipping line %" PRIu64 ": expected %d fields, saw %" + PRId64 "\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && + fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; @@ -589,20 +592,20 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + int64_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n");\ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now @@ -644,7 +647,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) (c == line_terminator) +#define IS_TERMINATOR(c) \ + (c == line_terminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -704,7 +708,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { +int tokenize_bytes(parser_t *self, + size_t line_limit, uint64_t start_lines) { int64_t i; uint64_t slen; int should_skip; @@ -712,16 +717,16 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { char *stream; char *buf = self->data + self->datapos; - const char line_terminator = - (self->lineterminator == '\0') ? '\n' : self->lineterminator; + const char line_terminator = (self->lineterminator == '\0') ? + '\n' : self->lineterminator; // 1000 is something that couldn't fit in "char" // thus comparing a char to it would always be "false" const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; - const int comment_symbol = - (self->commentchar != '\0') ? self->commentchar : 1000; - const int escape_symbol = - (self->escapechar != '\0') ? self->escapechar : 1000; + const int comment_symbol = (self->commentchar != '\0') ? + self->commentchar : 1000; + const int escape_symbol = (self->escapechar != '\0') ? + self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; @@ -828,7 +833,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { } break; } - // fall through + // fall through case EAT_WHITESPACE: if (IS_TERMINATOR(c)) { @@ -1056,10 +1061,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { } else { if (self->delim_whitespace) { /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ + * first character of a new record--need to back up and + * reread + * to handle properly... + */ i--; buf--; // back up one character (HACK!) 
END_LINE_STATE(START_RECORD); @@ -1139,8 +1144,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); + "EOF inside string starting at row %" PRIu64, + self->file_lines); return -1; case ESCAPED_CHAR: @@ -1262,8 +1267,8 @@ int parser_trim_buffers(parser_t *self) { if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; } - self->word_starts = - realloc(self->word_starts, new_cap * sizeof(int64_t)); + self->word_starts = realloc(self->word_starts, + new_cap * sizeof(int64_t)); if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; } @@ -1306,13 +1311,15 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); + newptr = realloc(self->line_start, + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = newptr; } - newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); + newptr = realloc(self->line_fields, + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1346,8 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all, if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { - status = - parser_buffer_bytes(self, self->chunksize, encoding_errors); + status = parser_buffer_bytes(self, self->chunksize, + encoding_errors); if (status == REACHED_EOF) { // close out last line @@ -1406,11 +1413,11 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) { */ int to_boolean(const char *item, uint8_t *val) { if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; + *val = 1; + return 0; } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; + *val = 0; + return 0; } return -1; @@ -1604,9 +1611,9 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error, - int *maybe_int) { +double precise_xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) { double number; int exponent; int negative; @@ -1744,7 +1751,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; } else { number /= e[-308 - exponent]; @@ -1772,7 +1779,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, with a call to `free`. */ -char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, +char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *p = s; size_t length = strlen(s); @@ -1789,15 +1796,17 @@ char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, } // Replace `decimal` with '.' if (*p == decimal) { - *dst++ = '.'; - p++; + *dst++ = '.'; + p++; } // Copy the remainder of the string as is. 
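[Note on _str_copy_decimal_str_c, whose body is reformatted in this hunk: it normalizes a locale-formatted number to C-locale form by stripping thousands separators up to the decimal point, swapping the locale decimal character for '.', and copying the rest verbatim. A Python sketch of that transformation; the function name `normalize` and the sample separators are illustrative, not taken from the patch:

    def normalize(s: str, decimal: str, tsep: str) -> str:
        # mirror the C loop: drop tsep before the decimal point,
        # replace the locale decimal with '.', keep the tail as-is
        head, sep, tail = s.partition(decimal)
        return head.replace(tsep, "") + ("." + tail if sep else "")

    normalize("1.234,56", decimal=",", tsep=".")  # -> "1234.56"
]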
strncpy(dst, p, length + 1 - (p - s)); - if (endpos != NULL) *endpos = (char *)(s + length); + if (endpos != NULL) + *endpos = (char *)(s + length); return s_copy; } + double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and @@ -1813,22 +1822,20 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { - // report endptr from source string (p) + // report endptr from source string (p) *q = endptr; } } else { *error = -1; if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; // TODO(willayd): this could be undefined behavior } } if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) - *error = -1; - else if (r == Py_HUGE_VAL) - *error = (int)Py_HUGE_VAL; + if (PyErr_Occurred() != NULL) *error = -1; + else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 81a8c8936c2a9..5e423231854d8 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -19,9 +19,10 @@ See LICENSE for the license #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#include "../headers/portable.h" #include "../headers/stdint.h" #include "../inline_helper.h" +#include "../headers/portable.h" + #include "khash.h" #define STREAM_INIT_SIZE 32 @@ -29,6 +30,7 @@ See LICENSE for the license #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 + /* C flat file parsing low level code for pandas / NumPy @@ -91,9 +93,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -103,19 +105,19 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream uint64_t words_len; uint64_t words_cap; uint64_t max_words_cap; // maximum word cap encountered - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t 
file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -154,8 +156,8 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - double (*double_converter)(const char *, char **, char, char, char, int, - int *, int *); + double (*double_converter)(const char *, char **, + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -219,9 +221,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *error, - int *maybe_int); +double precise_xstrtod(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, From 806e4f6709ab724a664b6b7e426fc14c870c659a Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 7 Apr 2021 02:01:15 -0700 Subject: [PATCH 3/9] fixed trailing delimiter test --- pandas/_libs/src/parser/tokenizer.c | 5 +++-- pandas/io/parsers/python_parser.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 104686c34a0b7..9c5da72eb9bbf 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,9 +444,10 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - + // Ignore any trailing delimters see gh-2442 if (!(self->lines <= self->header_end + self->allow_leading_cols) && - (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { + (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) + && !((fields - 1) == ex_fields) && strlen(self->pword_start) == 0) { // increment file line count self->file_lines++; diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d30af554b08f7..aa0a65a0b693d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -894,6 +894,10 @@ def _rows_to_cols(self, content): for (i, l) in iter_content: actual_len = len(l) + # Check and remove trailing delimiters see gh-2442 + if actual_len == (col_len + 1) and l[-1] == "": + l.pop() + actual_len -= 1 if actual_len > col_len: if self.error_bad_lines or self.warn_bad_lines: From 0d55f5d2a2382d8b713c5a6a225eaaed99d3a832 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 7 Apr 2021 03:11:56 -0700 Subject: [PATCH 4/9] fixed trailing delimiter detection --- pandas/_libs/src/parser/tokenizer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9c5da72eb9bbf..4598b6b0c48fe 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,10 +444,13 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Ignore any trailing delimters see gh-2442 + // Ignore any trailing delimters see gh-2442 by checking if + // the last field is empty. We determine this if the next + // to last character is null (last character must be null). 
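[Note on the check this comment introduces: words are stored in self->stream null-terminated and back to back, so a row that ends in a bare delimiter contributes an empty final field whose only byte is its terminator, leaving the last two bytes of the stream both NUL. A Python model of that layout; `last_field_empty` is a hypothetical helper, not part of the patch:

    def last_field_empty(fields: list[bytes]) -> bool:
        # fields are laid out as <bytes>\0<bytes>\0...<bytes>\0
        stream = b"".join(f + b"\0" for f in fields)
        # the final byte is the last field's terminator; the byte
        # before it is also NUL exactly when that field is empty
        return stream[-2:] == b"\0\0"

    last_field_empty([b"1", b"2", b"3", b""])    # True:  "1,2,3,"
    last_field_empty([b"1", b"2", b"3", b"33"])  # False: "1,2,3,33"
]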
if (!(self->lines <= self->header_end + self->allow_leading_cols) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) - && !((fields - 1) == ex_fields) && strlen(self->pword_start) == 0) { + && !(((fields - 1) == ex_fields) && + !self->stream[self->stream_len - 2])) { // increment file line count self->file_lines++; From 1151b93968fbe73c96d01363eaa508afe7ee12a7 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 14:30:55 -0700 Subject: [PATCH 5/9] fixed bug in name detection --- pandas/_libs/parsers.pyx | 11 ++++++++-- pandas/_libs/src/parser/tokenizer.c | 21 ++++++++++++++++--- pandas/_libs/src/parser/tokenizer.h | 4 ++++ .../io/parser/common/test_common_basic.py | 5 ++--- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7492b13593435..bb01d51e21c68 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,7 +215,12 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end - bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint skip_header_end # Boolean: 1: Header=None, + # 0 Header is not None + # This is used because header_end is + # uint64_t so there is no valid NULL + # value (i.e. header_end == -1). void *skipset PyObject *skipfunc @@ -518,11 +523,13 @@ cdef class TextReader: if header is None: # sentinel value self.parser.header_start = -1 - self.parser.header_end = -1 + self.parser.skip_header_end = True + self.parser.header_end = 0 self.parser.header = -1 self.parser_start = 0 self.header = [] else: + self.parser.skip_header_end = False if isinstance(header, list): if len(header) > 1: # need to artificially skip the final line diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4598b6b0c48fe..83d2d7ee13c9e 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,11 +444,26 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Ignore any trailing delimters see gh-2442 by checking if + // Explanation of each condition: + // Cond1: (self->skip_header_end || + // !(self->lines <= (self->header_end + self->allow_leading_cols))) + // We don't check the expected number of fields within the header + // lines and we are allowed to infer the index. + // We check for if Header=None is specified with self->skip_header_end. + // Cond2: (ex_fields > 0) && (fields > ex_fields) + // We only throw an error if we know how many fields + // to expect and have encountered too many fields. + // Cond3: !(self->usecols) + // Ignore field parsing errors if we will use a subset of the columns. + // Cond4: !(((fields - 1) == ex_fields) + // && !self->stream[self->stream_len - 2]) + // Ignore a trailing delimter (see gh-2442) by checking if // the last field is empty. We determine this if the next // to last character is null (last character must be null). 
- if (!(self->lines <= self->header_end + self->allow_leading_cols) && - (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) + if ((self->skip_header_end + || !(self->lines <= (self->header_end + self->allow_leading_cols))) + && (ex_fields > 0 && fields > ex_fields) + && !(self->usecols) && !(((fields - 1) == ex_fields) && !self->stream[self->stream_len - 2])) { // increment file line count diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 5e423231854d8..2865c64f1f9cf 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -151,6 +151,10 @@ typedef struct parser_t { uint64_t header_end; // header row end int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col + int skip_header_end; // Boolean: 1: Header=None, 0 Header is not None + // This is used because header_end + // is uint64_t so there is no valid NULL value + // (i.e. header_end == -1). void *skipset; PyObject *skipfunc; diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 572bc09c96886..24e4a5c58b48e 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -667,11 +667,10 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] - ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + with pytest.raises(ParserError, match="Expected 3 fields in line 1, saw 5"): + parser.read_csv(stream, header=None, names=column_names, index_col=False) def test_read_csv_names_not_accepting_sets(all_parsers): From 185f62e70cf7e3cc9aca11ff104300f006e4855b Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 15:47:21 -0700 Subject: [PATCH 6/9] fixed index col tests --- pandas/_libs/src/parser/tokenizer.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 83d2d7ee13c9e..573d08291f870 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -421,10 +421,10 @@ static int end_line(parser_t *self) { TRACE(("end_line: lines: %d\n", self->lines)); if (self->lines > 0) { - if (self->expected_fields >= 0) { - ex_fields = self->expected_fields; + if (self->expected_fields > self->line_fields[self->lines - 1]) { + ex_fields = self->expected_fields; } else { - ex_fields = self->line_fields[self->lines - 1]; + ex_fields = self->line_fields[self->lines - 1]; } } TRACE(("end_line: ex_fields: %d\n", ex_fields)); @@ -445,11 +445,14 @@ static int end_line(parser_t *self) { return 0; } // Explanation of each condition: - // Cond1: (self->skip_header_end || - // !(self->lines <= (self->header_end + self->allow_leading_cols))) - // We don't check the expected number of fields within the header - // lines and we are allowed to infer the index. - // We check for if Header=None is specified with self->skip_header_end. 
+ // Cond1: !((self->skip_header_end + // && (self->lines < self->allow_leading_cols)) + // || (!self->skip_header_end + // && (self->lines <= + // (self->header_end + self->allow_leading_cols)))) + // Allow extra fields if there is no header, but there may be index columns + // in the first line or we are within the header and we may + // have index columns. // Cond2: (ex_fields > 0) && (fields > ex_fields) // We only throw an error if we know how many fields // to expect and have encountered too many fields. @@ -460,8 +463,10 @@ static int end_line(parser_t *self) { // Ignore a trailing delimter (see gh-2442) by checking if // the last field is empty. We determine this if the next // to last character is null (last character must be null). - if ((self->skip_header_end - || !(self->lines <= (self->header_end + self->allow_leading_cols))) + if (!((self->skip_header_end && (self->lines < self->allow_leading_cols)) + || (!self->skip_header_end + && (self->lines <= + (self->header_end + self->allow_leading_cols)))) && (ex_fields > 0 && fields > ex_fields) && !(self->usecols) && !(((fields - 1) == ex_fields) && From a30526874e79144e033eca509b2867aad8f9bd1d Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 15:58:24 -0700 Subject: [PATCH 7/9] Made the requested changes --- doc/source/whatsnew/v1.3.0.rst | 1 - pandas/_libs/parsers.pyx | 10 ++++----- pandas/_libs/src/parser/tokenizer.c | 34 +++++++++++------------------ pandas/_libs/src/parser/tokenizer.h | 9 ++++---- 4 files changed, 23 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 33e7e5836017c..e354e35498f14 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -782,7 +782,6 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) -- Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) - Bug in :func:`read_csv` failing to raise ParserError when ``names is not None`` and ``header=None`` (:issue:`22144`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 43ad9e26b4d66..08ad317473ae8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -216,11 +216,11 @@ cdef extern from "parser/tokenizer.h": uint64_t header_end # header row end bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col - bint skip_header_end # Boolean: 1: Header=None, - # 0 Header is not None - # This is used because header_end is - # uint64_t so there is no valid NULL - # value (i.e. header_end == -1). + + # Boolean: 1: Header=None, 0 Header is not None. This is used because + # header_end is uint64_t so there is no valid NULL value + # (i.e. header_end == -1). 
+ bint skip_header_end void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 573d08291f870..acbb372729f69 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,33 +444,25 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Explanation of each condition: - // Cond1: !((self->skip_header_end - // && (self->lines < self->allow_leading_cols)) - // || (!self->skip_header_end - // && (self->lines <= - // (self->header_end + self->allow_leading_cols)))) - // Allow extra fields if there is no header, but there may be index columns - // in the first line or we are within the header and we may - // have index columns. - // Cond2: (ex_fields > 0) && (fields > ex_fields) - // We only throw an error if we know how many fields - // to expect and have encountered too many fields. - // Cond3: !(self->usecols) - // Ignore field parsing errors if we will use a subset of the columns. - // Cond4: !(((fields - 1) == ex_fields) - // && !self->stream[self->stream_len - 2]) - // Ignore a trailing delimter (see gh-2442) by checking if - // the last field is empty. We determine this if the next - // to last character is null (last character must be null). - if (!((self->skip_header_end && (self->lines < self->allow_leading_cols)) + if ( + // Allow extra fields if there is no header, but there may be + // index columns in the first line or we are within the header + // and we may have index columns. + !((self->skip_header_end && (self->lines < self->allow_leading_cols)) || (!self->skip_header_end && (self->lines <= (self->header_end + self->allow_leading_cols)))) + // We only throw an error if we know how many fields + // to expect and have encountered too many fields. && (ex_fields > 0 && fields > ex_fields) + // Ignore field parsing errors if we will use a subset of the columns. && !(self->usecols) + // Ignore a trailing delimter (see gh-2442) by checking if + // the last field is empty. We determine this if the next + // to last character is null (last character must be null). && !(((fields - 1) == ex_fields) && - !self->stream[self->stream_len - 2])) { + !self->stream[self->stream_len - 2]) + ) { // increment file line count self->file_lines++; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 2865c64f1f9cf..f072059882f07 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -151,10 +151,11 @@ typedef struct parser_t { uint64_t header_end; // header row end int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col - int skip_header_end; // Boolean: 1: Header=None, 0 Header is not None - // This is used because header_end - // is uint64_t so there is no valid NULL value - // (i.e. header_end == -1). + + // Boolean: 1: Header=None, 0 Header is not None. This is used because + // header_end is uint64_t so there is no valid NULL value + // (i.e. header_end == -1). 
+ int skip_header_end; void *skipset; PyObject *skipfunc; From 7d973e6a3acc04b32207dbd26109e70b3a8e2fdc Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 16:29:40 -0700 Subject: [PATCH 8/9] fixed compiler warning --- pandas/_libs/src/parser/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index acbb372729f69..fa825f8deeaf0 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -448,7 +448,8 @@ static int end_line(parser_t *self) { // Allow extra fields if there is no header, but there may be // index columns in the first line or we are within the header // and we may have index columns. - !((self->skip_header_end && (self->lines < self->allow_leading_cols)) + !((self->skip_header_end && + (self->lines < (uint64_t) self->allow_leading_cols)) || (!self->skip_header_end && (self->lines <= (self->header_end + self->allow_leading_cols)))) From 546e1065f83ee7ea612fd3a4a354000467c5f4df Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Mon, 19 Apr 2021 15:13:55 -0700 Subject: [PATCH 9/9] fixed pre-commit bug --- pandas/io/parsers/python_parser.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4958c0061dc0c..46e404bc45134 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -897,12 +897,10 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - # error: Non-overlapping identity check (left operand type: "List[int]", + # error: Non-overlapping identity check + # (left operand type: "List[int]", # right operand type: "Literal[False]") - if ( - max_len > col_len - and self.usecols is None - ): + if max_len > col_len and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = []
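[Note: the user-visible behavior the series settles on, as pinned down by test_index_col_false_error and test_index_col_false_error_ignore in patch 1; the outputs shown in comments are what those tests assert:

    import pandas as pd
    from io import StringIO

    data = "a,b,c\n0,1,2,3\n1,2,3"

    # index_col=False now raises instead of silently mis-parsing
    pd.read_csv(StringIO(data), index_col=False)
    # ParserError: Expected 3 fields in line 2, saw 4

    # the overlong row can still be skipped explicitly
    pd.read_csv(StringIO(data), index_col=False, error_bad_lines=False)
    #    a  b  c
    # 0  1  2  3
]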
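[Note: the same check applies when names are supplied with header=None, which is why patch 5 reverses the expectation of test_no_header_two_extra_columns: with no header row the first line is already data, so an overlong first line is an error rather than being silently truncated:

    pd.read_csv(StringIO("foo,bar,baz,bam,blah"),
                header=None, names=["one", "two", "three"],
                index_col=False)
    # ParserError: Expected 3 fields in line 1, saw 5
]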
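[Note on the skip_header_end flag added in patch 5: header_end is uint64_t, so the old "header_end = -1" sentinel for header=None cannot survive the assignment -- it wraps to the maximum value, and sums like header_end + allow_leading_cols then wrap as well instead of expressing "no header". A one-line illustration of the wraparound:

    import ctypes
    ctypes.c_uint64(-1).value  # 18446744073709551615
]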