From 478bbc7134947b4a07dba9eaf45c9182adad8bf1 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 24 Mar 2021 21:48:26 -0700 Subject: [PATCH 1/9] BUG: Support for checking the first row for errors with index_col=False (#40333) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 3 + pandas/_libs/src/parser/tokenizer.c | 145 +++++++++++------------ pandas/_libs/src/parser/tokenizer.h | 38 +++--- pandas/io/parsers/python_parser.py | 2 +- pandas/tests/io/parser/test_index_col.py | 19 +++ 6 files changed, 112 insertions(+), 96 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 512e6e6cbb391..b609ce37a3996 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -595,6 +595,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) +- Bug in "func:`pandas.read_csv` failed to raise ParserError when first row had too many columns and index_col=False (:issue:`40333`) Period ^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a11bf370412d2..045ac2a7db689 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,6 +215,8 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end + int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + void *skipset PyObject *skipfunc int64_t skip_first_N_rows @@ -376,6 +378,7 @@ cdef class TextReader: self.encoding_errors = PyBytes_AsString(encoding_errors) self.parser = parser_new() + self.parser.allow_leading_cols = allow_leading_cols self.parser.chunksize = tokenize_chunksize self.mangle_dupe_cols = mangle_dupe_cols diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e7855098..cb452a8c97c1d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -217,9 +217,7 @@ void parser_free(parser_t *self) { parser_cleanup(self); } -void parser_del(parser_t *self) { - free(self); -} +void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { uint64_t i, cap, length; @@ -278,9 +276,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } self->words = - (char **)grow_buffer((void *)self->words, length, - &self->words_cap, nbytes, - sizeof(char *), &status); + (char **)grow_buffer((void *)self->words, length, &self->words_cap, + nbytes, sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", @@ -308,10 +305,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) { LINE VECTORS */ cap = self->lines_cap; - self->line_start = - (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int64_t), &status); + self->line_start = (int64_t *)grow_buffer((void *)self->line_start, + self->lines + 1, &self->lines_cap, + nbytes, sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -445,7 +441,7 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) && + if (!(self->lines <= 
self->header_end + self->allow_leading_cols) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; @@ -460,8 +456,9 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", - ex_fields, self->file_lines, fields); + "Expected %d fields in line %" PRIu64 ", saw %" PRId64 + "\n", + ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -472,16 +469,16 @@ static int end_line(parser_t *self) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %d fields, saw %" - PRId64 "\n", self->file_lines, ex_fields, fields); + "Skipping line %" PRIu64 + ": expected %d fields, saw %" PRId64 "\n", + self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && - fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; @@ -592,20 +589,20 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n");\ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + int64_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now @@ -647,8 +644,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) \ - (c == line_terminator) +#define IS_TERMINATOR(c) (c == line_terminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -708,8 +704,7 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, - size_t line_limit, uint64_t start_lines) { +int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { int64_t i; uint64_t slen; int should_skip; @@ -717,16 +712,16 @@ int tokenize_bytes(parser_t *self, char *stream; char *buf = self->data + self->datapos; - const char line_terminator = (self->lineterminator == '\0') ? - '\n' : self->lineterminator; + const char line_terminator = + (self->lineterminator == '\0') ? '\n' : self->lineterminator; // 1000 is something that couldn't fit in "char" // thus comparing a char to it would always be "false" const int carriage_symbol = (self->lineterminator == '\0') ? 
'\r' : 1000; - const int comment_symbol = (self->commentchar != '\0') ? - self->commentchar : 1000; - const int escape_symbol = (self->escapechar != '\0') ? - self->escapechar : 1000; + const int comment_symbol = + (self->commentchar != '\0') ? self->commentchar : 1000; + const int escape_symbol = + (self->escapechar != '\0') ? self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; @@ -833,7 +828,7 @@ int tokenize_bytes(parser_t *self, } break; } - // fall through + // fall through case EAT_WHITESPACE: if (IS_TERMINATOR(c)) { @@ -1061,10 +1056,10 @@ int tokenize_bytes(parser_t *self, } else { if (self->delim_whitespace) { /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ + * first character of a new record--need to back up and + * reread + * to handle properly... + */ i--; buf--; // back up one character (HACK!) END_LINE_STATE(START_RECORD); @@ -1144,8 +1139,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); + "EOF inside string starting at row %" PRIu64, + self->file_lines); return -1; case ESCAPED_CHAR: @@ -1267,8 +1262,8 @@ int parser_trim_buffers(parser_t *self) { if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; } - self->word_starts = realloc(self->word_starts, - new_cap * sizeof(int64_t)); + self->word_starts = + realloc(self->word_starts, new_cap * sizeof(int64_t)); if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; } @@ -1311,15 +1306,13 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, - new_cap * sizeof(int64_t)); + newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = newptr; } - newptr = realloc(self->line_fields, - new_cap * sizeof(int64_t)); + newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1353,8 +1346,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all, if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { - status = parser_buffer_bytes(self, self->chunksize, - encoding_errors); + status = + parser_buffer_bytes(self, self->chunksize, encoding_errors); if (status == REACHED_EOF) { // close out last line @@ -1413,11 +1406,11 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) { */ int to_boolean(const char *item, uint8_t *val) { if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; + *val = 1; + return 0; } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; + *val = 0; + return 0; } return -1; @@ -1611,9 +1604,9 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int) { +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int) { double number; int exponent; int negative; @@ -1751,7 +1744,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } else if (exponent > 0) { number *= 
e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; } else { number /= e[-308 - exponent]; @@ -1779,7 +1772,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, with a call to `free`. */ -char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, +char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *p = s; size_t length = strlen(s); @@ -1796,17 +1789,15 @@ char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, } // Replace `decimal` with '.' if (*p == decimal) { - *dst++ = '.'; - p++; + *dst++ = '.'; + p++; } // Copy the remainder of the string as is. strncpy(dst, p, length + 1 - (p - s)); - if (endpos != NULL) - *endpos = (char *)(s + length); + if (endpos != NULL) *endpos = (char *)(s + length); return s_copy; } - double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and @@ -1822,20 +1813,22 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { - // report endptr from source string (p) + // report endptr from source string (p) *q = endptr; } } else { *error = -1; if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. 
+ *q = (char *)p; // TODO(willayd): this could be undefined behavior } } if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; + if (PyErr_Occurred() != NULL) + *error = -1; + else if (r == Py_HUGE_VAL) + *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d34..81a8c8936c2a9 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -19,10 +19,9 @@ See LICENSE for the license #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 +#include "../headers/portable.h" #include "../headers/stdint.h" #include "../inline_helper.h" -#include "../headers/portable.h" - #include "khash.h" #define STREAM_INIT_SIZE 32 @@ -30,7 +29,6 @@ See LICENSE for the license #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 - /* C flat file parsing low level code for pandas / NumPy @@ -93,9 +91,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -105,19 +103,19 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream uint64_t words_len; uint64_t words_cap; uint64_t max_words_cap; // maximum word cap encountered - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -150,12 +148,14 @@ typedef struct parser_t { int64_t header_start; // header row start uint64_t header_end; // header row end + int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col + void *skipset; PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - double (*double_converter)(const char *, char **, - char, char, char, int, int *, int *); + double (*double_converter)(const char *, char **, char, char, char, int, + int *, int *); // error handling char *warn_msg; @@ -219,9 +219,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int); +double 
precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37f553c724c9e..d30af554b08f7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -883,7 +883,7 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - if max_len > col_len and self.index_col is not False and self.usecols is None: + if max_len > col_len and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = [] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 2f876a28c56cd..ede51ea1b6631 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas.errors import ParserError + from pandas import ( DataFrame, Index, @@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers): index=Index(["data"]), ) tm.assert_frame_equal(result, expected) + + +def test_index_col_false_error(all_parsers): + # GH#40333 + parser = all_parsers + with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"): + parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False) + + +def test_index_col_false_error_ignore(all_parsers): + # GH#40333 + parser = all_parsers + result = parser.read_csv( + StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False + ) + expected = DataFrame({"a": [1], "b": [2], "c": [3]}) + tm.assert_frame_equal(result, expected) From a1b2e5c922c48c87b972fed2dba0ab9b8a3c5225 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Tue, 6 Apr 2021 01:05:56 -0700 Subject: [PATCH 2/9] Update PR changes --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/src/parser/tokenizer.c | 143 +++++++++++++++------------- pandas/_libs/src/parser/tokenizer.h | 36 +++---- 4 files changed, 96 insertions(+), 87 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b609ce37a3996..4b7efdce43861 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -595,7 +595,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) -- Bug in "func:`pandas.read_csv` failed to raise ParserError when first row had too many columns and index_col=False (:issue:`40333`) +- Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) Period ^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 045ac2a7db689..7492b13593435 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,7 +215,7 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end - int allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint 
allow_leading_cols # Boolean: 1: can infer index col, 0: no index col void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index cb452a8c97c1d..104686c34a0b7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -217,7 +217,9 @@ void parser_free(parser_t *self) { parser_cleanup(self); } -void parser_del(parser_t *self) { free(self); } +void parser_del(parser_t *self) { + free(self); +} static int make_stream_space(parser_t *self, size_t nbytes) { uint64_t i, cap, length; @@ -276,8 +278,9 @@ static int make_stream_space(parser_t *self, size_t nbytes) { } self->words = - (char **)grow_buffer((void *)self->words, length, &self->words_cap, - nbytes, sizeof(char *), &status); + (char **)grow_buffer((void *)self->words, length, + &self->words_cap, nbytes, + sizeof(char *), &status); TRACE( ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", @@ -305,9 +308,10 @@ static int make_stream_space(parser_t *self, size_t nbytes) { LINE VECTORS */ cap = self->lines_cap; - self->line_start = (int64_t *)grow_buffer((void *)self->line_start, - self->lines + 1, &self->lines_cap, - nbytes, sizeof(int64_t), &status); + self->line_start = + (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, + &self->lines_cap, nbytes, + sizeof(int64_t), &status); TRACE(( "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) @@ -456,9 +460,8 @@ static int end_line(parser_t *self) { if (self->error_bad_lines) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %" PRIu64 ", saw %" PRId64 - "\n", - ex_fields, self->file_lines, fields); + "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", + ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -469,16 +472,16 @@ static int end_line(parser_t *self) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %" PRIu64 - ": expected %d fields, saw %" PRId64 "\n", - self->file_lines, ex_fields, fields); + "Skipping line %" PRIu64 ": expected %d fields, saw %" + PRId64 "\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } } } else { // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + if ((self->lines >= self->header_end + 1) && + fields < ex_fields) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; @@ -589,20 +592,20 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n"); \ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ +#define PUSH_CHAR(c) \ + TRACE( \ + ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! 
slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + int64_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n");\ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ slen++; // This is a little bit of a hack but works for now @@ -644,7 +647,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes, #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) (c == line_terminator) +#define IS_TERMINATOR(c) \ + (c == line_terminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -704,7 +708,8 @@ int skip_this_line(parser_t *self, int64_t rownum) { } } -int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { +int tokenize_bytes(parser_t *self, + size_t line_limit, uint64_t start_lines) { int64_t i; uint64_t slen; int should_skip; @@ -712,16 +717,16 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { char *stream; char *buf = self->data + self->datapos; - const char line_terminator = - (self->lineterminator == '\0') ? '\n' : self->lineterminator; + const char line_terminator = (self->lineterminator == '\0') ? + '\n' : self->lineterminator; // 1000 is something that couldn't fit in "char" // thus comparing a char to it would always be "false" const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; - const int comment_symbol = - (self->commentchar != '\0') ? self->commentchar : 1000; - const int escape_symbol = - (self->escapechar != '\0') ? self->escapechar : 1000; + const int comment_symbol = (self->commentchar != '\0') ? + self->commentchar : 1000; + const int escape_symbol = (self->escapechar != '\0') ? + self->escapechar : 1000; if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; @@ -828,7 +833,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { } break; } - // fall through + // fall through case EAT_WHITESPACE: if (IS_TERMINATOR(c)) { @@ -1056,10 +1061,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit, uint64_t start_lines) { } else { if (self->delim_whitespace) { /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ + * first character of a new record--need to back up and + * reread + * to handle properly... + */ i--; buf--; // back up one character (HACK!) 
END_LINE_STATE(START_RECORD); @@ -1139,8 +1144,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); + "EOF inside string starting at row %" PRIu64, + self->file_lines); return -1; case ESCAPED_CHAR: @@ -1262,8 +1267,8 @@ int parser_trim_buffers(parser_t *self) { if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; } - self->word_starts = - realloc(self->word_starts, new_cap * sizeof(int64_t)); + self->word_starts = realloc(self->word_starts, + new_cap * sizeof(int64_t)); if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; } @@ -1306,13 +1311,15 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); + newptr = realloc(self->line_start, + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = newptr; } - newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); + newptr = realloc(self->line_fields, + new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1346,8 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all, if (!all && self->lines - start_lines >= nrows) break; if (self->datapos == self->datalen) { - status = - parser_buffer_bytes(self, self->chunksize, encoding_errors); + status = parser_buffer_bytes(self, self->chunksize, + encoding_errors); if (status == REACHED_EOF) { // close out last line @@ -1406,11 +1413,11 @@ int tokenize_all_rows(parser_t *self, const char *encoding_errors) { */ int to_boolean(const char *item, uint8_t *val) { if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; + *val = 1; + return 0; } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; + *val = 0; + return 0; } return -1; @@ -1604,9 +1611,9 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *error, - int *maybe_int) { +double precise_xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int) { double number; int exponent; int negative; @@ -1744,7 +1751,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; } else { number /= e[-308 - exponent]; @@ -1772,7 +1779,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, with a call to `free`. */ -char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, +char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *p = s; size_t length = strlen(s); @@ -1789,15 +1796,17 @@ char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, } // Replace `decimal` with '.' if (*p == decimal) { - *dst++ = '.'; - p++; + *dst++ = '.'; + p++; } // Copy the remainder of the string as is. 
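[Note on _str_copy_decimal_str_c, whose body is reformatted in this hunk: it normalizes a locale-formatted number to C-locale form by stripping thousands separators up to the decimal point, swapping the locale decimal character for '.', and copying the rest verbatim. A Python sketch of that transformation; the function name `normalize` and the sample separators are illustrative, not taken from the patch:

    def normalize(s: str, decimal: str, tsep: str) -> str:
        # mirror the C loop: drop tsep before the decimal point,
        # replace the locale decimal with '.', keep the tail as-is
        head, sep, tail = s.partition(decimal)
        return head.replace(tsep, "") + ("." + tail if sep else "")

    normalize("1.234,56", decimal=",", tsep=".")  # -> "1234.56"
]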
strncpy(dst, p, length + 1 - (p - s)); - if (endpos != NULL) *endpos = (char *)(s + length); + if (endpos != NULL) + *endpos = (char *)(s + length); return s_copy; } + double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and @@ -1813,22 +1822,20 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // PyOS_string_to_double needs to consume the whole string if (endpc == pc + strlen(pc)) { if (q != NULL) { - // report endptr from source string (p) + // report endptr from source string (p) *q = endptr; } } else { *error = -1; if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; // TODO(willayd): this could be undefined behavior } } if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) - *error = -1; - else if (r == Py_HUGE_VAL) - *error = (int)Py_HUGE_VAL; + if (PyErr_Occurred() != NULL) *error = -1; + else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 81a8c8936c2a9..5e423231854d8 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -19,9 +19,10 @@ See LICENSE for the license #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#include "../headers/portable.h" #include "../headers/stdint.h" #include "../inline_helper.h" +#include "../headers/portable.h" + #include "khash.h" #define STREAM_INIT_SIZE 32 @@ -29,6 +30,7 @@ See LICENSE for the license #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 + /* C flat file parsing low level code for pandas / NumPy @@ -91,9 +93,9 @@ typedef struct parser_t { io_callback cb_io; io_cleanup cb_cleanup; - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available int64_t datapos; // where to write out tokenized data @@ -103,19 +105,19 @@ typedef struct parser_t { // Store words in (potentially ragged) matrix for now, hmm char **words; - int64_t *word_starts; // where we are in the stream + int64_t *word_starts; // where we are in the stream uint64_t words_len; uint64_t words_cap; uint64_t max_words_cap; // maximum word cap encountered - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t 
file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity // Tokenizing stuff ParserState state; @@ -154,8 +156,8 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - double (*double_converter)(const char *, char **, char, char, char, int, - int *, int *); + double (*double_converter)(const char *, char **, + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -219,9 +221,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *error, - int *maybe_int); +double precise_xstrtod(const char *p, char **q, char decimal, + char sci, char tsep, int skip_trailing, + int *error, int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, From 806e4f6709ab724a664b6b7e426fc14c870c659a Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 7 Apr 2021 02:01:15 -0700 Subject: [PATCH 3/9] fixed trailing delimiter test --- pandas/_libs/src/parser/tokenizer.c | 5 +++-- pandas/io/parsers/python_parser.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 104686c34a0b7..9c5da72eb9bbf 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,9 +444,10 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - + // Ignore any trailing delimters see gh-2442 if (!(self->lines <= self->header_end + self->allow_leading_cols) && - (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) { + (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) + && !((fields - 1) == ex_fields) && strlen(self->pword_start) == 0) { // increment file line count self->file_lines++; diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d30af554b08f7..aa0a65a0b693d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -894,6 +894,10 @@ def _rows_to_cols(self, content): for (i, l) in iter_content: actual_len = len(l) + # Check and remove trailing delimiters see gh-2442 + if actual_len == (col_len + 1) and l[-1] == "": + l.pop() + actual_len -= 1 if actual_len > col_len: if self.error_bad_lines or self.warn_bad_lines: From 0d55f5d2a2382d8b713c5a6a225eaaed99d3a832 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Wed, 7 Apr 2021 03:11:56 -0700 Subject: [PATCH 4/9] fixed trailing delimiter detection --- pandas/_libs/src/parser/tokenizer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9c5da72eb9bbf..4598b6b0c48fe 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,10 +444,13 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Ignore any trailing delimters see gh-2442 + // Ignore any trailing delimters see gh-2442 by checking if + // the last field is empty. We determine this if the next + // to last character is null (last character must be null). 
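[Note on the check this comment introduces: words are stored in self->stream null-terminated and back to back, so a row that ends in a bare delimiter contributes an empty final field whose only byte is its terminator, leaving the last two bytes of the stream both NUL. A Python model of that layout; `last_field_empty` is a hypothetical helper, not part of the patch:

    def last_field_empty(fields: list[bytes]) -> bool:
        # fields are laid out as <bytes>\0<bytes>\0...<bytes>\0
        stream = b"".join(f + b"\0" for f in fields)
        # the final byte is the last field's terminator; the byte
        # before it is also NUL exactly when that field is empty
        return stream[-2:] == b"\0\0"

    last_field_empty([b"1", b"2", b"3", b""])    # True:  "1,2,3,"
    last_field_empty([b"1", b"2", b"3", b"33"])  # False: "1,2,3,33"
]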
if (!(self->lines <= self->header_end + self->allow_leading_cols) && (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) - && !((fields - 1) == ex_fields) && strlen(self->pword_start) == 0) { + && !(((fields - 1) == ex_fields) && + !self->stream[self->stream_len - 2])) { // increment file line count self->file_lines++; From 1151b93968fbe73c96d01363eaa508afe7ee12a7 Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 14:30:55 -0700 Subject: [PATCH 5/9] fixed bug in name detection --- pandas/_libs/parsers.pyx | 11 ++++++++-- pandas/_libs/src/parser/tokenizer.c | 21 ++++++++++++++++--- pandas/_libs/src/parser/tokenizer.h | 4 ++++ .../io/parser/common/test_common_basic.py | 5 ++--- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7492b13593435..bb01d51e21c68 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -215,7 +215,12 @@ cdef extern from "parser/tokenizer.h": int64_t header_start # header row start uint64_t header_end # header row end - bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col + bint skip_header_end # Boolean: 1: Header=None, + # 0 Header is not None + # This is used because header_end is + # uint64_t so there is no valid NULL + # value (i.e. header_end == -1). void *skipset PyObject *skipfunc @@ -518,11 +523,13 @@ cdef class TextReader: if header is None: # sentinel value self.parser.header_start = -1 - self.parser.header_end = -1 + self.parser.skip_header_end = True + self.parser.header_end = 0 self.parser.header = -1 self.parser_start = 0 self.header = [] else: + self.parser.skip_header_end = False if isinstance(header, list): if len(header) > 1: # need to artificially skip the final line diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4598b6b0c48fe..83d2d7ee13c9e 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,11 +444,26 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Ignore any trailing delimters see gh-2442 by checking if + // Explanation of each condition: + // Cond1: (self->skip_header_end || + // !(self->lines <= (self->header_end + self->allow_leading_cols))) + // We don't check the expected number of fields within the header + // lines and we are allowed to infer the index. + // We check for if Header=None is specified with self->skip_header_end. + // Cond2: (ex_fields > 0) && (fields > ex_fields) + // We only throw an error if we know how many fields + // to expect and have encountered too many fields. + // Cond3: !(self->usecols) + // Ignore field parsing errors if we will use a subset of the columns. + // Cond4: !(((fields - 1) == ex_fields) + // && !self->stream[self->stream_len - 2]) + // Ignore a trailing delimter (see gh-2442) by checking if // the last field is empty. We determine this if the next // to last character is null (last character must be null). 
- if (!(self->lines <= self->header_end + self->allow_leading_cols) && - (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols) + if ((self->skip_header_end + || !(self->lines <= (self->header_end + self->allow_leading_cols))) + && (ex_fields > 0 && fields > ex_fields) + && !(self->usecols) && !(((fields - 1) == ex_fields) && !self->stream[self->stream_len - 2])) { // increment file line count diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 5e423231854d8..2865c64f1f9cf 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -151,6 +151,10 @@ typedef struct parser_t { uint64_t header_end; // header row end int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col + int skip_header_end; // Boolean: 1: Header=None, 0 Header is not None + // This is used because header_end + // is uint64_t so there is no valid NULL value + // (i.e. header_end == -1). void *skipset; PyObject *skipfunc; diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 572bc09c96886..24e4a5c58b48e 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -667,11 +667,10 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] - ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) - tm.assert_frame_equal(df, ref) + with pytest.raises(ParserError, match="Expected 3 fields in line 1, saw 5"): + parser.read_csv(stream, header=None, names=column_names, index_col=False) def test_read_csv_names_not_accepting_sets(all_parsers): From 185f62e70cf7e3cc9aca11ff104300f006e4855b Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 15:47:21 -0700 Subject: [PATCH 6/9] fixed index col tests --- pandas/_libs/src/parser/tokenizer.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 83d2d7ee13c9e..573d08291f870 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -421,10 +421,10 @@ static int end_line(parser_t *self) { TRACE(("end_line: lines: %d\n", self->lines)); if (self->lines > 0) { - if (self->expected_fields >= 0) { - ex_fields = self->expected_fields; + if (self->expected_fields > self->line_fields[self->lines - 1]) { + ex_fields = self->expected_fields; } else { - ex_fields = self->line_fields[self->lines - 1]; + ex_fields = self->line_fields[self->lines - 1]; } } TRACE(("end_line: ex_fields: %d\n", ex_fields)); @@ -445,11 +445,14 @@ static int end_line(parser_t *self) { return 0; } // Explanation of each condition: - // Cond1: (self->skip_header_end || - // !(self->lines <= (self->header_end + self->allow_leading_cols))) - // We don't check the expected number of fields within the header - // lines and we are allowed to infer the index. - // We check for if Header=None is specified with self->skip_header_end. 
+ // Cond1: !((self->skip_header_end + // && (self->lines < self->allow_leading_cols)) + // || (!self->skip_header_end + // && (self->lines <= + // (self->header_end + self->allow_leading_cols)))) + // Allow extra fields if there is no header, but there may be index columns + // in the first line or we are within the header and we may + // have index columns. // Cond2: (ex_fields > 0) && (fields > ex_fields) // We only throw an error if we know how many fields // to expect and have encountered too many fields. @@ -460,8 +463,10 @@ static int end_line(parser_t *self) { // Ignore a trailing delimter (see gh-2442) by checking if // the last field is empty. We determine this if the next // to last character is null (last character must be null). - if ((self->skip_header_end - || !(self->lines <= (self->header_end + self->allow_leading_cols))) + if (!((self->skip_header_end && (self->lines < self->allow_leading_cols)) + || (!self->skip_header_end + && (self->lines <= + (self->header_end + self->allow_leading_cols)))) && (ex_fields > 0 && fields > ex_fields) && !(self->usecols) && !(((fields - 1) == ex_fields) && From a30526874e79144e033eca509b2867aad8f9bd1d Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 15:58:24 -0700 Subject: [PATCH 7/9] Made the requested changes --- doc/source/whatsnew/v1.3.0.rst | 1 - pandas/_libs/parsers.pyx | 10 ++++----- pandas/_libs/src/parser/tokenizer.c | 34 +++++++++++------------------ pandas/_libs/src/parser/tokenizer.h | 9 ++++---- 4 files changed, 23 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 33e7e5836017c..e354e35498f14 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -782,7 +782,6 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) -- Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_csv` failing to raise ParserError when first row had too many columns and ``index_col=False`` (:issue:`40333`) - Bug in :func:`read_csv` failing to raise ParserError when ``names is not None`` and ``header=None`` (:issue:`22144`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 43ad9e26b4d66..08ad317473ae8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -216,11 +216,11 @@ cdef extern from "parser/tokenizer.h": uint64_t header_end # header row end bint allow_leading_cols # Boolean: 1: can infer index col, 0: no index col - bint skip_header_end # Boolean: 1: Header=None, - # 0 Header is not None - # This is used because header_end is - # uint64_t so there is no valid NULL - # value (i.e. header_end == -1). + + # Boolean: 1: Header=None, 0 Header is not None. This is used because + # header_end is uint64_t so there is no valid NULL value + # (i.e. header_end == -1). 
+ bint skip_header_end void *skipset PyObject *skipfunc diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 573d08291f870..acbb372729f69 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -444,33 +444,25 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; return 0; } - // Explanation of each condition: - // Cond1: !((self->skip_header_end - // && (self->lines < self->allow_leading_cols)) - // || (!self->skip_header_end - // && (self->lines <= - // (self->header_end + self->allow_leading_cols)))) - // Allow extra fields if there is no header, but there may be index columns - // in the first line or we are within the header and we may - // have index columns. - // Cond2: (ex_fields > 0) && (fields > ex_fields) - // We only throw an error if we know how many fields - // to expect and have encountered too many fields. - // Cond3: !(self->usecols) - // Ignore field parsing errors if we will use a subset of the columns. - // Cond4: !(((fields - 1) == ex_fields) - // && !self->stream[self->stream_len - 2]) - // Ignore a trailing delimter (see gh-2442) by checking if - // the last field is empty. We determine this if the next - // to last character is null (last character must be null). - if (!((self->skip_header_end && (self->lines < self->allow_leading_cols)) + if ( + // Allow extra fields if there is no header, but there may be + // index columns in the first line or we are within the header + // and we may have index columns. + !((self->skip_header_end && (self->lines < self->allow_leading_cols)) || (!self->skip_header_end && (self->lines <= (self->header_end + self->allow_leading_cols)))) + // We only throw an error if we know how many fields + // to expect and have encountered too many fields. && (ex_fields > 0 && fields > ex_fields) + // Ignore field parsing errors if we will use a subset of the columns. && !(self->usecols) + // Ignore a trailing delimter (see gh-2442) by checking if + // the last field is empty. We determine this if the next + // to last character is null (last character must be null). && !(((fields - 1) == ex_fields) && - !self->stream[self->stream_len - 2])) { + !self->stream[self->stream_len - 2]) + ) { // increment file line count self->file_lines++; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 2865c64f1f9cf..f072059882f07 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -151,10 +151,11 @@ typedef struct parser_t { uint64_t header_end; // header row end int allow_leading_cols; // Boolean: 1: can infer index col, 0: no index col - int skip_header_end; // Boolean: 1: Header=None, 0 Header is not None - // This is used because header_end - // is uint64_t so there is no valid NULL value - // (i.e. header_end == -1). + + // Boolean: 1: Header=None, 0 Header is not None. This is used because + // header_end is uint64_t so there is no valid NULL value + // (i.e. header_end == -1). 
+ int skip_header_end; void *skipset; PyObject *skipfunc; From 7d973e6a3acc04b32207dbd26109e70b3a8e2fdc Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Sun, 18 Apr 2021 16:29:40 -0700 Subject: [PATCH 8/9] fixed compiler warning --- pandas/_libs/src/parser/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index acbb372729f69..fa825f8deeaf0 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -448,7 +448,8 @@ static int end_line(parser_t *self) { // Allow extra fields if there is no header, but there may be // index columns in the first line or we are within the header // and we may have index columns. - !((self->skip_header_end && (self->lines < self->allow_leading_cols)) + !((self->skip_header_end && + (self->lines < (uint64_t) self->allow_leading_cols)) || (!self->skip_header_end && (self->lines <= (self->header_end + self->allow_leading_cols)))) From 546e1065f83ee7ea612fd3a4a354000467c5f4df Mon Sep 17 00:00:00 2001 From: Nicholas J Riasanovsky Date: Mon, 19 Apr 2021 15:13:55 -0700 Subject: [PATCH 9/9] fixed pre-commit bug --- pandas/io/parsers/python_parser.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4958c0061dc0c..46e404bc45134 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -897,12 +897,10 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - # error: Non-overlapping identity check (left operand type: "List[int]", + # error: Non-overlapping identity check + # (left operand type: "List[int]", # right operand type: "Literal[False]") - if ( - max_len > col_len - and self.usecols is None - ): + if max_len > col_len and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = []
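[Note: the user-visible behavior the series settles on, as pinned down by test_index_col_false_error and test_index_col_false_error_ignore in patch 1; the outputs shown in comments are what those tests assert:

    import pandas as pd
    from io import StringIO

    data = "a,b,c\n0,1,2,3\n1,2,3"

    # index_col=False now raises instead of silently mis-parsing
    pd.read_csv(StringIO(data), index_col=False)
    # ParserError: Expected 3 fields in line 2, saw 4

    # the overlong row can still be skipped explicitly
    pd.read_csv(StringIO(data), index_col=False, error_bad_lines=False)
    #    a  b  c
    # 0  1  2  3
]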
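[Note: the same check applies when names are supplied with header=None, which is why patch 5 reverses the expectation of test_no_header_two_extra_columns: with no header row the first line is already data, so an overlong first line is an error rather than being silently truncated:

    pd.read_csv(StringIO("foo,bar,baz,bam,blah"),
                header=None, names=["one", "two", "three"],
                index_col=False)
    # ParserError: Expected 3 fields in line 1, saw 5
]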
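[Note on the skip_header_end flag added in patch 5: header_end is uint64_t, so the old "header_end = -1" sentinel for header=None cannot survive the assignment -- it wraps to the maximum value, and sums like header_end + allow_leading_cols then wrap as well instead of expressing "no header". A one-line illustration of the wraparound:

    import ctypes
    ctypes.c_uint64(-1).value  # 18446744073709551615
]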