diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 85d9acff353be..e354e35498f14 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -783,6 +783,8 @@ I/O
 - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)
 - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`)
 - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`)
+- Bug in :func:`read_csv` failing to raise ``ParserError`` when first row had too many columns and ``index_col=False`` (:issue:`40333`)
+- Bug in :func:`read_csv` failing to raise ``ParserError`` when ``names is not None`` and ``header=None`` (:issue:`22144`)
 
 Period
 ^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 153ac4b5f0893..08ad317473ae8 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -215,6 +215,13 @@ cdef extern from "parser/tokenizer.h":
         int64_t header_start  # header row start
         uint64_t header_end  # header row end
 
+        bint allow_leading_cols  # Boolean: 1: can infer index col, 0: no index col
+
+        # Boolean: 1: Header=None, 0: Header is not None. This is used because
+        # header_end is uint64_t so there is no valid NULL value
+        # (i.e. header_end == -1).
+        bint skip_header_end
+
         void *skipset
         PyObject *skipfunc
         int64_t skip_first_N_rows
@@ -378,6 +385,7 @@ cdef class TextReader:
         self.encoding_errors = PyBytes_AsString(encoding_errors)
 
         self.parser = parser_new()
+        self.parser.allow_leading_cols = allow_leading_cols
         self.parser.chunksize = tokenize_chunksize
 
         self.mangle_dupe_cols = mangle_dupe_cols
@@ -517,11 +525,13 @@ cdef class TextReader:
         if header is None:
             # sentinel value
             self.parser.header_start = -1
-            self.parser.header_end = -1
+            self.parser.skip_header_end = True
+            self.parser.header_end = 0
             self.parser.header = -1
             self.parser_start = 0
             prelim_header = []
         else:
+            self.parser.skip_header_end = False
             if isinstance(header, list):
                 if len(header) > 1:
                     # need to artificially skip the final line
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 49eb1e7855098..fa825f8deeaf0 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -421,10 +421,10 @@ static int end_line(parser_t *self) {
     TRACE(("end_line: lines: %d\n", self->lines));
 
     if (self->lines > 0) {
-        if (self->expected_fields >= 0) {
-            ex_fields = self->expected_fields;
+        if (self->expected_fields > self->line_fields[self->lines - 1]) {
+            ex_fields = self->expected_fields;
         } else {
-            ex_fields = self->line_fields[self->lines - 1];
+            ex_fields = self->line_fields[self->lines - 1];
         }
     }
     TRACE(("end_line: ex_fields: %d\n", ex_fields));
@@ -444,9 +444,26 @@ static int end_line(parser_t *self) {
             self->line_fields[self->lines] = 0;
             return 0;
         }
-
-    if (!(self->lines <= self->header_end + 1) &&
-        (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
+    if (
+        // Allow extra fields if there is no header, but there may be
+        // index columns in the first line or we are within the header
+        // and we may have index columns.
+        !((self->skip_header_end &&
+           (self->lines < (uint64_t) self->allow_leading_cols))
+          || (!self->skip_header_end
+              && (self->lines <=
+                  (self->header_end + self->allow_leading_cols))))
+        // We only throw an error if we know how many fields
+        // to expect and have encountered too many fields.
+        && (ex_fields > 0 && fields > ex_fields)
+        // Ignore field parsing errors if we will use a subset of the columns.
+        && !(self->usecols)
+        // Ignore a trailing delimiter (see gh-2442) by checking if
+        // the last field is empty. We determine this if the next
+        // to last character is null (last character must be null).
+        && !(((fields - 1) == ex_fields) &&
+             !self->stream[self->stream_len - 2])
+        ) {
         // increment file line count
         self->file_lines++;
 
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index f69fee4993d34..f072059882f07 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -150,6 +150,13 @@ typedef struct parser_t {
    int64_t header_start;  // header row start
    uint64_t header_end;   // header row end
 
+    int allow_leading_cols;  // Boolean: 1: can infer index col, 0: no index col
+
+    // Boolean: 1: Header=None, 0: Header is not None. This is used because
+    // header_end is uint64_t so there is no valid NULL value
+    // (i.e. header_end == -1).
+    int skip_header_end;
+
     void *skipset;
     PyObject *skipfunc;
     int64_t skip_first_N_rows;
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 9f62d63c680f6..46e404bc45134 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -897,13 +897,10 @@ def _rows_to_cols(self, content):
         # Check that there are no rows with too many
         # elements in their row (rows with too few
         # elements are padded with NaN).
-        # error: Non-overlapping identity check (left operand type: "List[int]",
+        # error: Non-overlapping identity check
+        # (left operand type: "List[int]",
         # right operand type: "Literal[False]")
-        if (
-            max_len > col_len
-            and self.index_col is not False  # type: ignore[comparison-overlap]
-            and self.usecols is None
-        ):
+        if max_len > col_len and self.usecols is None:
             footers = self.skipfooter if self.skipfooter else 0
             bad_lines = []
 
@@ -914,6 +911,10 @@ def _rows_to_cols(self, content):
 
             for (i, l) in iter_content:
                 actual_len = len(l)
+                # Check and remove trailing delimiters, see gh-2442
+                if actual_len == (col_len + 1) and l[-1] == "":
+                    l.pop()
+                    actual_len -= 1
 
                 if actual_len > col_len:
                     if self.error_bad_lines or self.warn_bad_lines:
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 572bc09c96886..24e4a5c58b48e 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -667,11 +667,10 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
 def test_no_header_two_extra_columns(all_parsers):
     # GH 26218
     column_names = ["one", "two", "three"]
-    ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
     stream = StringIO("foo,bar,baz,bam,blah")
     parser = all_parsers
-    df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
-    tm.assert_frame_equal(df, ref)
+    with pytest.raises(ParserError, match="Expected 3 fields in line 1, saw 5"):
+        parser.read_csv(stream, header=None, names=column_names, index_col=False)
 
 
 def test_read_csv_names_not_accepting_sets(all_parsers):
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
index 2f876a28c56cd..ede51ea1b6631 100644
--- a/pandas/tests/io/parser/test_index_col.py
+++ b/pandas/tests/io/parser/test_index_col.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas.errors import ParserError
+
 from pandas import (
     DataFrame,
     Index,
@@ -283,3 +285,20 @@ def test_multiindex_columns_index_col_with_data(all_parsers):
         index=Index(["data"]),
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_index_col_false_error(all_parsers):
+    # GH#40333
+    parser = all_parsers
+    with pytest.raises(ParserError, match="Expected 3 fields in line 2, saw 4"):
+        parser.read_csv(StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False)
+
+
+def test_index_col_false_error_ignore(all_parsers):
+    # GH#40333
+    parser = all_parsers
+    result = parser.read_csv(
+        StringIO("a,b,c\n0,1,2,3\n1,2,3"), index_col=False, error_bad_lines=False
+    )
+    expected = DataFrame({"a": [1], "b": [2], "c": [3]})
+    tm.assert_frame_equal(result, expected)