BUG: read_csv not recognizing bad lines with names given (#44646)

phofl · web-flow · commit 94c5ce339de4 · 2021-11-28T14:26:11.000-05:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -662,6 +662,8 @@ I/O
 - Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
 - Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
 - Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect errors="ignore" (:issue:`44312`)
+- Bug in :func:`read_csv` where the second row was used to guess the implicit index if ``header`` was set to ``None`` for ``engine="python"`` (:issue:`22144`)
+- Bug in :func:`read_csv` not recognizing bad lines when ``names`` was given for ``engine="c"`` (:issue:`22144`)
 - Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
 - Bug in :func:`read_csv` not applying dtype for ``index_col`` (:issue:`9435`)
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -721,10 +721,6 @@ cdef class TextReader:
                 header = [self.names]
 
         elif self.names is not None:
-            # Enforce this unless usecols
-            if not self.has_usecols:
-                self.parser.expected_fields = len(self.names)
-
             # Names passed
             if self.parser.lines < 1:
                 self._tokenize_rows(1)
@@ -735,6 +731,10 @@ cdef class TextReader:
                 field_count = len(header[0])
             else:
                 field_count = self.parser.line_fields[data_line]
+
+            # Enforce this unless usecols
+            if not self.has_usecols:
+                self.parser.expected_fields = max(field_count, len(self.names))
         else:
             # No header passed nor to be found in the file
             if self.parser.lines < 1:
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -446,7 +446,7 @@ static int end_line(parser_t *self) {
     }
 
     if (!(self->lines <= self->header_end + 1) &&
-        (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
+        (fields > ex_fields) && !(self->usecols)) {
         // increment file line count
         self->file_lines++;
 
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -341,6 +341,7 @@ def _infer_columns(self):
         num_original_columns = 0
         clear_buffer = True
         unnamed_cols: set[str | int | None] = set()
+        self._header_line = None
 
         if self.header is not None:
             header = self.header
@@ -481,6 +482,8 @@ def _infer_columns(self):
 
                 line = names[:]
 
+            # Store line, otherwise it is lost for guessing the index
+            self._header_line = line
             ncols = len(line)
             num_original_columns = ncols
 
@@ -852,10 +855,13 @@ def _get_index_name(self, columns):
         orig_names = list(columns)
         columns = list(columns)
 
-        try:
-            line = self._next_line()
-        except StopIteration:
-            line = None
+        if self._header_line is not None:
+            line = self._header_line
+        else:
+            try:
+                line = self._next_line()
+            except StopIteration:
+                line = None
 
         try:
             next_line = self._next_line()
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -620,3 +620,36 @@ def test_read_csv_multi_header_length_check(all_parsers):
         ParserError, match="Header rows must have an equal number of columns."
     ):
         parser.read_csv(StringIO(case), header=[0, 2])
+
+
+@skip_pyarrow
+def test_header_none_and_implicit_index(all_parsers):
+    # GH#22144
+    parser = all_parsers
+    data = "x,1,5\ny,2\nz,3\n"
+    result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
+    expected = DataFrame(
+        {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+def test_header_none_and_implicit_index_in_second_row(all_parsers):
+    # GH#22144
+    parser = all_parsers
+    data = "x,1\ny,2,5\nz,3\n"
+    with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
+        parser.read_csv(StringIO(data), names=["a", "b"], header=None)
+
+
+@skip_pyarrow
+def test_header_none_and_on_bad_lines_skip(all_parsers):
+    # GH#22144
+    parser = all_parsers
+    data = "x,1\ny,2,5\nz,3\n"
+    result = parser.read_csv(
+        StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
+    )
+    expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
+    tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -446,7 +446,7 @@ static int end_line(parser_t *self) {`
`446`	`446`	`}`
`447`	`447`
`448`	`448`	`if (!(self->lines <= self->header_end + 1) &&`
`449`		`- (self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {`
	`449`	`+ (fields > ex_fields) && !(self->usecols)) {`
`450`	`450`	`// increment file line count`
`451`	`451`	`self->file_lines++;`
`452`	`452`