Skip to content

Commit 94c5ce3

Browse files
authored
BUG: read_csv not recognizing bad lines with names given (#44646)
1 parent 4d6a066 commit 94c5ce3

File tree

5 files changed

+50
-9
lines changed

5 files changed

+50
-9
lines changed

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -662,6 +662,8 @@ I/O
662662
- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`)
663663
- Bug in :func:`json_normalize` where multi-character ``sep`` parameter is incorrectly prefixed to every key (:issue:`43831`)
664664
- Bug in :func:`json_normalize` where reading data with missing multi-level metadata would not respect ``errors="ignore"`` (:issue:`44312`)
665+
- Bug in :func:`read_csv` where the second row was used to guess the implicit index if ``header`` was set to ``None`` for ``engine="python"`` (:issue:`22144`)
666+
- Bug in :func:`read_csv` not recognizing bad lines when ``names`` were given for ``engine="c"`` (:issue:`22144`)
665667
- Bug in :func:`read_csv` with :code:`float_precision="round_trip"` which did not skip initial/trailing whitespace (:issue:`43713`)
666668
- Bug in :func:`read_csv` not applying dtype for ``index_col`` (:issue:`9435`)
667669
- Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)

pandas/_libs/parsers.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -721,10 +721,6 @@ cdef class TextReader:
721721
header = [self.names]
722722

723723
elif self.names is not None:
724-
# Enforce this unless usecols
725-
if not self.has_usecols:
726-
self.parser.expected_fields = len(self.names)
727-
728724
# Names passed
729725
if self.parser.lines < 1:
730726
self._tokenize_rows(1)
@@ -735,6 +731,10 @@ cdef class TextReader:
735731
field_count = len(header[0])
736732
else:
737733
field_count = self.parser.line_fields[data_line]
734+
735+
# Enforce this unless usecols
736+
if not self.has_usecols:
737+
self.parser.expected_fields = max(field_count, len(self.names))
738738
else:
739739
# No header passed nor to be found in the file
740740
if self.parser.lines < 1:

pandas/_libs/src/parser/tokenizer.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ static int end_line(parser_t *self) {
446446
}
447447

448448
if (!(self->lines <= self->header_end + 1) &&
449-
(self->expected_fields < 0 && fields > ex_fields) && !(self->usecols)) {
449+
(fields > ex_fields) && !(self->usecols)) {
450450
// increment file line count
451451
self->file_lines++;
452452

pandas/io/parsers/python_parser.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ def _infer_columns(self):
341341
num_original_columns = 0
342342
clear_buffer = True
343343
unnamed_cols: set[str | int | None] = set()
344+
self._header_line = None
344345

345346
if self.header is not None:
346347
header = self.header
@@ -481,6 +482,8 @@ def _infer_columns(self):
481482

482483
line = names[:]
483484

485+
# Store line, otherwise it is lost for guessing the index
486+
self._header_line = line
484487
ncols = len(line)
485488
num_original_columns = ncols
486489

@@ -852,10 +855,13 @@ def _get_index_name(self, columns):
852855
orig_names = list(columns)
853856
columns = list(columns)
854857

855-
try:
856-
line = self._next_line()
857-
except StopIteration:
858-
line = None
858+
if self._header_line is not None:
859+
line = self._header_line
860+
else:
861+
try:
862+
line = self._next_line()
863+
except StopIteration:
864+
line = None
859865

860866
try:
861867
next_line = self._next_line()

pandas/tests/io/parser/test_header.py

+33
Original file line numberDiff line numberDiff line change
@@ -620,3 +620,36 @@ def test_read_csv_multi_header_length_check(all_parsers):
620620
ParserError, match="Header rows must have an equal number of columns."
621621
):
622622
parser.read_csv(StringIO(case), header=[0, 2])
623+
624+
625+
@skip_pyarrow
626+
def test_header_none_and_implicit_index(all_parsers):
627+
# GH#22144
628+
parser = all_parsers
629+
data = "x,1,5\ny,2\nz,3\n"
630+
result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
631+
expected = DataFrame(
632+
{"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
633+
)
634+
tm.assert_frame_equal(result, expected)
635+
636+
637+
@skip_pyarrow
638+
def test_header_none_and_implicit_index_in_second_row(all_parsers):
639+
# GH#22144
640+
parser = all_parsers
641+
data = "x,1\ny,2,5\nz,3\n"
642+
with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
643+
parser.read_csv(StringIO(data), names=["a", "b"], header=None)
644+
645+
646+
@skip_pyarrow
647+
def test_header_none_and_on_bad_lines_skip(all_parsers):
648+
# GH#22144
649+
parser = all_parsers
650+
data = "x,1\ny,2,5\nz,3\n"
651+
result = parser.read_csv(
652+
StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
653+
)
654+
expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
655+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)