diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 985cd2bb553b7..52cbaa033e707 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -383,7 +383,7 @@ I/O - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) -- +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raised an uncontrolled ``IndexError`` (:issue:`43102`) Period ^^^^^^ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c702c82cdce15..c7516a2df50f6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -343,6 +343,10 @@ def _extract_multi_indexer_columns( # extract the columns field_count = len(header[0]) + # check if header lengths are equal + if not all(len(header_iter) == field_count for header_iter in header[1:]): + raise ParserError("Header rows must have an equal number of columns.") + def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4016ce48be5f6..d4b87070720d1 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -604,3 +604,19 @@ def test_read_csv_multiindex_columns(all_parsers): tm.assert_frame_equal(df1, expected.iloc[:1]) df2 = parser.read_csv(StringIO(s2), header=[0, 1]) tm.assert_frame_equal(df2, expected) + + +@skip_pyarrow +def test_read_csv_multi_header_length_check(all_parsers): + # GH#43102 + parser = all_parsers + + case = """row11,row12,row13 +row21,row22, row23 +row31,row32 +""" + + with pytest.raises( 
ParserError, match="Header rows must have an equal number of columns." + ): + parser.read_csv(StringIO(case), header=[0, 2])