BUG: Throw a ParserError when header rows have unequal column counts … (#43118)

quantumalaviya · web-flow · commit 343ac2a4179f · 2021-09-05T08:59:20.000-07:00
* BUG: Throw a ParserError when header rows have unequal column counts (GH43102)

* BUG: Throw a ParserError when header rows have unequal column counts. Updated to comply with PEP8 (GH43102)

* Added Test. (GH43102)

* Added Test. (GH43102)

* Added Test. (GH43102)

* Added Changes. (GH43102)

* Added whatsnew

* Added whatsnew

* Test without whatsnew

* Add whatsnew again

* Update v1.4.0.rst

* Merge upstream

* Skipping test on PyArrow
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -384,7 +384,7 @@ I/O
 - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`)
 - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`)
 - Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
--
+- Bug in :func:`read_csv` where reading multi-header input whose header rows have unequal lengths raised an uncontrolled ``IndexError`` instead of a ``ParserError`` (:issue:`43102`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -343,6 +343,10 @@ def _extract_multi_indexer_columns(
         # extract the columns
         field_count = len(header[0])
 
+        # check if header lengths are equal
+        if not all(len(header_iter) == field_count for header_iter in header[1:]):
+            raise ParserError("Header rows must have an equal number of columns.")
+
         def extract(r):
             return tuple(r[i] for i in range(field_count) if i not in sic)
 
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -604,3 +604,19 @@ def test_read_csv_multiindex_columns(all_parsers):
     tm.assert_frame_equal(df1, expected.iloc[:1])
     df2 = parser.read_csv(StringIO(s2), header=[0, 1])
     tm.assert_frame_equal(df2, expected)
+
+
+@skip_pyarrow
+def test_read_csv_multi_header_length_check(all_parsers):
+    # GH#43102: header rows with unequal column counts must raise ParserError
+    parser = all_parsers
+
+    case = """row11,row12,row13
+row21,row22, row23
+row31,row32
+"""
+
+    with pytest.raises(
+        ParserError, match="Header rows must have an equal number of columns."
+    ):
+        parser.read_csv(StringIO(case), header=[0, 2])