BUG: read_fwf raise if len colspec doesnt match len names (pandas-dev#42920)

debnathshoham · web-flow · commit 1625f28523d3 · 2021-08-31T19:43:09.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -334,6 +334,7 @@ I/O
 - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
 - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
 - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
+- Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`)
 - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`)
 -
 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -804,6 +804,24 @@ def read_fwf(
             colspecs.append((col, col + w))
             col += w
 
+    # GH#40830
+    # Ensure length of `colspecs` matches length of `names`
+    names = kwds.get("names")
+    if names is not None:
+        if len(names) != len(colspecs):
+            # need to check len(index_col) as it might contain
+            # unnamed indices, in which case it's name is not required
+            len_index = 0
+            if kwds.get("index_col") is not None:
+                index_col: Any = kwds.get("index_col")
+                if index_col is not False:
+                    if not is_list_like(index_col):
+                        len_index = 1
+                    else:
+                        len_index = len(index_col)
+            if len(names) + len_index != len(colspecs):
+                raise ValueError("Length of colspecs must match length of names")
+
     kwds["colspecs"] = colspecs
     kwds["infer_nrows"] = infer_nrows
     kwds["engine"] = "python-fwf"
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -710,3 +710,146 @@ def test_encoding_mmap(memory_map):
     data.seek(0)
     df_reference = DataFrame([[1, "A", "Ä", 2]])
     tm.assert_frame_equal(df, df_reference)
+
+
+@pytest.mark.parametrize(
+    "colspecs, names, widths, index_col",
+    [
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("abcde"),
+            None,
+            None,
+        ),
+        (
+            None,
+            list("abcde"),
+            [6] * 4,
+            None,
+        ),
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("abcde"),
+            None,
+            True,
+        ),
+        (
+            None,
+            list("abcde"),
+            [6] * 4,
+            False,
+        ),
+        (
+            None,
+            list("abcde"),
+            [6] * 4,
+            True,
+        ),
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("abcde"),
+            None,
+            False,
+        ),
+    ],
+)
+def test_len_colspecs_len_names(colspecs, names, widths, index_col):
+    # GH#40830
+    data = """col1  col2  col3  col4
+    bab   ba    2"""
+    msg = "Length of colspecs must match length of names"
+    with pytest.raises(ValueError, match=msg):
+        read_fwf(
+            StringIO(data),
+            colspecs=colspecs,
+            names=names,
+            widths=widths,
+            index_col=index_col,
+        )
+
+
+@pytest.mark.parametrize(
+    "colspecs, names, widths, index_col, expected",
+    [
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("abc"),
+            None,
+            0,
+            DataFrame(
+                index=["col1", "ba"],
+                columns=["a", "b", "c"],
+                data=[["col2", "col3", "col4"], ["b   ba", "2", np.nan]],
+            ),
+        ),
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("ab"),
+            None,
+            [0, 1],
+            DataFrame(
+                index=[["col1", "ba"], ["col2", "b   ba"]],
+                columns=["a", "b"],
+                data=[["col3", "col4"], ["2", np.nan]],
+            ),
+        ),
+        (
+            [(0, 6), (6, 12), (12, 18), (18, None)],
+            list("a"),
+            None,
+            [0, 1, 2],
+            DataFrame(
+                index=[["col1", "ba"], ["col2", "b   ba"], ["col3", "2"]],
+                columns=["a"],
+                data=[["col4"], [np.nan]],
+            ),
+        ),
+        (
+            None,
+            list("abc"),
+            [6] * 4,
+            0,
+            DataFrame(
+                index=["col1", "ba"],
+                columns=["a", "b", "c"],
+                data=[["col2", "col3", "col4"], ["b   ba", "2", np.nan]],
+            ),
+        ),
+        (
+            None,
+            list("ab"),
+            [6] * 4,
+            [0, 1],
+            DataFrame(
+                index=[["col1", "ba"], ["col2", "b   ba"]],
+                columns=["a", "b"],
+                data=[["col3", "col4"], ["2", np.nan]],
+            ),
+        ),
+        (
+            None,
+            list("a"),
+            [6] * 4,
+            [0, 1, 2],
+            DataFrame(
+                index=[["col1", "ba"], ["col2", "b   ba"], ["col3", "2"]],
+                columns=["a"],
+                data=[["col4"], [np.nan]],
+            ),
+        ),
+    ],
+)
+def test_len_colspecs_len_names_with_index_col(
+    colspecs, names, widths, index_col, expected
+):
+    # GH#40830
+    data = """col1  col2  col3  col4
+    bab   ba    2"""
+    result = read_fwf(
+        StringIO(data),
+        colspecs=colspecs,
+        names=names,
+        widths=widths,
+        index_col=index_col,
+    )
+    tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -334,6 +334,7 @@ I/O`
`334`	`334`	- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
`335`	`335`	- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
`336`	`336`	- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
	`337`	+- Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`)
`337`	`338`	- Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`)
`338`	`339`	`-`
`339`	`340`