Skip to content

Commit 1625f28

Browse files
BUG: read_fwf raises if len of colspecs doesn't match len of names (pandas-dev#42920)
1 parent 5d01add commit 1625f28

File tree

3 files changed

+162
-0
lines changed

3 files changed

+162
-0
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ I/O
334334
- Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
335335
- Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
336336
- Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
337+
- Bug in :func:`read_fwf` where a mismatch between the lengths of ``colspecs`` and ``names`` did not raise a ``ValueError`` (:issue:`40830`)
337338
- Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`)
338339
-
339340

pandas/io/parsers/readers.py

+18
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,24 @@ def read_fwf(
804804
colspecs.append((col, col + w))
805805
col += w
806806

807+
# GH#40830
808+
# Ensure length of `colspecs` matches length of `names`
809+
names = kwds.get("names")
810+
if names is not None:
811+
if len(names) != len(colspecs):
812+
# need to check len(index_col) as it might contain
813+
# unnamed indices, in which case its name is not required
814+
len_index = 0
815+
if kwds.get("index_col") is not None:
816+
index_col: Any = kwds.get("index_col")
817+
if index_col is not False:
818+
if not is_list_like(index_col):
819+
len_index = 1
820+
else:
821+
len_index = len(index_col)
822+
if len(names) + len_index != len(colspecs):
823+
raise ValueError("Length of colspecs must match length of names")
824+
807825
kwds["colspecs"] = colspecs
808826
kwds["infer_nrows"] = infer_nrows
809827
kwds["engine"] = "python-fwf"

pandas/tests/io/parser/test_read_fwf.py

+143
Original file line numberDiff line numberDiff line change
@@ -710,3 +710,146 @@ def test_encoding_mmap(memory_map):
710710
data.seek(0)
711711
df_reference = DataFrame([[1, "A", "Ä", 2]])
712712
tm.assert_frame_equal(df, df_reference)
713+
714+
715+
@pytest.mark.parametrize(
    "colspecs, names, widths, index_col",
    [
        # Every case supplies five names for four fixed-width columns; the
        # columns are described either by explicit colspecs or by widths.
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            None,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            None,
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            True,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            False,
        ),
        (
            None,
            list("abcde"),
            [6] * 4,
            True,
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abcde"),
            None,
            False,
        ),
    ],
)
def test_len_colspecs_len_names(colspecs, names, widths, index_col):
    """Check that ``read_fwf`` rejects ``names`` longer than the column spec.

    Each parametrized case passes five names together with four columns
    (given as ``colspecs`` or as ``widths``) and an ``index_col`` of ``None``,
    ``True`` or ``False``; all of them must raise ``ValueError``.
    """
    # GH#40830
    # NOTE(review): the padding between fields was reconstructed to match the
    # 6-character columns used by the colspecs/widths above - confirm against
    # the upstream test file.
    data = """col1  col2  col3  col4
    bab   ba    2"""
    msg = "Length of colspecs must match length of names"
    with pytest.raises(ValueError, match=msg):
        read_fwf(
            StringIO(data),
            colspecs=colspecs,
            names=names,
            widths=widths,
            index_col=index_col,
        )
769+
770+
771+
@pytest.mark.parametrize(
    "colspecs, names, widths, index_col, expected",
    [
        # In every case len(names) + number of index columns == 4, the number
        # of fixed-width columns, so no error is expected.
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("abc"),
            None,
            0,
            DataFrame(
                index=["col1", "ba"],
                columns=["a", "b", "c"],
                data=[["col2", "col3", "col4"], ["b   ba", "2", np.nan]],
            ),
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("ab"),
            None,
            [0, 1],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"]],
                columns=["a", "b"],
                data=[["col3", "col4"], ["2", np.nan]],
            ),
        ),
        (
            [(0, 6), (6, 12), (12, 18), (18, None)],
            list("a"),
            None,
            [0, 1, 2],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"], ["col3", "2"]],
                columns=["a"],
                data=[["col4"], [np.nan]],
            ),
        ),
        (
            None,
            list("abc"),
            [6] * 4,
            0,
            DataFrame(
                index=["col1", "ba"],
                columns=["a", "b", "c"],
                data=[["col2", "col3", "col4"], ["b   ba", "2", np.nan]],
            ),
        ),
        (
            None,
            list("ab"),
            [6] * 4,
            [0, 1],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"]],
                columns=["a", "b"],
                data=[["col3", "col4"], ["2", np.nan]],
            ),
        ),
        (
            None,
            list("a"),
            [6] * 4,
            [0, 1, 2],
            DataFrame(
                index=[["col1", "ba"], ["col2", "b   ba"], ["col3", "2"]],
                columns=["a"],
                data=[["col4"], [np.nan]],
            ),
        ),
    ],
)
def test_len_colspecs_len_names_with_index_col(
    colspecs, names, widths, index_col, expected
):
    """Check that ``names`` may omit the columns consumed by ``index_col``.

    Each parametrized case reads four fixed-width columns while passing fewer
    names, with ``index_col`` selecting the remaining leading column(s). The
    result must equal ``expected``, where the selected columns form the index.
    """
    # GH#40830
    # NOTE(review): the padding between fields (and inside the "b   ba" cell
    # of the expected frames) was reconstructed so that the text lines up with
    # the (0, 6), (6, 12), (12, 18), (18, None) slices - confirm against the
    # upstream test file.
    data = """col1  col2  col3  col4
    bab   ba    2"""
    result = read_fwf(
        StringIO(data),
        colspecs=colspecs,
        names=names,
        widths=widths,
        index_col=index_col,
    )
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)