From 53a17688a4d36af67bea3e8b2623072250851454 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Fri, 20 Aug 2021 10:48:03 +0530 Subject: [PATCH 01/13] BUG: Throw a ParserError when header rows have unequal column counts (GH43102) --- pandas/io/parsers/base_parser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5714bbab016c8..25158de1f3943 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -341,6 +341,11 @@ def _extract_multi_indexer_columns( # extract the columns field_count = len(header[0]) + #check if header lengths are equal + for l in range(len(header)): + if len(header[l])!=field_count: + raise ParserError(f"Header rows must have equal number of columns. Mismatch found at row " + str(l)) + def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) From 128b4e3c01256287945a430bd99a29bab2e20459 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Fri, 20 Aug 2021 11:19:23 +0530 Subject: [PATCH 02/13] BUG: Throw a ParserError when header rows have unequal column counts. Updated to comply with PEP8 (GH43102) --- pandas/io/parsers/base_parser.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 25158de1f3943..a228e2e9dc478 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -341,11 +341,12 @@ def _extract_multi_indexer_columns( # extract the columns field_count = len(header[0]) - #check if header lengths are equal - for l in range(len(header)): - if len(header[l])!=field_count: - raise ParserError(f"Header rows must have equal number of columns. Mismatch found at row " + str(l)) - + # check if header lengths are equal + for header_len in range(len(header)): + if len(header[header_len]) != field_count: + raise ParserError("Header rows must have equal number of columns." 
+ f" Mismatch found at row {header_len}") + def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) From 95bac98b2ba892e3e90a152446046080a02f5c3f Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Fri, 20 Aug 2021 22:00:27 +0530 Subject: [PATCH 03/13] Added Test. (GH43102) --- pandas/io/parsers/base_parser.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a228e2e9dc478..df68a180afce3 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -342,10 +342,12 @@ def _extract_multi_indexer_columns( field_count = len(header[0]) # check if header lengths are equal - for header_len in range(len(header)): - if len(header[header_len]) != field_count: - raise ParserError("Header rows must have equal number of columns." - f" Mismatch found at row {header_len}") + for header_iter in range(len(header)): + if len(header[header_iter]) != field_count: + raise ParserError( + "Header rows must have equal number of columns. " + f"Mismatch found at header {header_iter}." + ) def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) From 10422a87d3ecf1a5d1d29422a85f7ab7ee5f5b82 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Fri, 20 Aug 2021 22:06:40 +0530 Subject: [PATCH 04/13] Added Test. 
(GH43102) --- pandas/tests/io/parser/test_header.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 3b814360d3aa4..8c93e40d11a8f 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -585,3 +585,20 @@ def test_read_csv_multiindex_columns(all_parsers): tm.assert_frame_equal(df1, expected.iloc[:1]) df2 = parser.read_csv(StringIO(s2), header=[0, 1]) tm.assert_frame_equal(df2, expected) + + +def test_read_csv_multi_header_length_check(all_parsers): + # GH#43102 + parser = all_parsers + + case = """row11,row12,row13 +row21,row22, row23 +row31,row32 +""" + + with pytest.raises( + ValueError, + match="Header rows must have equal number of columns. " + "Mismatch found at header 1.", + ): + parser.read_csv(StringIO(case), sep=",", header=[0, 2]) From 658c291bf4458c0b728cd9cbe2b270ac503b8e8a Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Fri, 20 Aug 2021 22:19:56 +0530 Subject: [PATCH 05/13] Added Test. (GH43102) --- pandas/tests/io/parser/test_header.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 8c93e40d11a8f..e902cab485b35 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -597,7 +597,7 @@ def test_read_csv_multi_header_length_check(all_parsers): """ with pytest.raises( - ValueError, + ParserError, match="Header rows must have equal number of columns. " "Mismatch found at header 1.", ): From a02d476a626889314879502f4626b15087233210 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Sat, 21 Aug 2021 00:11:48 +0530 Subject: [PATCH 06/13] Added Changes. 
(GH43102) --- pandas/io/parsers/base_parser.py | 8 ++------ pandas/tests/io/parser/test_header.py | 6 ++---- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index df68a180afce3..34c51632bff2d 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -342,12 +342,8 @@ def _extract_multi_indexer_columns( field_count = len(header[0]) # check if header lengths are equal - for header_iter in range(len(header)): - if len(header[header_iter]) != field_count: - raise ParserError( - "Header rows must have equal number of columns. " - f"Mismatch found at header {header_iter}." - ) + if not all(len(header_iter) == field_count for header_iter in header[1:]): + raise ParserError("Header rows must have an equal number of columns.") def extract(r): return tuple(r[i] for i in range(field_count) if i not in sic) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index e902cab485b35..2a8f41a3f775e 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -597,8 +597,6 @@ def test_read_csv_multi_header_length_check(all_parsers): """ with pytest.raises( - ParserError, - match="Header rows must have equal number of columns. " - "Mismatch found at header 1.", + ParserError, match="Header rows must have an equal number of columns." 
): - parser.read_csv(StringIO(case), sep=",", header=[0, 2]) + parser.read_csv(StringIO(case), header=[0, 2]) From 5f534ea8f2eeab7b07b9d6972ef0b45c2493c70d Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Mon, 23 Aug 2021 00:12:59 +0530 Subject: [PATCH 07/13] Added whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 205a49e7786a7..6649467e2cb9f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -319,7 +319,7 @@ I/O - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) -- +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raises an ``IndexError`` (:issue:`43102`) Period ^^^^^^ From 5239ece71cdc8abf22b4d0788ebb4129c4c0fd5c Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Mon, 23 Aug 2021 02:16:34 +0530 Subject: [PATCH 08/13] Added whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 6649467e2cb9f..729a3bbb73057 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -319,7 +319,7 @@ I/O - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) - Bug in :func:`Series.to_json` 
and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) -- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raises an ``IndexError`` (:issue:`43102`) +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) Period ^^^^^^ From 863e9960dc9bd204389e4daeb575d12079f54473 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Tue, 24 Aug 2021 00:16:29 +0530 Subject: [PATCH 09/13] Test without whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 729a3bbb73057..205a49e7786a7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -319,7 +319,7 @@ I/O - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) -- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) +- Period ^^^^^^ From 532e6cb8e0613cc35c300671572b787861525f02 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Tue, 24 Aug 2021 01:16:14 +0530 Subject: [PATCH 10/13] Add whatsnew again --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 205a49e7786a7..729a3bbb73057 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ 
b/doc/source/whatsnew/v1.4.0.rst @@ -319,7 +319,7 @@ I/O - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) -- +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) Period ^^^^^^ From 1caf42db6d7ea1e4a689c4dd70745a8b0e32779b Mon Sep 17 00:00:00 2001 From: quantumalaviya <45961148+quantumalaviya@users.noreply.github.com> Date: Sun, 5 Sep 2021 09:50:02 +0530 Subject: [PATCH 11/13] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a0b1c4e80ab75..e2b42a0ea14f5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -377,7 +377,7 @@ I/O - Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) -- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) +- Period ^^^^^^ From 2ca6ccf8a0e6026528090ca5b846126507c438d2 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Sun, 5 Sep 2021 10:05:51 +0530 Subject: [PATCH 12/13] Merge upstream --- 
doc/source/whatsnew/v1.4.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a44825a78b1a0..52cbaa033e707 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -382,7 +382,8 @@ I/O - Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) - Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) -- +- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raised an uncontrolled ``IndexError`` (:issue:`43102`) Period ^^^^^^ From 3f1fb3944bec76ede3adffae1a0f98cc0fba8939 Mon Sep 17 00:00:00 2001 From: Sarvagya Malaviya Date: Sun, 5 Sep 2021 13:56:45 +0530 Subject: [PATCH 13/13] Skipping test on PyArrow --- pandas/tests/io/parser/test_header.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index af08c819baddd..d4b87070720d1 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -606,6 +606,7 @@ def test_read_csv_multiindex_columns(all_parsers): tm.assert_frame_equal(df2, expected) +@skip_pyarrow def test_read_csv_multi_header_length_check(all_parsers): # GH#43102 parser = all_parsers