From 3a21002278be004460c41c9fe49c43aa690e0d88 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 6 Dec 2020 00:42:08 +0100 Subject: [PATCH 1/4] BUG: read_csv raising IndexError with multiple header cols, specified index_col and no data rows --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/parsers.py | 11 ++++++++--- pandas/tests/io/parser/test_index_col.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ac930b3e77785..316f7ccbfbf29 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -729,6 +729,7 @@ I/O - :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) - Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) +- Bug in :meth:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) Period ^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5b623c360c3ef..2a0d7693de99c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1465,9 +1465,14 @@ def _extract_multi_indexer_columns( # clean the index_names index_names = header.pop(-1) - index_names, names, index_col = _clean_index_names( - index_names, self.index_col, self.unnamed_cols - ) + if not index_names: + # In case of no rows and multiindex columns we have to set index_names to + # list of Nones GH#38292 + index_names = [None] * len(ic) + else: + index_names, _, _ = _clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) # extract the columns field_count = len(header[0]) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 
9c6cad4b41949..cd2d4402cb801 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -222,3 +222,18 @@ def test_index_col_large_csv(all_parsers): result = parser.read_csv(path, index_col=[0]) tm.assert_frame_equal(result, df.set_index("a")) + + +def test_index_col_multiindex_columns_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv( + StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1], index_col=0 + ) + expected = DataFrame( + [], + columns=MultiIndex.from_arrays( + [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"] + ), + ) + tm.assert_frame_equal(result, expected) From 415c246f8ec739472c931bd5b6f311117c32a803 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 8 Dec 2020 00:43:13 +0100 Subject: [PATCH 2/4] Address comments --- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 316f7ccbfbf29..d7262021b744a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -726,10 +726,10 @@ I/O - :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`) - Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`) - Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`) -- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`) +- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:`read_*` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) - Bumped minimum xarray version 
to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) -- Bug in :meth:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) +- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) Period ^^^^^^ From 6371423b0c1fd241058f4d0a1f07b0787b3390c7 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 9 Dec 2020 00:45:47 +0100 Subject: [PATCH 3/4] Add tests and move whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/io/parsers.py | 16 +++++----- pandas/tests/io/parser/test_index_col.py | 38 ++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8a0cc9af726dd..e398b91b20fdb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -745,7 +745,6 @@ I/O - :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:`read_*` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) - Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) -- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) Period ^^^^^^ diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..068cd36c65f1f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -146,7 +146,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`read_csv` raising ``IndexError`` with multiple header columns and ``index_col`` specified when file has no data rows (:issue:`38292`) - Period diff --git a/pandas/io/parsers.py 
b/pandas/io/parsers.py index 2a0d7693de99c..ffde23fc7ee05 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1465,14 +1465,9 @@ def _extract_multi_indexer_columns( # clean the index_names index_names = header.pop(-1) - if not index_names: - # In case of no rows and multiindex columns we have to set index_names to - # list of Nones GH#38292 - index_names = [None] * len(ic) - else: - index_names, _, _ = _clean_index_names( - index_names, self.index_col, self.unnamed_cols - ) + index_names, _, _ = _clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) # extract the columns field_count = len(header[0]) @@ -3477,6 +3472,11 @@ def _clean_index_names(columns, index_col, unnamed_cols): columns = list(columns) + # In case of no rows and multiindex columns we have to set index_names to + # list of Nones GH#38292 + if not columns: + return [None] * len(index_col), columns, index_col + cp_cols = list(columns) index_names = [] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index cd2d4402cb801..5ad26780dddb9 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -237,3 +237,41 @@ def test_index_col_multiindex_columns_no_data(all_parsers): ), ) tm.assert_frame_equal(result, expected) + + +def test_index_col_header_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv(StringIO("a0,a1,a2\n"), header=[0], index_col=0) + expected = DataFrame( + [], + columns=["a1", "a2"], + index=Index([], name="a0"), + ) + tm.assert_frame_equal(result, expected) + + +def test_multiindex_columns_no_data(all_parsers): + # GH#38292 + parser = all_parsers + result = parser.read_csv(StringIO("a0,a1,a2\nb0,b1,b2\n"), header=[0, 1]) + expected = DataFrame( + [], columns=MultiIndex.from_arrays([["a0", "a1", "a2"], ["b0", "b1", "b2"]]) + ) + tm.assert_frame_equal(result, expected) + + +def test_multiindex_columns_index_col_with_data(all_parsers): + 
# GH#38292 + parser = all_parsers + result = parser.read_csv( + StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data"), header=[0, 1], index_col=0 + ) + expected = DataFrame( + [["data", "data"]], + columns=MultiIndex.from_arrays( + [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"] + ), + index=Index(["data"]), + ) + tm.assert_frame_equal(result, expected) From 27dbe008f4b68b2a468f4c2f62de902516b43c1a Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 11 Dec 2020 18:22:17 +0100 Subject: [PATCH 4/4] Revert change --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e398b91b20fdb..ef0a1eb7e82e3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -742,7 +742,7 @@ I/O - :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`) - Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`) - Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`) -- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:`read_*` functions (:issue:`37909`) +- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) - Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`)