From 3f8f7c67334e1008a39b9c54df653f1241a95771 Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Thu, 20 Jan 2022 20:05:20 +0000 Subject: [PATCH 01/12] BUG: fix skiprows callable infinite loop --- pandas/io/parsers/python_parser.py | 2 ++ pandas/tests/io/excel/test_readers.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 52fa3be4ff418..5698fccf84743 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -667,6 +667,8 @@ def _is_line_empty(self, line: list[Scalar]) -> bool: def _next_line(self) -> list[Scalar]: if isinstance(self.data, list): while self.skipfunc(self.pos): + if self.pos >= len(self.data): + break self.pos += 1 while True: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 589c98721f139..51e39cee8c70b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1148,6 +1148,22 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) + actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=lambda x: x not in [1, 3, 5], ) expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], # [2, 3.5, pd.Timestamp("2015-01-02"), False], [3, 4.5, pd.Timestamp("2015-01-03"), False], # [4, 5.5, pd.Timestamp("2015-01-04"), True], ], columns=["a", "b", "c", "d"], ) tm.assert_frame_equal(actual, expected) + actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", From e563d93eaca707ce5ccebd2efbb2a60a3c46835b Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Mon, 24 Jan 2022 12:50:25 +0000 Subject: [PATCH 02/12] BUG: separate new test --- pandas/tests/io/excel/test_readers.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py 
b/pandas/tests/io/excel/test_readers.py index 51e39cee8c70b..fdfca83c5d1e6 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1151,36 +1151,38 @@ def test_read_excel_skiprows(self, request, read_ext): actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", - skiprows=lambda x: x not in [1, 3, 5], + skiprows=3, + names=["a", "b", "c", "d"], ) expected = DataFrame( [ - [1, 2.5, pd.Timestamp("2015-01-01"), True], - # [2, 3.5, pd.Timestamp("2015-01-02"), False], + # [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], [3, 4.5, pd.Timestamp("2015-01-03"), False], - # [4, 5.5, pd.Timestamp("2015-01-04"), True], + [4, 5.5, pd.Timestamp("2015-01-04"), True], ], columns=["a", "b", "c", "d"], ) tm.assert_frame_equal(actual, expected) + def test_read_excel_skiprows_callable_not_in(self, request, read_ext): actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", - skiprows=3, - names=["a", "b", "c", "d"], + skiprows=lambda x: x not in [1, 3, 5], ) expected = DataFrame( [ - # [1, 2.5, pd.Timestamp("2015-01-01"), True], - [2, 3.5, pd.Timestamp("2015-01-02"), False], + [1, 2.5, pd.Timestamp("2015-01-01"), True], + # [2, 3.5, pd.Timestamp("2015-01-02"), False], [3, 4.5, pd.Timestamp("2015-01-03"), False], - [4, 5.5, pd.Timestamp("2015-01-04"), True], + # [4, 5.5, pd.Timestamp("2015-01-04"), True], ], columns=["a", "b", "c", "d"], ) tm.assert_frame_equal(actual, expected) + def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 From 0e09f5a1107e6bf358a01e0212a51a863396d03e Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Mon, 24 Jan 2022 15:16:05 +0000 Subject: [PATCH 03/12] add whatsnew entry --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1ae76984484af..ce3f2629def28 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ 
b/doc/source/whatsnew/v1.5.0.rst @@ -194,6 +194,7 @@ Conversion - Bug in constructing a :class:`Series` from a float-containing list or a floating-dtype ndarray-like (e.g. ``dask.Array``) and an integer dtype raising instead of casting like we would with an ``np.ndarray`` (:issue:`40110`) - Bug in :meth:`Float64Index.astype` to unsigned integer dtype incorrectly casting to ``np.int64`` dtype (:issue:`45309`) - Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` from floating dtype to unsigned integer dtype failing to raise in the presence of negative values (:issue:`45151`) +- Bug in :meth:`python_parser._next_line` causing an infinite loop with certain `skiprows` callables (:issue:`45585`) - Strings From 30cfec642059972527db25ec542bc7822a94ccc9 Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Mon, 24 Jan 2022 16:48:37 +0000 Subject: [PATCH 04/12] make note user-centric --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index ce3f2629def28..dbaed6bf08284 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -194,7 +194,7 @@ Conversion - Bug in constructing a :class:`Series` from a float-containing list or a floating-dtype ndarray-like (e.g. 
``dask.Array``) and an integer dtype raising instead of casting like we would with an ``np.ndarray`` (:issue:`40110`) - Bug in :meth:`Float64Index.astype` to unsigned integer dtype incorrectly casting to ``np.int64`` dtype (:issue:`45309`) - Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` from floating dtype to unsigned integer dtype failing to raise in the presence of negative values (:issue:`45151`) -- Bug in :meth:`python_parser._next_line` causing an infinite loop with certain `skiprows` callables (:issue:`45585`) +- Bug in :meth:`read_excel` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) - Strings From 831f69cb1062705e5ba0122ee269be6d8d8e4f5f Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Mon, 24 Jan 2022 16:51:25 +0000 Subject: [PATCH 05/12] add engine --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index dbaed6bf08284..6c9b78438e0ce 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -194,7 +194,7 @@ Conversion - Bug in constructing a :class:`Series` from a float-containing list or a floating-dtype ndarray-like (e.g. 
``dask.Array``) and an integer dtype raising instead of casting like we would with an ``np.ndarray`` (:issue:`40110`) - Bug in :meth:`Float64Index.astype` to unsigned integer dtype incorrectly casting to ``np.int64`` dtype (:issue:`45309`) - Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` from floating dtype to unsigned integer dtype failing to raise in the presence of negative values (:issue:`45151`) -- Bug in :meth:`read_excel` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) +- Bug in :meth:`read_excel` with `engine=python` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) - Strings From ac235d4c3215212d13fb2c84f2a112532f6f9bb1 Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Thu, 27 Jan 2022 13:00:50 +0000 Subject: [PATCH 06/12] move note to IO section --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6c9b78438e0ce..4fb626ef586f1 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -194,7 +194,6 @@ Conversion - Bug in constructing a :class:`Series` from a float-containing list or a floating-dtype ndarray-like (e.g. 
``dask.Array``) and an integer dtype raising instead of casting like we would with an ``np.ndarray`` (:issue:`40110`) - Bug in :meth:`Float64Index.astype` to unsigned integer dtype incorrectly casting to ``np.int64`` dtype (:issue:`45309`) - Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` from floating dtype to unsigned integer dtype failing to raise in the presence of negative values (:issue:`45151`) -- Bug in :meth:`read_excel` with `engine=python` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) - Strings @@ -228,6 +227,7 @@ MultiIndex I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) +- Bug in :meth:`read_excel` with `engine=python` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) - Period From 9df6e0187770e6f90be014afc4506a44c12eba82 Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Thu, 27 Jan 2022 22:15:18 +0000 Subject: [PATCH 07/12] modify comment --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4fb626ef586f1..f05d933062b34 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -227,7 +227,7 @@ MultiIndex I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) -- Bug in :meth:`read_excel` with `engine=python` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) +- Bug in :meth:`read_excel` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) - Period From 85d5ee1cd75e18085baad3fe0eb8f8d2d51c56b4 Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Fri, 28 Jan 2022 11:13:49 +0000 Subject: [PATCH 08/12] add test for csv --- pandas/tests/io/parser/test_skiprows.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index e88ccf07353b6..a6954573d2e8d 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -240,6 +240,14 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) tm.assert_frame_equal(result, expected) +def test_skip_rows_callable_not_in(all_parsers): + parser = all_parsers + data = "0,a\n1,b\n2,c\n3,d\n4,e" + expected = DataFrame([[1,"b"], [3,"d"]]) + + result = parser.read_csv(StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]) + tm.assert_frame_equal(result, expected) + def test_skip_rows_skip_all(all_parsers): parser = all_parsers From a1f9187859a8994fd6da058c12bbbedf9f4d639f Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Fri, 28 Jan 2022 11:15:53 +0000 Subject: [PATCH 09/12] lint --- pandas/tests/io/parser/test_skiprows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index a6954573d2e8d..81fafe76b388d 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -240,12 +240,14 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) tm.assert_frame_equal(result, expected) + def test_skip_rows_callable_not_in(all_parsers): parser = all_parsers data = "0,a\n1,b\n2,c\n3,d\n4,e" - expected = DataFrame([[1,"b"], [3,"d"]]) + expected = DataFrame([[1, "b"], [3, "d"]]) - result = parser.read_csv(StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]) + result = parser.read_csv( + StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]) tm.assert_frame_equal(result, expected) From 3e41e7d3e57a885cd833b4d59eece2a8ffd328ac Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Fri, 28 Jan 2022 
11:42:23 +0000 Subject: [PATCH 10/12] formatting --- pandas/tests/io/excel/test_readers.py | 1 - pandas/tests/io/parser/test_skiprows.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fdfca83c5d1e6..dbebb40ea63d2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1182,7 +1182,6 @@ def test_read_excel_skiprows_callable_not_in(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 81fafe76b388d..c58e27aacfa00 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -247,7 +247,8 @@ def test_skip_rows_callable_not_in(all_parsers): expected = DataFrame([[1, "b"], [3, "d"]]) result = parser.read_csv( - StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]) + StringIO(data), header=None, skiprows=lambda x: x not in [1, 3] + ) tm.assert_frame_equal(result, expected) From 196b055934a2977e611fcfe4a90ccdc56dcb07ca Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Fri, 28 Jan 2022 13:19:48 +0000 Subject: [PATCH 11/12] fix whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 0698d8d8b7a28..ae0d3723d883c 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -281,7 +281,7 @@ MultiIndex I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) -- Bug in :meth:`read_excel` results in an infinite loop with certain `skiprows` callables (:issue:`45585`) +- Bug in :func:`read_excel` results in an infinite loop with certain ``skiprows`` callables (:issue:`45585`) - 
Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - From 3abc40a04998392203c1fbc68026cec3e1c7f4da Mon Sep 17 00:00:00 2001 From: Jon Bramley Date: Fri, 28 Jan 2022 18:28:55 +0000 Subject: [PATCH 12/12] test not valid for pyxlsb --- pandas/tests/io/excel/test_readers.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index dbebb40ea63d2..684a8a0c1ad3e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1166,6 +1166,14 @@ def test_read_excel_skiprows(self, request, read_ext): tm.assert_frame_equal(actual, expected) def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + # GH 4903 + if read_ext == ".xlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list",