diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 919ed926f8195..baccd4b45a906 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -289,6 +289,7 @@ MultiIndex I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) +- Bug in :func:`read_excel` results in an infinite loop with certain ``skiprows`` callables (:issue:`45585`) - Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`) - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 52fa3be4ff418..5698fccf84743 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -667,6 +667,8 @@ def _is_line_empty(self, line: list[Scalar]) -> bool: def _next_line(self) -> list[Scalar]: if isinstance(self.data, list): while self.skipfunc(self.pos): + if self.pos >= len(self.data): + break self.pos += 1 while True: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1a6e7e0bf3652..1b37b62870632 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1163,6 +1163,31 @@ def test_read_excel_skiprows(self, request, read_ext): ) tm.assert_frame_equal(actual, expected) + def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + # GH 4903 + if read_ext == ".xlsb": + request.node.add_marker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=lambda x: x not in [1, 3, 5], + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + # [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + # [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(actual, expected) + def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index e88ccf07353b6..c58e27aacfa00 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -241,6 +241,17 @@ def test_skip_rows_callable(all_parsers, kwargs, expected): tm.assert_frame_equal(result, expected) +def test_skip_rows_callable_not_in(all_parsers): + parser = all_parsers + data = "0,a\n1,b\n2,c\n3,d\n4,e" + expected = DataFrame([[1, "b"], [3, "d"]]) + + result = parser.read_csv( + StringIO(data), header=None, skiprows=lambda x: x not in [1, 3] + ) + tm.assert_frame_equal(result, expected) + + def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5"