BUG: fix skiprows callable infinite loop (pandas-dev#45586)

bram2000 · jreback · web-flow · commit de6b11d3479d · 2022-02-06T14:41:08.000-08:00
* BUG: fix skiprows callable infinite loop

* BUG: seperate new test

* add whatsnew entry

* make note user-centric

* add engine

* move note to IO section

* modify comment

* add test for csv

* lint

* formatting

* fix whatsnew

* test not valid for pyxlsb

Co-authored-by: Jeff Reback &lt;jeff@reback.net&gt;
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -335,6 +335,7 @@ MultiIndex
 I/O
 ^^^
 - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`)
+- Bug in :func:`read_excel` results in an infinite loop with certain ``skiprows`` callables (:issue:`45585`)
 - Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`)
 - Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`)
 - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -667,6 +667,8 @@ def _is_line_empty(self, line: list[Scalar]) -> bool:
     def _next_line(self) -> list[Scalar]:
         if isinstance(self.data, list):
             while self.skipfunc(self.pos):
+                if self.pos >= len(self.data):
+                    break
                 self.pos += 1
 
             while True:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1164,6 +1164,31 @@ def test_read_excel_skiprows(self, request, read_ext):
         )
         tm.assert_frame_equal(actual, expected)
 
+    def test_read_excel_skiprows_callable_not_in(self, request, read_ext):
+        # GH 4903
+        if read_ext == ".xlsb":
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason="Sheets containing datetimes not supported by pyxlsb"
+                )
+            )
+
+        actual = pd.read_excel(
+            "testskiprows" + read_ext,
+            sheet_name="skiprows_list",
+            skiprows=lambda x: x not in [1, 3, 5],
+        )
+        expected = DataFrame(
+            [
+                [1, 2.5, pd.Timestamp("2015-01-01"), True],
+                # [2, 3.5, pd.Timestamp("2015-01-02"), False],
+                [3, 4.5, pd.Timestamp("2015-01-03"), False],
+                # [4, 5.5, pd.Timestamp("2015-01-04"), True],
+            ],
+            columns=["a", "b", "c", "d"],
+        )
+        tm.assert_frame_equal(actual, expected)
+
     def test_read_excel_nrows(self, read_ext):
         # GH 16645
         num_rows_to_pull = 5
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
@@ -241,6 +241,17 @@ def test_skip_rows_callable(all_parsers, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
+def test_skip_rows_callable_not_in(all_parsers):
+    parser = all_parsers
+    data = "0,a\n1,b\n2,c\n3,d\n4,e"
+    expected = DataFrame([[1, "b"], [3, "d"]])
+
+    result = parser.read_csv(
+        StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_skip_rows_skip_all(all_parsers):
     parser = all_parsers
     data = "a\n1\n2\n3\n4\n5"