Skip to content

Commit de6b11d

Browse files
bram2000 and jreback authored
BUG: fix skiprows callable infinite loop (pandas-dev#45586)
* BUG: fix skiprows callable infinite loop
* BUG: separate new test
* add whatsnew entry
* make note user-centric
* add engine
* move note to IO section
* modify comment
* add test for csv
* lint
* formatting
* fix whatsnew
* test not valid for pyxlsb

Co-authored-by: Jeff Reback <[email protected]>
1 parent a63db3d commit de6b11d

File tree

4 files changed

+39
-0
lines changed

4 files changed

+39
-0
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ MultiIndex
335335
I/O
336336
^^^
337337
- Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`)
338+
- Bug in :func:`read_excel` where certain ``skiprows`` callables resulted in an infinite loop (:issue:`45585`)
338339
- Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`)
339340
- Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`)
340341
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)

pandas/io/parsers/python_parser.py

+2
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,8 @@ def _is_line_empty(self, line: list[Scalar]) -> bool:
667667
def _next_line(self) -> list[Scalar]:
668668
if isinstance(self.data, list):
669669
while self.skipfunc(self.pos):
670+
if self.pos >= len(self.data):
671+
break
670672
self.pos += 1
671673

672674
while True:

pandas/tests/io/excel/test_readers.py

+25
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,31 @@ def test_read_excel_skiprows(self, request, read_ext):
11641164
)
11651165
tm.assert_frame_equal(actual, expected)
11661166

1167+
def test_read_excel_skiprows_callable_not_in(self, request, read_ext):
1168+
# GH 45585
1169+
if read_ext == ".xlsb":
1170+
request.node.add_marker(
1171+
pytest.mark.xfail(
1172+
reason="Sheets containing datetimes not supported by pyxlsb"
1173+
)
1174+
)
1175+
1176+
actual = pd.read_excel(
1177+
"testskiprows" + read_ext,
1178+
sheet_name="skiprows_list",
1179+
skiprows=lambda x: x not in [1, 3, 5],
1180+
)
1181+
expected = DataFrame(
1182+
[
1183+
[1, 2.5, pd.Timestamp("2015-01-01"), True],
1184+
# [2, 3.5, pd.Timestamp("2015-01-02"), False],
1185+
[3, 4.5, pd.Timestamp("2015-01-03"), False],
1186+
# [4, 5.5, pd.Timestamp("2015-01-04"), True],
1187+
],
1188+
columns=["a", "b", "c", "d"],
1189+
)
1190+
tm.assert_frame_equal(actual, expected)
1191+
11671192
def test_read_excel_nrows(self, read_ext):
11681193
# GH 16645
11691194
num_rows_to_pull = 5

pandas/tests/io/parser/test_skiprows.py

+11
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,17 @@ def test_skip_rows_callable(all_parsers, kwargs, expected):
241241
tm.assert_frame_equal(result, expected)
242242

243243

244+
def test_skip_rows_callable_not_in(all_parsers):
245+
parser = all_parsers
246+
data = "0,a\n1,b\n2,c\n3,d\n4,e"
247+
expected = DataFrame([[1, "b"], [3, "d"]])
248+
249+
result = parser.read_csv(
250+
StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
251+
)
252+
tm.assert_frame_equal(result, expected)
253+
254+
244255
def test_skip_rows_skip_all(all_parsers):
245256
parser = all_parsers
246257
data = "a\n1\n2\n3\n4\n5"

0 commit comments

Comments
 (0)