Skip to content

Commit 593fa85

Browse files
BUG: Read CSV on python engine fails when skiprows and chunk size are specified (pandas-dev#55677, pandas-dev#56323) (pandas-dev#56250)
* Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * Fix -GH 55677: Made changes consistent with mypy checking * Fix -GH 55677: Made changes consistent with mypy checking and pre-commit * Fix -GH 55677 & 56323: This commit: -Fixes GH 56323 by replacing the python engine chunksize logic -Fixes formatting on the added test_skiprows test case -Fixes incorrect test in read_fwf that expected an output chunk of size 3 when chunksize=2 was specified. * Trigger CI --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 12b80a6 commit 593fa85

File tree

4 files changed

+43
-15
lines changed

4 files changed

+43
-15
lines changed

doc/source/whatsnew/v2.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,8 @@ MultiIndex
591591

592592
I/O
593593
^^^
594+
- Bug in :func:`read_csv` where ``engine="python"`` did not respect the ``chunksize`` argument when ``skiprows`` was specified. (:issue:`56323`)
595+
- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size were specified. (:issue:`55677`)
594596
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
595597
- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
596598
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)

pandas/io/parsers/python_parser.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -1117,18 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
11171117
new_rows = []
11181118
try:
11191119
if rows is not None:
1120-
rows_to_skip = 0
1121-
if self.skiprows is not None and self.pos is not None:
1122-
# Only read additional rows if pos is in skiprows
1123-
rows_to_skip = len(
1124-
set(self.skiprows) - set(range(self.pos))
1125-
)
1126-
1127-
for _ in range(rows + rows_to_skip):
1120+
row_index = 0
1121+
row_ct = 0
1122+
offset = self.pos if self.pos is not None else 0
1123+
while row_ct < rows:
11281124
# assert for mypy, data is Iterator[str] or None, would
11291125
# error in next
11301126
assert self.data is not None
1131-
new_rows.append(next(self.data))
1127+
new_row = next(self.data)
1128+
if not self.skipfunc(offset + row_index):
1129+
row_ct += 1
1130+
row_index += 1
1131+
new_rows.append(new_row)
11321132

11331133
len_new_rows = len(new_rows)
11341134
new_rows = self._remove_skipped_rows(new_rows)
@@ -1137,11 +1137,11 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
11371137
rows = 0
11381138

11391139
while True:
1140-
new_row = self._next_iter_line(row_num=self.pos + rows + 1)
1140+
next_row = self._next_iter_line(row_num=self.pos + rows + 1)
11411141
rows += 1
11421142

1143-
if new_row is not None:
1144-
new_rows.append(new_row)
1143+
if next_row is not None:
1144+
new_rows.append(next_row)
11451145
len_new_rows = len(new_rows)
11461146

11471147
except StopIteration:

pandas/tests/io/parser/test_read_fwf.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():
898898

899899

900900
def test_skiprows_with_iterator():
901-
# GH#10261
901+
# GH#10261, GH#56323
902902
data = """0
903903
1
904904
2
@@ -920,8 +920,8 @@ def test_skiprows_with_iterator():
920920
)
921921
expected_frames = [
922922
DataFrame({"a": [3, 4]}),
923-
DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),
924-
DataFrame({"a": []}, dtype="object"),
923+
DataFrame({"a": [5, 7]}, index=[2, 3]),
924+
DataFrame({"a": [8]}, index=[4]),
925925
]
926926
for i, result in enumerate(df_iter):
927927
tm.assert_frame_equal(result, expected_frames[i])

pandas/tests/io/parser/test_skiprows.py

+26
Original file line numberDiff line numberDiff line change
@@ -301,3 +301,29 @@ def test_skip_rows_and_n_rows(all_parsers):
301301
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
302302
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
303303
tm.assert_frame_equal(result, expected)
304+
305+
306+
@xfail_pyarrow
307+
def test_skip_rows_with_chunks(all_parsers):
308+
# GH 55677
309+
data = """col_a
310+
10
311+
20
312+
30
313+
40
314+
50
315+
60
316+
70
317+
80
318+
90
319+
100
320+
"""
321+
parser = all_parsers
322+
reader = parser.read_csv(
323+
StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4
324+
)
325+
df1 = next(reader)
326+
df2 = next(reader)
327+
328+
tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
329+
tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))

0 commit comments

Comments
 (0)