BUG: Read CSV on python engine fails when skiprows and chunk size are specified (pandas-dev#55677, pandas-dev#56323) (pandas-dev#56250)

Flytre · pre-commit-ci[bot] · web-flow · commit 593fa85504e3 · 2023-12-04T16:27:36.000-08:00
* Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. * Fix -GH 55677: Made changes consistment with mypy checking * Fix -GH 55677: Made changes consistment with mypy checking and pre-commit * Fix -GH 55677 & 56323: This commit: -Fixes GH 56323 by replacing the python engine chunksize logic -Fixes formatting on the added test_skiprows test case -Fixes incorrect test in read_fwf that expected an output chunk of size 3 when chunksize=2 was specified. * Trigger CI --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -591,6 +591,8 @@ MultiIndex
 
 I/O
 ^^^
+- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`)
+- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`)
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -1117,18 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                 new_rows = []
                 try:
                     if rows is not None:
-                        rows_to_skip = 0
-                        if self.skiprows is not None and self.pos is not None:
-                            # Only read additional rows if pos is in skiprows
-                            rows_to_skip = len(
-                                set(self.skiprows) - set(range(self.pos))
-                            )
-
-                        for _ in range(rows + rows_to_skip):
+                        row_index = 0
+                        row_ct = 0
+                        offset = self.pos if self.pos is not None else 0
+                        while row_ct < rows:
                             # assert for mypy, data is Iterator[str] or None, would
                             # error in next
                             assert self.data is not None
-                            new_rows.append(next(self.data))
+                            new_row = next(self.data)
+                            if not self.skipfunc(offset + row_index):
+                                row_ct += 1
+                            row_index += 1
+                            new_rows.append(new_row)
 
                         len_new_rows = len(new_rows)
                         new_rows = self._remove_skipped_rows(new_rows)
@@ -1137,11 +1137,11 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                         rows = 0
 
                         while True:
-                            new_row = self._next_iter_line(row_num=self.pos + rows + 1)
+                            next_row = self._next_iter_line(row_num=self.pos + rows + 1)
                             rows += 1
 
-                            if new_row is not None:
-                                new_rows.append(new_row)
+                            if next_row is not None:
+                                new_rows.append(next_row)
                         len_new_rows = len(new_rows)
 
                 except StopIteration:
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():
 
 
 def test_skiprows_with_iterator():
-    # GH#10261
+    # GH#10261, GH#56323
     data = """0
 1
 2
@@ -920,8 +920,8 @@ def test_skiprows_with_iterator():
     )
     expected_frames = [
         DataFrame({"a": [3, 4]}),
-        DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),
-        DataFrame({"a": []}, dtype="object"),
+        DataFrame({"a": [5, 7]}, index=[2, 3]),
+        DataFrame({"a": [8]}, index=[4]),
     ]
     for i, result in enumerate(df_iter):
         tm.assert_frame_equal(result, expected_frames[i])
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
@@ -301,3 +301,29 @@ def test_skip_rows_and_n_rows(all_parsers):
     result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
     expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_skip_rows_with_chunks(all_parsers):
+    # GH 55677
+    data = """col_a
+10
+20
+30
+40
+50
+60
+70
+80
+90
+100
+"""
+    parser = all_parsers
+    reader = parser.read_csv(
+        StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4
+    )
+    df1 = next(reader)
+    df2 = next(reader)
+
+    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
+    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))

Original file line number	Diff line number	Diff line change
`@@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():`
`898`	`898`
`899`	`899`
`900`	`900`	`def test_skiprows_with_iterator():`
`901`		`- # GH#10261`
	`901`	`+ # GH#10261, GH#56323`
`902`	`902`	`data = """0`
`903`	`903`	`1`
`904`	`904`	`2`
`@@ -920,8 +920,8 @@ def test_skiprows_with_iterator():`
`920`	`920`	`)`
`921`	`921`	`expected_frames = [`
`922`	`922`	`DataFrame({"a": [3, 4]}),`
`923`		`- DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),`
`924`		`- DataFrame({"a": []}, dtype="object"),`
	`923`	`+ DataFrame({"a": [5, 7]}, index=[2, 3]),`
	`924`	`+ DataFrame({"a": [8]}, index=[4]),`
`925`	`925`	`]`
`926`	`926`	`for i, result in enumerate(df_iter):`
`927`	`927`	`tm.assert_frame_equal(result, expected_frames[i])`