Skip to content

Commit a8f966b

Browse files
authored
ENH: Allow callable for on_bad_lines in read_csv when engine="python" (#45146)
1 parent 69f4ccf commit a8f966b

File tree

6 files changed

+196
-9
lines changed

6 files changed

+196
-9
lines changed

doc/source/user_guide/io.rst

+28-4
Original file line numberDiff line numberDiff line change
@@ -1305,14 +1305,38 @@ You can elect to skip bad lines:
13051305
0 1 2 3
13061306
1 8 9 10
13071307
1308+
Or pass a callable function to handle the bad line if ``engine="python"``.
1309+
The bad line will be a list of strings that was split by the ``sep``:
1310+
1311+
.. code-block:: ipython
1312+
1313+
In [29]: external_list = []
1314+
1315+
In [30]: def bad_lines_func(line):
1316+
...: external_list.append(line)
1317+
...: return line[-3:]
1318+
1319+
In [31]: pd.read_csv(StringIO(data), on_bad_lines=bad_lines_func, engine="python")
1320+
Out[31]:
1321+
a b c
1322+
0 1 2 3
1323+
1 5 6 7
1324+
2 8 9 10
1325+
1326+
In [32]: external_list
1327+
Out[32]: [['4', '5', '6', '7']]
1328+
1329+
.. versionadded:: 1.4.0
1330+
1331+
13081332
You can also use the ``usecols`` parameter to eliminate extraneous column
13091333
data that appear in some lines but not others:
13101334

13111335
.. code-block:: ipython
13121336
1313-
In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
1337+
In [33]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
13141338
1315-
Out[30]:
1339+
Out[33]:
13161340
a b c
13171341
0 1 2 3
13181342
1 4 5 6
@@ -1324,9 +1348,9 @@ fields are filled with ``NaN``.
13241348

13251349
.. code-block:: ipython
13261350
1327-
In [31]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
1351+
In [34]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
13281352
1329-
Out[31]:
1353+
Out[34]:
13301354
a b c d
13311355
0 1 2 3 NaN
13321356
1 4 5 6 7

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ Other enhancements
208208
- :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
209209
- :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
210210
- Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
211+
- :func:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`)
211212
- :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
212213
- :meth:`read_excel` now accepts a ``decimal`` argument that allows the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
213214
- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)

pandas/io/parsers/python_parser.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -990,7 +990,11 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
990990
actual_len = len(l)
991991

992992
if actual_len > col_len:
993-
if (
993+
if callable(self.on_bad_lines):
994+
new_l = self.on_bad_lines(l)
995+
if new_l is not None:
996+
content.append(new_l)
997+
elif (
994998
self.on_bad_lines == self.BadLineHandleMethod.ERROR
995999
or self.on_bad_lines == self.BadLineHandleMethod.WARN
9961000
):

pandas/io/parsers/readers.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from textwrap import fill
1010
from typing import (
1111
Any,
12+
Callable,
1213
NamedTuple,
1314
)
1415
import warnings
@@ -354,7 +355,7 @@
354355
.. deprecated:: 1.3.0
355356
The ``on_bad_lines`` parameter should be used instead to specify behavior upon
356357
encountering a bad line.
357-
on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error'
358+
on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
358359
Specifies what to do upon encountering a bad line (a line with too many fields).
359360
Allowed values are :
360361
@@ -364,6 +365,16 @@
364365
365366
.. versionadded:: 1.3.0
366367
368+
- callable, function with signature
369+
``(bad_line: list[str]) -> list[str] | None`` that will process a single
370+
bad line. ``bad_line`` is a list of strings split by the ``sep``.
371+
If the function returns ``None``, the bad line will be ignored.
372+
If the function returns a new list of strings with more elements than
373+
expected, a ``ParserWarning`` will be emitted while dropping extra elements.
374+
Only supported when ``engine="python"``.
375+
376+
.. versionadded:: 1.4.0
377+
367378
delim_whitespace : bool, default False
368379
Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
369380
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
@@ -1367,7 +1378,7 @@ def _refine_defaults_read(
13671378
sep: str | object,
13681379
error_bad_lines: bool | None,
13691380
warn_bad_lines: bool | None,
1370-
on_bad_lines: str | None,
1381+
on_bad_lines: str | Callable | None,
13711382
names: ArrayLike | None | object,
13721383
prefix: str | None | object,
13731384
defaults: dict[str, Any],
@@ -1399,7 +1410,7 @@ def _refine_defaults_read(
13991410
Whether to error on a bad line or not.
14001411
warn_bad_lines : str or None
14011412
Whether to warn on a bad line or not.
1402-
on_bad_lines : str or None
1413+
on_bad_lines : str, callable or None
14031414
An option for handling bad lines or a sentinel value(None).
14041415
names : array-like, optional
14051416
List of column names to use. If the file contains a header row,
@@ -1503,6 +1514,12 @@ def _refine_defaults_read(
15031514
kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
15041515
elif on_bad_lines == "skip":
15051516
kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
1517+
elif callable(on_bad_lines):
1518+
if engine != "python":
1519+
raise ValueError(
1520+
"on_bad_line can only be a callable function if engine='python'"
1521+
)
1522+
kwds["on_bad_lines"] = on_bad_lines
15061523
else:
15071524
raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
15081525
else:

pandas/tests/io/parser/test_python_parser_only.py

+130-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
these tests out of this module as soon as the C parser can accept further
55
arguments when parsing.
66
"""
7+
from __future__ import annotations
78

89
import csv
910
from io import (
@@ -13,7 +14,10 @@
1314

1415
import pytest
1516

16-
from pandas.errors import ParserError
17+
from pandas.errors import (
18+
ParserError,
19+
ParserWarning,
20+
)
1721

1822
from pandas import (
1923
DataFrame,
@@ -329,3 +333,128 @@ def readline(self):
329333
return self.data
330334

331335
parser.read_csv(NoNextBuffer("a\n1"))
336+
337+
338+
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
339+
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
340+
# GH 5686
341+
parser = python_parser_only
342+
data = """a,b
343+
1,2
344+
2,3,4,5,6
345+
3,4
346+
"""
347+
bad_sio = StringIO(data)
348+
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
349+
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
350+
tm.assert_frame_equal(result, expected)
351+
352+
353+
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
354+
# GH 5686
355+
parser = python_parser_only
356+
data = """a,b
357+
1,2
358+
2,3,4,5,6
359+
3,4
360+
"""
361+
bad_sio = StringIO(data)
362+
lst = []
363+
364+
def bad_line_func(bad_line: list[str]) -> list[str]:
365+
lst.append(bad_line)
366+
return ["2", "3"]
367+
368+
result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
369+
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
370+
tm.assert_frame_equal(result, expected)
371+
assert lst == [["2", "3", "4", "5", "6"]]
372+
373+
374+
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
375+
@pytest.mark.parametrize("sep", [",", "111"])
376+
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
377+
# GH 5686
378+
# iterator=True takes a different code path than iterator=False
379+
parser = python_parser_only
380+
data = f"""
381+
0{sep}1
382+
hi{sep}there
383+
foo{sep}bar{sep}baz
384+
good{sep}bye
385+
"""
386+
bad_sio = StringIO(data)
387+
result_iter = parser.read_csv(
388+
bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
389+
)
390+
expecteds = [
391+
{"0": "hi", "1": "there"},
392+
{"0": "foo", "1": "bar"},
393+
{"0": "good", "1": "bye"},
394+
]
395+
for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
396+
expected = DataFrame(expected, index=range(i, i + 1))
397+
tm.assert_frame_equal(result, expected)
398+
399+
400+
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
401+
# GH 5686
402+
parser = python_parser_only
403+
data = """a,b
404+
1,2
405+
2,3,4,5,6
406+
3,4
407+
"""
408+
bad_sio = StringIO(data)
409+
msg = "This function is buggy."
410+
411+
def bad_line_func(bad_line):
412+
raise ValueError(msg)
413+
414+
with pytest.raises(ValueError, match=msg):
415+
parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
416+
417+
418+
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
419+
# GH 5686
420+
parser = python_parser_only
421+
data = """a,b
422+
1,2
423+
2,3,4,5,6
424+
3,4
425+
"""
426+
bad_sio = StringIO(data)
427+
428+
with tm.assert_produces_warning(ParserWarning, match="Length of header or names"):
429+
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x)
430+
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
431+
tm.assert_frame_equal(result, expected)
432+
433+
434+
def test_on_bad_lines_callable_returns_none(python_parser_only):
435+
# GH 5686
436+
parser = python_parser_only
437+
data = """a,b
438+
1,2
439+
2,3,4,5,6
440+
3,4
441+
"""
442+
bad_sio = StringIO(data)
443+
444+
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
445+
expected = DataFrame({"a": [1, 3], "b": [2, 4]})
446+
tm.assert_frame_equal(result, expected)
447+
448+
449+
def test_on_bad_lines_index_col_inferred(python_parser_only):
450+
# GH 5686
451+
parser = python_parser_only
452+
data = """a,b
453+
1,2,3
454+
4,5,6
455+
"""
456+
bad_sio = StringIO(data)
457+
458+
result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
459+
expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
460+
tm.assert_frame_equal(result, expected)

pandas/tests/io/parser/test_unsupported.py

+12
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,15 @@ def test_pyarrow_engine(self):
149149
kwargs[default] = "warn"
150150
with pytest.raises(ValueError, match=msg):
151151
read_csv(StringIO(data), engine="pyarrow", **kwargs)
152+
153+
def test_on_bad_lines_callable_python_only(self, all_parsers):
154+
# GH 5686
155+
sio = StringIO("a,b\n1,2")
156+
bad_lines_func = lambda x: x
157+
parser = all_parsers
158+
if all_parsers.engine != "python":
159+
msg = "on_bad_line can only be a callable function if engine='python'"
160+
with pytest.raises(ValueError, match=msg):
161+
parser.read_csv(sio, on_bad_lines=bad_lines_func)
162+
else:
163+
parser.read_csv(sio, on_bad_lines=bad_lines_func)

0 commit comments

Comments
 (0)