Skip to content

Commit 524be40

Browse files
committed
BUG: read_csv(on_bad_lines='warn') did not raise a Python warning (pandas-dev#55071)
1 parent af7b504 commit 524be40

File tree

7 files changed

+62
-60
lines changed

7 files changed

+62
-60
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ MultiIndex
227227

228228
I/O
229229
^^^
230+
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
230231
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
231232

232233
Period

pandas/_libs/parsers.pyx

+9-4
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ from csv import (
66
QUOTE_NONE,
77
QUOTE_NONNUMERIC,
88
)
9-
import sys
109
import time
1110
import warnings
1211

@@ -880,9 +879,15 @@ cdef class TextReader:
880879

881880
cdef _check_tokenize_status(self, int status):
882881
if self.parser.warn_msg != NULL:
883-
print(PyUnicode_DecodeUTF8(
884-
self.parser.warn_msg, strlen(self.parser.warn_msg),
885-
self.encoding_errors), file=sys.stderr)
882+
warnings.warn(
883+
PyUnicode_DecodeUTF8(
884+
self.parser.warn_msg,
885+
strlen(self.parser.warn_msg),
886+
self.encoding_errors
887+
),
888+
ParserWarning,
889+
stacklevel=find_stack_level()
890+
)
886891
free(self.parser.warn_msg)
887892
self.parser.warn_msg = NULL
888893

pandas/io/parsers/python_parser.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -13,23 +13,25 @@
1313
import csv
1414
from io import StringIO
1515
import re
16-
import sys
1716
from typing import (
1817
IO,
1918
TYPE_CHECKING,
2019
DefaultDict,
2120
Literal,
2221
cast,
2322
)
23+
import warnings
2424

2525
import numpy as np
2626

2727
from pandas._libs import lib
2828
from pandas.errors import (
2929
EmptyDataError,
3030
ParserError,
31+
ParserWarning,
3132
)
3233
from pandas.util._decorators import cache_readonly
34+
from pandas.util._exceptions import find_stack_level
3335

3436
from pandas.core.dtypes.common import (
3537
is_bool_dtype,
@@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
778780
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
779781
raise ParserError(msg)
780782
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
781-
base = f"Skipping line {row_num}: "
782-
sys.stderr.write(base + msg + "\n")
783+
warnings.warn(
784+
f"Skipping line {row_num}: {msg}\n",
785+
ParserWarning,
786+
stacklevel=find_stack_level(),
787+
)
783788

784789
def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
785790
"""

pandas/tests/io/parser/common/test_read_errors.py

+14-25
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from pandas.errors import (
1616
EmptyDataError,
1717
ParserError,
18+
ParserWarning,
1819
)
1920

2021
from pandas import DataFrame
@@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers):
129130
parser.read_table("foo.tsv", foo=1)
130131

131132

132-
def test_suppress_error_output(all_parsers, capsys):
133+
def test_suppress_error_output(all_parsers):
133134
# see gh-15925
134135
parser = all_parsers
135136
data = "a\n1\n1,2,3\n4\n5,6,7"
136137
expected = DataFrame({"a": [1, 4]})
137138

138-
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
139+
with tm.assert_produces_warning(None):
140+
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
139141
tm.assert_frame_equal(result, expected)
140142

141-
captured = capsys.readouterr()
142-
assert captured.err == ""
143-
144143

145144
def test_error_bad_lines(all_parsers):
146145
# see gh-15925
@@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers):
152151
parser.read_csv(StringIO(data), on_bad_lines="error")
153152

154153

155-
def test_warn_bad_lines(all_parsers, capsys):
154+
def test_warn_bad_lines(all_parsers):
156155
# see gh-15925
157156
parser = all_parsers
158157
data = "a\n1\n1,2,3\n4\n5,6,7"
159158
expected = DataFrame({"a": [1, 4]})
160159

161-
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
160+
with tm.assert_produces_warning(
161+
ParserWarning, match="Skipping line", check_stacklevel=False
162+
):
163+
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
162164
tm.assert_frame_equal(result, expected)
163165

164-
captured = capsys.readouterr()
165-
assert "Skipping line 3" in captured.err
166-
assert "Skipping line 5" in captured.err
167-
168166

169167
def test_read_csv_wrong_num_columns(all_parsers):
170168
# Too few columns.
@@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers):
245243
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
246244

247245

248-
def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
246+
def test_on_bad_lines_warn_correct_formatting(all_parsers):
249247
# see gh-15925
250248
parser = all_parsers
251249
data = """1,2
@@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
256254
"""
257255
expected = DataFrame({"1": "a", "2": ["b"] * 2})
258256

259-
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
257+
with tm.assert_produces_warning(
258+
ParserWarning, match="Skipping line", check_stacklevel=False
259+
):
260+
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
260261
tm.assert_frame_equal(result, expected)
261-
262-
captured = capsys.readouterr()
263-
if parser.engine == "c":
264-
warn = """Skipping line 3: expected 2 fields, saw 3
265-
Skipping line 4: expected 2 fields, saw 3
266-
267-
"""
268-
else:
269-
warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
270-
Skipping line 4: Expected 2 fields in line 4, saw 3
271-
"""
272-
assert captured.err == warn

pandas/tests/io/parser/test_c_parser_only.py

+16-14
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
from pandas.compat import is_ci_environment
2121
from pandas.compat.numpy import np_version_gte1p24
22-
from pandas.errors import ParserError
22+
from pandas.errors import (
23+
ParserError,
24+
ParserWarning,
25+
)
2326
import pandas.util._test_decorators as td
2427

2528
from pandas import (
@@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only):
461464
tm.assert_frame_equal(result, expected)
462465

463466

464-
def test_comment_whitespace_delimited(c_parser_only, capsys):
467+
def test_comment_whitespace_delimited(c_parser_only):
465468
parser = c_parser_only
466469
test_input = """\
467470
1 2
@@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys):
474477
8# 1 field, NaN
475478
9 2 3 # skipped line
476479
# comment"""
477-
df = parser.read_csv(
478-
StringIO(test_input),
479-
comment="#",
480-
header=None,
481-
delimiter="\\s+",
482-
skiprows=0,
483-
on_bad_lines="warn",
484-
)
485-
captured = capsys.readouterr()
486-
# skipped lines 2, 3, 4, 9
487-
for line_num in (2, 3, 4, 9):
488-
assert f"Skipping line {line_num}" in captured.err
480+
with tm.assert_produces_warning(
481+
ParserWarning, match="Skipping line", check_stacklevel=False
482+
):
483+
df = parser.read_csv(
484+
StringIO(test_input),
485+
comment="#",
486+
header=None,
487+
delimiter="\\s+",
488+
skiprows=0,
489+
on_bad_lines="warn",
490+
)
489491
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
490492
tm.assert_frame_equal(df, expected)
491493

pandas/tests/io/parser/test_python_parser_only.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting):
274274
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
275275

276276

277-
def test_none_delimiter(python_parser_only, capsys):
277+
def test_none_delimiter(python_parser_only):
278278
# see gh-13374 and gh-17465
279279
parser = python_parser_only
280280
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
@@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys):
283283
# We expect the third line in the data to be
284284
# skipped because it is malformed, but we do
285285
# not expect any errors to occur.
286-
result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn")
286+
with tm.assert_produces_warning(
287+
ParserWarning, match="Skipping line 3", check_stacklevel=False
288+
):
289+
result = parser.read_csv(
290+
StringIO(data), header=0, sep=None, on_bad_lines="warn"
291+
)
287292
tm.assert_frame_equal(result, expected)
288293

289-
captured = capsys.readouterr()
290-
assert "Skipping line 3" in captured.err
291-
292294

293295
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
294296
@pytest.mark.parametrize("skipfooter", [0, 1])

pandas/tests/io/parser/test_textreader.py

+7-9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import pandas._libs.parsers as parser
1414
from pandas._libs.parsers import TextReader
15+
from pandas.errors import ParserWarning
1516

1617
from pandas import DataFrame
1718
import pandas._testing as tm
@@ -125,7 +126,7 @@ def test_integer_thousands_alt(self):
125126
expected = DataFrame([123456, 12500])
126127
tm.assert_frame_equal(result, expected)
127128

128-
def test_skip_bad_lines(self, capsys):
129+
def test_skip_bad_lines(self):
129130
# too many lines, see #2430 for why
130131
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
131132

@@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys):
145146
}
146147
assert_array_dicts_equal(result, expected)
147148

148-
reader = TextReader(
149-
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
150-
)
151-
reader.read()
152-
captured = capsys.readouterr()
153-
154-
assert "Skipping line 4" in captured.err
155-
assert "Skipping line 6" in captured.err
149+
with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
150+
reader = TextReader(
151+
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
152+
)
153+
reader.read()
156154

157155
def test_header_not_enough_lines(self):
158156
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

0 commit comments

Comments
 (0)