Skip to content

BUG: read_csv(on_bad_lines='warn') did not raise a Python warning #55071

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ MultiIndex

I/O
^^^
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)

Period
Expand Down
13 changes: 9 additions & 4 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ from csv import (
QUOTE_NONE,
QUOTE_NONNUMERIC,
)
import sys
import time
import warnings

Expand Down Expand Up @@ -880,9 +879,15 @@ cdef class TextReader:

cdef _check_tokenize_status(self, int status):
if self.parser.warn_msg != NULL:
print(PyUnicode_DecodeUTF8(
self.parser.warn_msg, strlen(self.parser.warn_msg),
self.encoding_errors), file=sys.stderr)
warnings.warn(
PyUnicode_DecodeUTF8(
self.parser.warn_msg,
strlen(self.parser.warn_msg),
self.encoding_errors
),
ParserWarning,
stacklevel=find_stack_level()
)
free(self.parser.warn_msg)
self.parser.warn_msg = NULL

Expand Down
11 changes: 8 additions & 3 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,25 @@
import csv
from io import StringIO
import re
import sys
from typing import (
IO,
TYPE_CHECKING,
DefaultDict,
Literal,
cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
raise ParserError(msg)
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
base = f"Skipping line {row_num}: "
sys.stderr.write(base + msg + "\n")
warnings.warn(
f"Skipping line {row_num}: {msg}\n",
ParserWarning,
stacklevel=find_stack_level(),
)

def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
"""
Expand Down
39 changes: 14 additions & 25 deletions pandas/tests/io/parser/common/test_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)

from pandas import DataFrame
Expand Down Expand Up @@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers):
parser.read_table("foo.tsv", foo=1)


def test_suppress_error_output(all_parsers, capsys):
def test_suppress_error_output(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})

result = parser.read_csv(StringIO(data), on_bad_lines="skip")
with tm.assert_produces_warning(None):
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert captured.err == ""


def test_error_bad_lines(all_parsers):
# see gh-15925
Expand All @@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers):
parser.read_csv(StringIO(data), on_bad_lines="error")


def test_warn_bad_lines(all_parsers, capsys):
def test_warn_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})

result = parser.read_csv(StringIO(data), on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert "Skipping line 3" in captured.err
assert "Skipping line 5" in captured.err


def test_read_csv_wrong_num_columns(all_parsers):
# Too few columns.
Expand Down Expand Up @@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers):
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")


def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
def test_on_bad_lines_warn_correct_formatting(all_parsers):
# see gh-15925
parser = all_parsers
data = """1,2
Expand All @@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
"""
expected = DataFrame({"1": "a", "2": ["b"] * 2})

result = parser.read_csv(StringIO(data), on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
if parser.engine == "c":
warn = """Skipping line 3: expected 2 fields, saw 3
Skipping line 4: expected 2 fields, saw 3

"""
else:
warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
Skipping line 4: Expected 2 fields in line 4, saw 3
"""
assert captured.err == warn
30 changes: 16 additions & 14 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

from pandas.compat import is_ci_environment
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import ParserError
from pandas.errors import (
ParserError,
ParserWarning,
)
import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only):
tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only, capsys):
def test_comment_whitespace_delimited(c_parser_only):
parser = c_parser_only
test_input = """\
1 2
Expand All @@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys):
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
captured = capsys.readouterr()
# skipped lines 2, 3, 4, 9
for line_num in (2, 3, 4, 9):
assert f"Skipping line {line_num}" in captured.err
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
tm.assert_frame_equal(df, expected)

Expand Down
12 changes: 7 additions & 5 deletions pandas/tests/io/parser/test_python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)


def test_none_delimiter(python_parser_only, capsys):
def test_none_delimiter(python_parser_only):
# see gh-13374 and gh-17465
parser = python_parser_only
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
Expand All @@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys):
# We expect the third line in the data to be
# skipped because it is malformed, but we do
# not expect any errors to occur.
result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line 3", check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), header=0, sep=None, on_bad_lines="warn"
)
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert "Skipping line 3" in captured.err


@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
Expand Down
16 changes: 7 additions & 9 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas._testing as tm
Expand Down Expand Up @@ -125,7 +126,7 @@ def test_integer_thousands_alt(self):
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)

def test_skip_bad_lines(self, capsys):
def test_skip_bad_lines(self):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

Expand All @@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys):
}
assert_array_dicts_equal(result, expected)

reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()
captured = capsys.readouterr()

assert "Skipping line 4" in captured.err
assert "Skipping line 6" in captured.err
with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()

def test_header_not_enough_lines(self):
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
Expand Down