diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9aebcad1d8cae..000353aee99a0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -285,6 +285,7 @@ I/O ^^^ - Bug in :meth:`DataFrame.to_stata` where no error is raised if the :class:`DataFrame` contains ``-np.inf`` (:issue:`45350`) - Bug in :meth:`DataFrame.info` where a new line at the end of the output is omitted when called on an empty :class:`DataFrame` (:issue:`45494`) +- Bug in :func:`read_csv` not recognizing line break for ``on_bad_lines="warn"`` for ``engine="c"`` (:issue:`41710`) - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 08c885fba172a..b4d2c60837a7e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license +from base64 import decode from csv import ( QUOTE_MINIMAL, QUOTE_NONE, @@ -839,7 +840,9 @@ cdef class TextReader: status = tokenize_nrows(self.parser, nrows, self.encoding_errors) if self.parser.warn_msg != NULL: - print(self.parser.warn_msg, file=sys.stderr) + print(PyUnicode_DecodeUTF8( + self.parser.warn_msg, strlen(self.parser.warn_msg), + self.encoding_errors), file=sys.stderr) free(self.parser.warn_msg) self.parser.warn_msg = NULL @@ -868,7 +871,9 @@ cdef class TextReader: status = tokenize_all_rows(self.parser, self.encoding_errors) if self.parser.warn_msg != NULL: - print(self.parser.warn_msg, file=sys.stderr) + print(PyUnicode_DecodeUTF8( + self.parser.warn_msg, strlen(self.parser.warn_msg), + self.encoding_errors), file=sys.stderr) free(self.parser.warn_msg) self.parser.warn_msg = NULL diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index fe00afb4fdc1d..2274646ae7c69 100644 --- 
a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -278,3 +278,36 @@ def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines):
         "Please only set on_bad_lines.",
     ):
         parser.read_csv(StringIO(data), on_bad_lines="error", **kwds)
+
+
+def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
+    # see gh-41710
+    # With on_bad_lines="warn", rows with too many fields are skipped and one
+    # message per skipped row must reach stderr as text with real line breaks.
+    parser = all_parsers
+    # Header plus four rows; lines 3 and 4 of the input carry a third field
+    # and are the ones expected to be skipped.
+    data = """1,2
+a,b
+a,b,c
+a,b,d
+a,b
+"""
+    expected = DataFrame({"1": "a", "2": ["b"] * 2})
+
+    result = parser.read_csv(StringIO(data), on_bad_lines="warn")
+    tm.assert_frame_equal(result, expected)
+
+    captured = capsys.readouterr()
+    if parser.engine == "c":
+        # The C parser writes its warning text with print(), which appends a
+        # newline of its own -- hence the blank line before the closing quotes.
+        warn = """Skipping line 3: expected 2 fields, saw 3
+Skipping line 4: expected 2 fields, saw 3
+
+"""
+    else:
+        warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
+Skipping line 4: Expected 2 fields in line 4, saw 3
+"""
+    assert captured.err == warn