From 4e867e9d1302f6e0071e7833d99a398dcc663a14 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Mar 2021 20:55:46 -0800 Subject: [PATCH 01/21] Deprecate --- doc/source/user_guide/io.rst | 21 +++++++++++- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 34 ++++++++++++++----- pandas/_libs/src/parser/tokenizer.c | 7 ++-- pandas/_libs/src/parser/tokenizer.h | 9 +++-- pandas/io/parsers/c_parser_wrapper.py | 3 -- pandas/io/parsers/python_parser.py | 31 ++++++++++++----- pandas/io/parsers/readers.py | 29 ++++++++++++++-- .../io/parser/common/test_common_basic.py | 17 ++++++++++ .../io/parser/common/test_read_errors.py | 31 +++++++++++++---- 10 files changed, 147 insertions(+), 36 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cf153ddd2cbbd..d86bb906cdfec 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -349,10 +349,29 @@ error_bad_lines : boolean, default ``True`` returned. If ``False``, then these "bad lines" will dropped from the ``DataFrame`` that is returned. See :ref:`bad lines ` below. + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter takes precedence over this parameter + when specified and should be used instead to specify behavior upon + encountering a bad line instead. warn_bad_lines : boolean, default ``True`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter takes precedence over this parameter + when specified and should be used instead to specify behavior upon + encountering a bad line instead. +on_bad_lines : {{``None``, 'error', 'warn', 'skip'}}, default ``None`` + Specifies what to do upon encountering a bad line (a line with too many fields). + The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``. + Specifying 'error' will cause an exception to be raised. 
Otherwise, the "bad lines" + will be dropped from the DataFrame, with a warning raised if 'warn' is specified. + This parameter takes precedence over parameters ``error_bad_lines`` and ``warn_bad_lines`` + if specified. + + .. versionadded:: 1.3 + .. _io.dtypes: Specifying column data types @@ -1244,7 +1263,7 @@ You can elect to skip bad lines: .. code-block:: ipython - In [29]: pd.read_csv(StringIO(data), error_bad_lines=False) + In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") Skipping line 3: expected 3 fields, saw 4 Out[29]: diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 56a5412d4ecfc..a55481db99571 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -369,6 +369,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like; will raise if any function fails on a column in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` in favor of ``on_bad_lines`` (:issue:`15122`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 493333fded6dd..e9cc25dac3a9b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,6 +149,11 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW + ctypedef enum BadLineHandleMethod: + ERROR, + WARN, + SKIP + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) @@ -201,8 +206,7 @@ cdef extern from "parser/tokenizer.h": int usecols int expected_fields - int error_bad_lines - int warn_bad_lines + BadLineHandleMethod on_bad_lines # floating point options char decimal @@ -364,6 +368,7 @@ cdef class TextReader: usecols=None, bint error_bad_lines=True, bint warn_bad_lines=True, + on_bad_lines = None, bint na_filter=True, na_values=None, na_fvalues=None, @@ -448,8 +453,23 @@ cdef class TextReader: self.parser.commentchar = ord(comment) # error handling of bad lines - self.parser.error_bad_lines = int(error_bad_lines) - self.parser.warn_bad_lines = int(warn_bad_lines) + if on_bad_lines is not None: + if on_bad_lines == "error": + self.parser.on_bad_lines = ERROR + elif on_bad_lines == "warn": + self.parser.on_bad_lines = WARN + elif on_bad_lines == "skip": + self.parser.on_bad_lines = SKIP + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for " + "on_bad_lines") + else: + if error_bad_lines: + self.parser.on_bad_lines = ERROR + elif warn_bad_lines: + self.parser.on_bad_lines = WARN + else: + self.parser.on_bad_lines = SKIP self.skiprows = skiprows if skiprows is not None: @@ -466,8 +486,7 @@ cdef class TextReader: # XXX if skipfooter > 0: - self.parser.error_bad_lines = 0 - self.parser.warn_bad_lines = 0 + self.parser.on_bad_lines = SKIP self.delimiter = delimiter self.delim_whitespace = delim_whitespace @@ -582,9 +601,6 @@ cdef class TextReader: 
kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status): - self.parser.error_bad_lines = status - def _set_quoting(self, quote_char, quoting): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e7855098..49797eea59ddc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) { self->allow_embedded_newline = 1; self->expected_fields = -1; - self->error_bad_lines = 0; - self->warn_bad_lines = 0; + self->on_bad_lines = ERROR; self->commentchar = '#'; self->thousands = '\0'; @@ -457,7 +456,7 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; // file_lines is now the actual file line number (starting at 1) - if (self->error_bad_lines) { + if (self->on_bad_lines == ERROR) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", @@ -468,7 +467,7 @@ static int end_line(parser_t *self) { return -1; } else { // simply skip bad lines - if (self->warn_bad_lines) { + if (self->on_bad_lines == WARN) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d34..623d3690f252a 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -84,6 +84,12 @@ typedef enum { QUOTE_NONE } QuoteStyle; +typedef enum { + ERROR, + WARN, + SKIP +} BadLineHandleMethod; + typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); typedef int (*io_cleanup)(void *src); @@ -136,8 +142,7 @@ typedef struct parser_t { int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; - int error_bad_lines; - 
int warn_bad_lines; + BadLineHandleMethod on_bad_lines; // floating point options char decimal; diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 135e093cdc1e0..aa5a78f5afd04 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -155,9 +155,6 @@ def _set_noconvert_columns(self): for col in noconvert_columns: self._reader.set_noconvert(col) - def set_error_bad_lines(self, status): - self._reader.set_error_bad_lines(int(status)) - def read(self, nrows=None): try: data = self._reader.read(nrows) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 37f553c724c9e..eba32684297b0 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -75,8 +75,19 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - self.warn_bad_lines = kwds["warn_bad_lines"] - self.error_bad_lines = kwds["error_bad_lines"] + if kwds["on_bad_lines"] is not None: + if kwds["on_bad_lines"] not in {"error", "warn", "skip"}: + raise ValueError( + f"Argument {kwds['on_bad_lines']} is invalid for on_bad_lines" + ) + self.on_bad_lines = kwds["on_bad_lines"] + else: + if kwds["error_bad_lines"]: + self.on_bad_lines = "error" + elif kwds["warn_bad_lines"]: + self.on_bad_lines = "warn" + else: + self.on_bad_lines = "skip" self.names_passed = kwds["names"] or None @@ -666,8 +677,10 @@ def _alert_malformed(self, msg, row_num): """ Alert a user about a malformed row. - If `self.error_bad_lines` is True, the alert will be `ParserError`. - If `self.warn_bad_lines` is True, the alert will be printed out. + If `self.on_bad_lines` is 'error' or `self.error_bad_lines` is True, + the alert will be `ParserError`. + If `self.on_bad_lines` is 'warn' or `self.warn_bad_lines` is True, + the alert will be printed out. 
Parameters ---------- @@ -676,9 +689,9 @@ def _alert_malformed(self, msg, row_num): Because this row number is displayed, we 1-index, even though we 0-index internally. """ - if self.error_bad_lines: + if self.on_bad_lines == "error": raise ParserError(msg) - elif self.warn_bad_lines: + elif self.on_bad_lines == "warn": base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") @@ -699,7 +712,7 @@ def _next_iter_line(self, row_num): assert self.data is not None return next(self.data) except csv.Error as e: - if self.warn_bad_lines or self.error_bad_lines: + if self.on_bad_lines == "error" or self.on_bad_lines == "warn": msg = str(e) if "NULL byte" in msg or "line contains NUL" in msg: @@ -896,11 +909,11 @@ def _rows_to_cols(self, content): actual_len = len(l) if actual_len > col_len: - if self.error_bad_lines or self.warn_bad_lines: + if self.on_bad_lines == "error" or self.on_bad_lines == "warn": row_num = self.pos - (content_len - i + footers) bad_lines.append((row_num, actual_len)) - if self.error_bad_lines: + if self.on_bad_lines == "error": break else: content.append(l) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6adf1b20b769f..098468b5a2130 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -325,9 +325,29 @@ default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will be dropped from the DataFrame that is returned. + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter takes precedence over this parameter + when specified and should be used instead to specify behavior upon + encountering a bad line instead. warn_bad_lines : bool, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. + + .. 
deprecated:: 1.3 + The ``on_bad_lines`` parameter takes precedence over this parameter + when specified and should be used instead to specify behavior upon + encountering a bad line instead. +on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default None + Specifies what to do upon encountering a bad line (a line with too many fields). + The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``. + Specifying 'error' will cause an exception to be raised. Otherwise, the "bad lines" + will be dropped from the DataFrame, with a warning raised if 'warn' is specified. + This parameter takes precedence over parameters ``error_bad_lines`` and + ``warn_bad_lines`` if specified. + + .. versionadded:: 1.3 + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\\s+'``. If this option @@ -382,6 +402,7 @@ "memory_map": False, "error_bad_lines": True, "warn_bad_lines": True, + "on_bad_lines": None, "float_precision": None, } @@ -390,8 +411,8 @@ _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults: Dict[str, Any] = {} -_deprecated_args: Set[str] = set() +_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": True, "warn_bad_lines": True} +_deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"} def validate_integer(name, val, min_val=0): @@ -533,6 +554,8 @@ def read_csv( # Error Handling error_bad_lines=True, warn_bad_lines=True, + # TODO: disallow and change None to 'error' in on_bad_lines in 2.0 + on_bad_lines=None, # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], @@ -613,6 +636,8 @@ def read_table( # Error Handling error_bad_lines=True, warn_bad_lines=True, + # TODO: disallow and change None to 'error' in on_bad_lines in 2.0 + on_bad_lines=None, encoding_errors: Optional[str] = "strict", # Internal delim_whitespace=False, diff --git 
a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index b4ac419fba30c..17ba244f32f93 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -753,3 +753,20 @@ def test_encoding_surrogatepass(all_parsers): tm.assert_frame_equal(df, expected) with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"): parser.read_csv(path) + + +def test_deprecated_bad_lines_warns(all_parsers, csv1): + # GH 15122 + parser = all_parsers + with pytest.warns( + FutureWarning, + match="The error_bad_lines argument has been deprecated " + "and will be removed in a future version.\n\n", + ): + parser.read_csv(csv1, error_bad_lines=False) + with pytest.warns( + FutureWarning, + match="The warn_bad_lines argument has been deprecated " + "and will be removed in a future version.\n\n", + ): + parser.read_csv(csv1, warn_bad_lines=False) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 4e3d99af685ec..c9169132810b6 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -140,15 +140,17 @@ def test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +@pytest.mark.parametrize( + "kwargs", + [{"error_bad_lines": False, "warn_bad_lines": False}, {"on_bad_lines": "skip"}], +) +def test_suppress_error_output(all_parsers, capsys, kwargs): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv( - StringIO(data), error_bad_lines=False, warn_bad_lines=False - ) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() @@ -160,7 +162,13 @@ def test_suppress_error_output(all_parsers, capsys): [{}, 
{"error_bad_lines": True}], # Default is True. # Explicitly pass in. ) @pytest.mark.parametrize( - "warn_kwargs", [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}] + "warn_kwargs", + [ + {}, + {"warn_bad_lines": True}, + {"warn_bad_lines": False}, + {"on_bad_lines": "error"}, + ], ) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 @@ -173,7 +181,11 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): parser.read_csv(StringIO(data), **kwargs) -def test_warn_bad_lines(all_parsers, capsys): +@pytest.mark.parametrize( + "kwargs", + [{"error_bad_lines": False, "warn_bad_lines": True}, {"on_bad_lines": "warn"}], +) +def test_warn_bad_lines(all_parsers, capsys, kwargs): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" @@ -234,3 +246,10 @@ def test_open_file(all_parsers): with pytest.raises(csv.Error, match="Could not determine delimiter"): parser.read_csv(file, sep=None, encoding_errors="replace") assert len(record) == 0, record[0].message + + +def test_invalid_on_bad_line(all_parsers): + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): + parser.read_csv(StringIO(data), on_bad_lines="abc") From d230035fbf84180997730dcc2d3075f9b4daca48 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 12 Mar 2021 21:18:25 -0800 Subject: [PATCH 02/21] stacklevel/warnings fixes --- pandas/io/parsers/readers.py | 2 +- pandas/tests/io/parser/common/test_common_basic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 098468b5a2130..7c894bce8014e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -949,7 +949,7 @@ def _clean_options(self, options, engine): f"The {arg} argument has been deprecated and will be " "removed in a future version.\n\n" ) - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, 
stacklevel=6) else: result[arg] = parser_default diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 17ba244f32f93..8c7ccd14043ca 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -758,13 +758,13 @@ def test_encoding_surrogatepass(all_parsers): def test_deprecated_bad_lines_warns(all_parsers, csv1): # GH 15122 parser = all_parsers - with pytest.warns( + with tm.assert_produces_warning( FutureWarning, match="The error_bad_lines argument has been deprecated " "and will be removed in a future version.\n\n", ): parser.read_csv(csv1, error_bad_lines=False) - with pytest.warns( + with tm.assert_produces_warning( FutureWarning, match="The warn_bad_lines argument has been deprecated " "and will be removed in a future version.\n\n", From ce5bf29d8f4eb7c1d282042f508c7c23e9d721a8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 13 Mar 2021 09:32:17 -0800 Subject: [PATCH 03/21] Fixes --- doc/source/whatsnew/v1.3.0.rst | 2 +- .../io/parser/common/test_read_errors.py | 32 +++++++++++++------ 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a55481db99571..f106ad05f8085 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -369,7 +369,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is 
list-like or dict-like; will raise if any function fails on a column in a future version (:issue:`40211`) -- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` in favor of ``on_bad_lines`` (:issue:`15122`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`pd.read_csv` in favor of ``on_bad_lines`` (:issue:`15122`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index c9169132810b6..1c85f55b4e1f7 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -142,7 +142,13 @@ def test_unexpected_keyword_parameter_exception(all_parsers): @pytest.mark.parametrize( "kwargs", - [{"error_bad_lines": False, "warn_bad_lines": False}, {"on_bad_lines": "skip"}], + [ + pytest.param( + {"error_bad_lines": False, "warn_bad_lines": False}, + marks=pytest.mark.filterwarnings("ignore"), + ), + {"on_bad_lines": "skip"}, + ], ) def test_suppress_error_output(all_parsers, capsys, kwargs): # see gh-15925 @@ -157,18 +163,18 @@ def test_suppress_error_output(all_parsers, capsys, kwargs): assert captured.err == "" +@pytest.mark.filterwarnings("ignore") @pytest.mark.parametrize( "kwargs", - [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. -) -@pytest.mark.parametrize( - "warn_kwargs", [ {}, - {"warn_bad_lines": True}, - {"warn_bad_lines": False}, + {"error_bad_lines": True}, {"on_bad_lines": "error"}, - ], + ], # Default is True. # Explicitly pass in. 
+) +@pytest.mark.parametrize( + "warn_kwargs", + [{}, {"warn_bad_lines": True}, pytest.param({"warn_bad_lines": False})], ) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 @@ -183,7 +189,13 @@ def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): @pytest.mark.parametrize( "kwargs", - [{"error_bad_lines": False, "warn_bad_lines": True}, {"on_bad_lines": "warn"}], + [ + pytest.param( + {"error_bad_lines": False, "warn_bad_lines": True}, + marks=pytest.mark.filterwarnings("ignore"), + ), + {"on_bad_lines": "warn"}, + ], ) def test_warn_bad_lines(all_parsers, capsys, kwargs): # see gh-15925 @@ -191,7 +203,7 @@ def test_warn_bad_lines(all_parsers, capsys, kwargs): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() From 8629f8712e501bea477f19901b9e58b55336979e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 13 Mar 2021 10:39:41 -0800 Subject: [PATCH 04/21] Fix tests --- pandas/tests/io/parser/test_c_parser_only.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 4 +--- pandas/tests/io/parser/test_textreader.py | 12 ++---------- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index f8aff3ad3696a..3e2d554d5e00e 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -496,7 +496,7 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): header=None, delimiter="\\s+", skiprows=0, - error_bad_lines=False, + on_bad_lines="warn", ) captured = capsys.readouterr() # skipped lines 2, 3, 4, 9 diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index cf6866946ab76..f62c9fd1349bf 
100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -276,9 +276,7 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. - result = parser.read_csv( - StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False - ) + result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") tm.assert_frame_equal(result, expected) captured = capsys.readouterr() diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 104cf56419bfd..5dfeb86ea1ca0 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -139,11 +139,7 @@ def test_skip_bad_lines(self, capsys): reader.read() reader = TextReader( - StringIO(data), - delimiter=":", - header=None, - error_bad_lines=False, - warn_bad_lines=False, + StringIO(data), delimiter=":", header=None, on_bad_lines="skip" ) result = reader.read() expected = { @@ -154,11 +150,7 @@ def test_skip_bad_lines(self, capsys): assert_array_dicts_equal(result, expected) reader = TextReader( - StringIO(data), - delimiter=":", - header=None, - error_bad_lines=False, - warn_bad_lines=True, + StringIO(data), delimiter=":", header=None, on_bad_lines="warn" ) reader.read() captured = capsys.readouterr() From 06f87a1d8690a1da06f994a2bce253a69dc81360 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 13 Mar 2021 13:57:19 -0800 Subject: [PATCH 05/21] Doc fixes for green --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d86bb906cdfec..28d2e6a7e3af2 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -362,7 +362,7 @@ warn_bad_lines : boolean, default ``True`` The ``on_bad_lines`` parameter takes 
precedence over this parameter when specified and should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : {{``None``, 'error', 'warn', 'skip'}}, default ``None`` +on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None`` Specifies what to do upon encountering a bad line (a line with too many fields). The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``. Specifying 'error' will cause an exception to be raised. Otherwise, the "bad lines" From 5b08a8889e80f1688078a4ac8ba82faf93c23b2e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Mar 2021 16:15:55 -0700 Subject: [PATCH 06/21] Update test_common_basic.py --- pandas/tests/io/parser/common/test_common_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index cebcb80faa993..d6fbf410e7257 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -755,7 +755,6 @@ def test_encoding_surrogatepass(all_parsers): parser.read_csv(path) - def test_deprecated_bad_lines_warns(all_parsers, csv1): # GH 15122 parser = all_parsers @@ -772,6 +771,7 @@ def test_deprecated_bad_lines_warns(all_parsers, csv1): ): parser.read_csv(csv1, warn_bad_lines=False) + def test_malformed_second_line(all_parsers): # see GH14782 parser = all_parsers From af3fd1542f3f28d2510fc7502d39311b59a435ad Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 16 Mar 2021 16:17:34 -0700 Subject: [PATCH 07/21] Update test_common_basic.py --- pandas/tests/io/parser/common/test_common_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index d6fbf410e7257..6904ecce3e604 100644 --- 
a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -771,7 +771,7 @@ def test_deprecated_bad_lines_warns(all_parsers, csv1): ): parser.read_csv(csv1, warn_bad_lines=False) - + def test_malformed_second_line(all_parsers): # see GH14782 parser = all_parsers From a0406b53bacffbc49901303956935820ec3d36fb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 5 Apr 2021 19:20:55 -0700 Subject: [PATCH 08/21] Address Code Review --- doc/source/user_guide/io.rst | 15 +++++-- pandas/_libs/parsers.pyx | 23 +---------- pandas/io/parsers/base_parser.py | 25 ++++++++++++ pandas/io/parsers/c_parser_wrapper.py | 13 +++++- pandas/io/parsers/python_parser.py | 40 ++++++++----------- pandas/io/parsers/readers.py | 21 +++++++--- .../io/parser/common/test_common_basic.py | 14 +++---- pandas/tests/io/parser/test_textreader.py | 4 +- 8 files changed, 89 insertions(+), 66 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 630a36d71625e..a80c9af137e30 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -364,9 +364,18 @@ warn_bad_lines : boolean, default ``True`` encountering a bad line instead. on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None`` Specifies what to do upon encountering a bad line (a line with too many fields). - The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``. - Specifying 'error' will cause an exception to be raised. Otherwise, the "bad lines" - will be dropped from the DataFrame, with a warning raised if 'warn' is specified. + Allowed values are : + + - ``None``, default option, defers to ``error_bad_lines`` and ``warn_bad_lines``. + + Note: This option is only present for backwards-compatibility reasons and will + be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``. + Please do not specify it explicitly. + + - 'error', raise an Exception when a bad line is encountered. 
+ - 'warn', raise a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + This parameter takes precedence over parameters ``error_bad_lines`` and ``warn_bad_lines`` if specified. diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2988431e27463..7a58aa6dd9591 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -355,9 +355,7 @@ cdef class TextReader: thousands=None, dtype=None, usecols=None, - bint error_bad_lines=True, - bint warn_bad_lines=True, - on_bad_lines = None, + on_bad_lines = ERROR, bint na_filter=True, na_values=None, na_fvalues=None, @@ -441,24 +439,7 @@ cdef class TextReader: raise ValueError('Only length-1 comment characters supported') self.parser.commentchar = ord(comment) - # error handling of bad lines - if on_bad_lines is not None: - if on_bad_lines == "error": - self.parser.on_bad_lines = ERROR - elif on_bad_lines == "warn": - self.parser.on_bad_lines = WARN - elif on_bad_lines == "skip": - self.parser.on_bad_lines = SKIP - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for " - "on_bad_lines") - else: - if error_bad_lines: - self.parser.on_bad_lines = ERROR - elif warn_bad_lines: - self.parser.on_bad_lines = WARN - else: - self.parser.on_bad_lines = SKIP + self.parser.on_bad_lines = on_bad_lines self.skiprows = skiprows if skiprows is not None: diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a011a789bf17c..6fc63260d5cf4 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1,6 +1,7 @@ from collections import defaultdict import csv import datetime +from enum import Enum import itertools from typing import ( Any, @@ -114,6 +115,11 @@ class ParserBase: + class BadLineHandleMethod(Enum): + ERROR = 0 + WARN = 1 + SKIP = 2 + def __init__(self, kwds): self.names = kwds.get("names") @@ -202,6 +208,25 @@ def __init__(self, kwds): 
self.handles: Optional[IOHandles] = None + # Bad line handling + on_bad_lines = kwds.get("on_bad_lines") + if on_bad_lines is not None: + if on_bad_lines == "error": + self.on_bad_lines = self.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + self.on_bad_lines = self.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + self.on_bad_lines = self.BadLineHandleMethod.SKIP + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + else: + if kwds.get("error_bad_lines"): + self.on_bad_lines = self.BadLineHandleMethod.ERROR + elif kwds.get("warn_bad_lines"): + self.on_bad_lines = self.BadLineHandleMethod.WARN + else: + self.on_bad_lines = self.BadLineHandleMethod.SKIP + def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: """ Let the readers open IOHanldes after they are done with their potential raises. diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index aa5a78f5afd04..1ad2717b87dc4 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -25,7 +25,18 @@ def __init__(self, src: FilePathOrBuffer, **kwds): # open handles self._open_handles(src, kwds) assert self.handles is not None - for key in ("storage_options", "encoding", "memory_map", "compression"): + + # Have to pass int, would break tests using TextReader directly otherwise :( + kwds["on_bad_lines"] = self.on_bad_lines.value + + for key in ( + "storage_options", + "encoding", + "memory_map", + "compression", + "error_bad_lines", + "warn_bad_lines", + ): kwds.pop(key, None) if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): # error: Item "IO[Any]" of "Union[IO[Any], RawIOBase, BufferedIOBase, diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index eba32684297b0..d6528add3fcda 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -75,20 +75,6 @@ def __init__(self, f: 
Union[FilePathOrBuffer, List], **kwds): self.quoting = kwds["quoting"] self.skip_blank_lines = kwds["skip_blank_lines"] - if kwds["on_bad_lines"] is not None: - if kwds["on_bad_lines"] not in {"error", "warn", "skip"}: - raise ValueError( - f"Argument {kwds['on_bad_lines']} is invalid for on_bad_lines" - ) - self.on_bad_lines = kwds["on_bad_lines"] - else: - if kwds["error_bad_lines"]: - self.on_bad_lines = "error" - elif kwds["warn_bad_lines"]: - self.on_bad_lines = "warn" - else: - self.on_bad_lines = "skip" - self.names_passed = kwds["names"] or None self.has_index_names = False @@ -675,12 +661,11 @@ def _next_line(self): def _alert_malformed(self, msg, row_num): """ - Alert a user about a malformed row. + Alert a user about a malformed row, depending on value of + `self.on_bad_lines` enum. - If `self.on_bad_lines` is 'error' or `self.error_bad_lines` is True, - the alert will be `ParserError`. - If `self.on_bad_lines` is 'warn' or `self.warn_bad_lines` is True, - the alert will be printed out. + If `self.on_bad_lines` is ERROR, the alert will be `ParserError`. + If `self.on_bad_lines` is WARN, the alert will be printed out. Parameters ---------- @@ -689,9 +674,10 @@ def _alert_malformed(self, msg, row_num): Because this row number is displayed, we 1-index, even though we 0-index internally. 
""" - if self.on_bad_lines == "error": + print(self.on_bad_lines) + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) - elif self.on_bad_lines == "warn": + elif self.on_bad_lines == self.BadLineHandleMethod.WARN: base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") @@ -712,7 +698,10 @@ def _next_iter_line(self, row_num): assert self.data is not None return next(self.data) except csv.Error as e: - if self.on_bad_lines == "error" or self.on_bad_lines == "warn": + if ( + self.on_bad_lines == self.BadLineHandleMethod.ERROR + or self.on_bad_lines == self.BadLineHandleMethod.WARN + ): msg = str(e) if "NULL byte" in msg or "line contains NUL" in msg: @@ -909,11 +898,14 @@ def _rows_to_cols(self, content): actual_len = len(l) if actual_len > col_len: - if self.on_bad_lines == "error" or self.on_bad_lines == "warn": + if ( + self.on_bad_lines == self.BadLineHandleMethod.ERROR + or self.on_bad_lines == self.BadLineHandleMethod.WARN + ): row_num = self.pos - (content_len - i + footers) bad_lines.append((row_num, actual_len)) - if self.on_bad_lines == "error": + if self.on_bad_lines == self.BadLineHandleMethod.ERROR: break else: content.append(l) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 09293909ccadb..578866947385d 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -338,13 +338,22 @@ The ``on_bad_lines`` parameter takes precedence over this parameter when specified and should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default None +on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None`` Specifies what to do upon encountering a bad line (a line with too many fields). - The default value of None will defer to ``error_bad_lines`` and ``warn_bad_lines``. - Specifying 'error' will cause an exception to be raised. 
Otherwise, the "bad lines" - will be dropped from the DataFrame, with a warning raised if 'warn' is specified. - This parameter takes precedence over parameters ``error_bad_lines`` and - ``warn_bad_lines`` if specified. + Allowed values are : + + - ``None``, default option, defer to ``error_bad_lines`` and ``warn_bad_lines``. + + Note: This option is only present for backwards-compatibility reasons and will + be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``. + Please do not specify it explicitly. + + - 'error', raise an Exception when a bad line is encountered. + - 'warn', raise a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + + This parameter takes precedence over parameters + ``error_bad_lines`` and ``warn_bad_lines`` if specified. .. versionadded:: 1.3 diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 56c4f7f44def7..404a433458a78 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -755,21 +755,17 @@ def test_encoding_surrogatepass(all_parsers): parser.read_csv(path) -def test_deprecated_bad_lines_warns(all_parsers, csv1): +@pytest.mark.parametrize("on_bad_lines", ["error", "warn"]) +def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines): # GH 15122 parser = all_parsers + kwds = {f"{on_bad_lines}_bad_lines": False} with tm.assert_produces_warning( FutureWarning, - match="The error_bad_lines argument has been deprecated " + match=f"The {on_bad_lines}_bad_lines argument has been deprecated " "and will be removed in a future version.\n\n", ): - parser.read_csv(csv1, error_bad_lines=False) - with tm.assert_produces_warning( - FutureWarning, - match="The warn_bad_lines argument has been deprecated " - "and will be removed in a future version.\n\n", - ): - parser.read_csv(csv1, 
warn_bad_lines=False) + parser.read_csv(csv1, **kwds) def test_malformed_second_line(all_parsers): diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 5dfeb86ea1ca0..1498063dd909f 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -139,7 +139,7 @@ def test_skip_bad_lines(self, capsys): reader.read() reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines="skip" + StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip ) result = reader.read() expected = { @@ -150,7 +150,7 @@ def test_skip_bad_lines(self, capsys): assert_array_dicts_equal(result, expected) reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines="warn" + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn ) reader.read() captured = capsys.readouterr() From 89fdc707ff0b54e67f8110894ea2b2ecae93dc20 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 5 Apr 2021 19:22:56 -0700 Subject: [PATCH 09/21] oops --- pandas/io/parsers/python_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index d6528add3fcda..99ea3c1886cf5 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -674,7 +674,6 @@ def _alert_malformed(self, msg, row_num): Because this row number is displayed, we 1-index, even though we 0-index internally. 
""" - print(self.on_bad_lines) if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) elif self.on_bad_lines == self.BadLineHandleMethod.WARN: From 1e20b53c6021008bb2f614f1ad7dda473f800d2b Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 27 Apr 2021 19:15:13 -0700 Subject: [PATCH 10/21] Address code review --- doc/source/user_guide/io.rst | 26 +++-------- pandas/io/parsers/base_parser.py | 34 ++++++++------- pandas/io/parsers/readers.py | 43 +++++++------------ .../io/parser/common/test_read_errors.py | 2 +- 4 files changed, 41 insertions(+), 64 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index bb95ad100acfa..9affe38ab7883 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -343,7 +343,7 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, default ``True`` +error_bad_lines : boolean, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no ``DataFrame`` will be returned. If ``False``, then these "bad lines" will dropped from the @@ -351,34 +351,22 @@ error_bad_lines : boolean, default ``True`` below. .. deprecated:: 1.3 - The ``on_bad_lines`` parameter takes precedence over this parameter - when specified and should be used instead to specify behavior upon + The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -warn_bad_lines : boolean, default ``True`` +warn_bad_lines : boolean, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. .. 
deprecated:: 1.3 - The ``on_bad_lines`` parameter takes precedence over this parameter - when specified and should be used instead to specify behavior upon + The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None`` +on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : - - - ``None``, default option, defers to ``error_bad_lines`` and ``warn_bad_lines``. - - Note: This option is only present for backwards-compatibility reasons and will - be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``. - Please do not specify it explicitly. - - - 'error', raise an Exception when a bad line is encountered. - - 'warn', raise a warning when a bad line is encountered and skip that line. + - 'error', raise an ParserError when a bad line is encountered. + - 'warn', raise a when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. - This parameter takes precedence over parameters ``error_bad_lines`` and ``warn_bad_lines`` - if specified. - .. versionadded:: 1.3 .. 
_io.dtypes: diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 6fc63260d5cf4..572edfc0f400f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -209,23 +209,25 @@ def __init__(self, kwds): self.handles: Optional[IOHandles] = None # Bad line handling - on_bad_lines = kwds.get("on_bad_lines") - if on_bad_lines is not None: - if on_bad_lines == "error": - self.on_bad_lines = self.BadLineHandleMethod.ERROR - elif on_bad_lines == "warn": - self.on_bad_lines = self.BadLineHandleMethod.WARN - elif on_bad_lines == "skip": - self.on_bad_lines = self.BadLineHandleMethod.SKIP - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + on_bad_lines = kwds.get("on_bad_lines", "error") + if on_bad_lines == "error": + self.on_bad_lines = self.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + self.on_bad_lines = self.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + self.on_bad_lines = self.BadLineHandleMethod.SKIP else: - if kwds.get("error_bad_lines"): - self.on_bad_lines = self.BadLineHandleMethod.ERROR - elif kwds.get("warn_bad_lines"): - self.on_bad_lines = self.BadLineHandleMethod.WARN - else: - self.on_bad_lines = self.BadLineHandleMethod.SKIP + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") + # Override on_bad_lines w/ deprecated args for backward compatibility + error_bad_lines = kwds.get("error_bad_lines") + warn_bad_lines = kwds.get("warn_bad_lines") + if error_bad_lines: + self.on_bad_lines = self.BadLineHandleMethod.ERROR + elif warn_bad_lines: + self.on_bad_lines = self.BadLineHandleMethod.WARN + elif error_bad_lines is False and warn_bad_lines is False: + # Be careful - None evaluates to False + self.on_bad_lines = self.BadLineHandleMethod.SKIP def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: """ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 
578866947385d..8e229159fb27c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -320,41 +320,30 @@ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. -error_bad_lines : bool, default True +error_bad_lines : bool, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no DataFrame will be returned. If False, then these "bad lines" will be dropped from the DataFrame that is returned. .. deprecated:: 1.3 - The ``on_bad_lines`` parameter takes precedence over this parameter - when specified and should be used instead to specify behavior upon + The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -warn_bad_lines : bool, default True +warn_bad_lines : bool, default ``None`` If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. .. deprecated:: 1.3 - The ``on_bad_lines`` parameter takes precedence over this parameter - when specified and should be used instead to specify behavior upon + The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : {{None, 'error', 'warn', 'skip'}}, default ``None`` +on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : - - ``None``, default option, defer to ``error_bad_lines`` and ``warn_bad_lines``. - - Note: This option is only present for backwards-compatibility reasons and will - be removed after the removal of ``error_bad_lines`` and ``warn_bad_lines``. - Please do not specify it explicitly. - - 'error', raise an Exception when a bad line is encountered. - 'warn', raise a warning when a bad line is encountered and skip that line. 
- 'skip', skip bad lines without raising or warning when they are encountered. - This parameter takes precedence over parameters - ``error_bad_lines`` and ``warn_bad_lines`` if specified. - .. versionadded:: 1.3 delim_whitespace : bool, default False @@ -409,9 +398,9 @@ "na_filter": True, "low_memory": True, "memory_map": False, - "error_bad_lines": True, - "warn_bad_lines": True, - "on_bad_lines": None, + "error_bad_lines": None, + "warn_bad_lines": None, + "on_bad_lines": "error", "float_precision": None, } @@ -420,7 +409,7 @@ _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": True, "warn_bad_lines": True} +_deprecated_defaults: Dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None} _deprecated_args: Set[str] = {"error_bad_lines", "warn_bad_lines"} @@ -561,10 +550,9 @@ def read_csv( encoding_errors: Optional[str] = "strict", dialect=None, # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - # TODO: disallow and change None to 'error' in on_bad_lines in 2.0 - on_bad_lines=None, + error_bad_lines=None, + warn_bad_lines=None, + on_bad_lines="error", # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], @@ -643,10 +631,9 @@ def read_table( encoding=None, dialect=None, # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - # TODO: disallow and change None to 'error' in on_bad_lines in 2.0 - on_bad_lines=None, + error_bad_lines=None, + warn_bad_lines=None, + on_bad_lines="error", encoding_errors: Optional[str] = "strict", # Internal delim_whitespace=False, diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 1c85f55b4e1f7..376b9f40b19df 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -174,7 +174,7 @@ def test_suppress_error_output(all_parsers, capsys, kwargs): ) 
@pytest.mark.parametrize( "warn_kwargs", - [{}, {"warn_bad_lines": True}, pytest.param({"warn_bad_lines": False})], + [{}, {"warn_bad_lines": False}], ) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 From 2e79f9ae8614e06a4d2e2358013ad308619fad4e Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 27 Apr 2021 20:35:47 -0700 Subject: [PATCH 11/21] Update io.rst --- doc/source/user_guide/io.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9affe38ab7883..4d4eddb36eaa2 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -363,6 +363,7 @@ warn_bad_lines : boolean, default ``None`` on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : + - 'error', raise an ParserError when a bad line is encountered. - 'warn', raise a when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. 
From fe7541cd9186a3afc3c0c78c07290e8ab453d2b8 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Apr 2021 18:41:05 -0700 Subject: [PATCH 12/21] Address code review --- pandas/io/parsers/base_parser.py | 5 +- pandas/io/parsers/readers.py | 56 +++++++++++++++++-- .../io/parser/common/test_read_errors.py | 22 ++++++-- 3 files changed, 71 insertions(+), 12 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 572edfc0f400f..7ed8fa8cf7b8e 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -223,7 +223,10 @@ def __init__(self, kwds): warn_bad_lines = kwds.get("warn_bad_lines") if error_bad_lines: self.on_bad_lines = self.BadLineHandleMethod.ERROR - elif warn_bad_lines: + elif warn_bad_lines and error_bad_lines is not None: + # Kinda sketch, but maintain BC in that needs explicit + # error_bad_lines -> False to warn even if warn_bad_lines->True. + # With new default of None, this is necessary. self.on_bad_lines = self.BadLineHandleMethod.WARN elif error_bad_lines is False and warn_bad_lines is False: # Be careful - None evaluates to False diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8e229159fb27c..78402b55cd12a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -552,7 +552,9 @@ def read_csv( # Error Handling error_bad_lines=None, warn_bad_lines=None, - on_bad_lines="error", + # TODO (2.0): set on_bad_lines to "error". + # See _refine_defaults_read comment for why we do this. 
+ on_bad_lines=None, # Internal delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], @@ -565,7 +567,15 @@ def read_csv( del kwds["sep"] kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} + dialect, + delimiter, + delim_whitespace, + engine, + sep, + error_bad_lines, + warn_bad_lines, + on_bad_lines, + defaults={"delimiter": ","}, ) kwds.update(kwds_defaults) @@ -633,7 +643,9 @@ def read_table( # Error Handling error_bad_lines=None, warn_bad_lines=None, - on_bad_lines="error", + # TODO (2.0): set on_bad_lines to "error". + # See _refine_defaults_read comment for why we do this. + on_bad_lines=None, encoding_errors: Optional[str] = "strict", # Internal delim_whitespace=False, @@ -646,7 +658,15 @@ def read_table( del kwds["sep"] kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} + dialect, + delimiter, + delim_whitespace, + engine, + sep, + error_bad_lines, + warn_bad_lines, + on_bad_lines, + defaults={"delimiter": "\t"}, ) kwds.update(kwds_defaults) @@ -1193,6 +1213,9 @@ def _refine_defaults_read( delim_whitespace: bool, engine: str, sep: Union[str, object], + error_bad_lines: Optional[str], + warn_bad_lines: Optional[str], + on_bad_lines: Optional[str], defaults: Dict[str, Any], ): """Validate/refine default values of input parameters of read_csv, read_table. @@ -1218,6 +1241,12 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. + error_bad_lines : str or None + Whether to error on a bad line or not. + warn_bad_lines : str or None + Whether to warn on a bad line or not. + on_bad_lines : str or None + An option for handling bad lines or a sentinel value(None). defaults: dict Default values of input parameters. 
@@ -1228,8 +1257,11 @@ def _refine_defaults_read( Raises ------ - ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and + ValueError : + If a delimiter was specified with ``sep`` (or ``delimiter``) and ``delim_whitespace=True``. + If on_bad_lines is specified(not ``None``) and ``error_bad_lines``/ + ``warn_bad_lines`` is True. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -1273,6 +1305,20 @@ def _refine_defaults_read( kwds["engine"] = "c" kwds["engine_specified"] = False + # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines + # aren't specified at the same time. If so, raise. Otherwise, + # alias on_bad_lines to "error" if error/warn_bad_lines not set + # and on_bad_lines is not set. on_bad_lines is defaulted to None + # so we can tell if it is set (this is why this hack exists). + if on_bad_lines is not None: + if error_bad_lines is not None or warn_bad_lines is not None: + raise ValueError( + "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " + "Please only set on_bad_lines." + ) + else: + kwds["on_bad_lines"] = "error" + return kwds diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 376b9f40b19df..f5438ea3f0296 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -166,15 +166,11 @@ def test_suppress_error_output(all_parsers, capsys, kwargs): @pytest.mark.filterwarnings("ignore") @pytest.mark.parametrize( "kwargs", - [ - {}, - {"error_bad_lines": True}, - {"on_bad_lines": "error"}, - ], # Default is True. # Explicitly pass in. + [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. 
) @pytest.mark.parametrize( "warn_kwargs", - [{}, {"warn_bad_lines": False}], + [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}], ) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 @@ -265,3 +261,17 @@ def test_invalid_on_bad_line(all_parsers): data = "a\n1\n1,2,3\n4\n5,6,7" with pytest.raises(ValueError, match="Argument abc is invalid for on_bad_lines"): parser.read_csv(StringIO(data), on_bad_lines="abc") + + +@pytest.mark.parametrize("error_bad_lines", [True, False]) +@pytest.mark.parametrize("warn_bad_lines", [True, False]) +def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines): + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + kwds = {"error_bad_lines": error_bad_lines, "warn_bad_lines": warn_bad_lines} + with pytest.raises( + ValueError, + match="Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " + "Please only set on_bad_lines.", + ): + parser.read_csv(StringIO(data), on_bad_lines="error", **kwds) From 772c13f1aff9fd81a40506a54da0051ec0cfe353 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 28 Apr 2021 18:57:46 -0700 Subject: [PATCH 13/21] manual pre-commit --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 4d4eddb36eaa2..050f8799a84a9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -363,7 +363,7 @@ warn_bad_lines : boolean, default ``None`` on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : - + - 'error', raise an ParserError when a bad line is encountered. - 'warn', raise a when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. 
From e267aa401e11213ffb57624aa4dedb3b30383460 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 22 May 2021 19:52:40 -0700 Subject: [PATCH 14/21] Consolidate --- pandas/io/parsers/base_parser.py | 26 +++--------------------- pandas/io/parsers/readers.py | 35 +++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 18cb486ee76a0..ddeb5647a0c0f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -213,29 +213,9 @@ def __init__(self, kwds): self.handles: Optional[IOHandles] = None - # Bad line handling - on_bad_lines = kwds.get("on_bad_lines", "error") - if on_bad_lines == "error": - self.on_bad_lines = self.BadLineHandleMethod.ERROR - elif on_bad_lines == "warn": - self.on_bad_lines = self.BadLineHandleMethod.WARN - elif on_bad_lines == "skip": - self.on_bad_lines = self.BadLineHandleMethod.SKIP - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") - # Override on_bad_lines w/ deprecated args for backward compatibility - error_bad_lines = kwds.get("error_bad_lines") - warn_bad_lines = kwds.get("warn_bad_lines") - if error_bad_lines: - self.on_bad_lines = self.BadLineHandleMethod.ERROR - elif warn_bad_lines and error_bad_lines is not None: - # Kinda sketch, but maintain BC in that needs explicit - # error_bad_lines -> False to warn even if warn_bad_lines->True. - # With new default of None, this is necessary. 
- self.on_bad_lines = self.BadLineHandleMethod.WARN - elif error_bad_lines is False and warn_bad_lines is False: - # Be careful - None evaluates to False - self.on_bad_lines = self.BadLineHandleMethod.SKIP + # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns) + # Normally, this arg would get pre-processed earlier on + self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: """ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 78402b55cd12a..bcbee11d1f262 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -32,6 +32,7 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( + is_bool, is_file_like, is_float, is_integer, @@ -1316,8 +1317,40 @@ def _refine_defaults_read( "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " "Please only set on_bad_lines." ) + if on_bad_lines == "error": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") else: - kwds["on_bad_lines"] = "error" + if error_bad_lines is not None: + # Must check is_bool, because other stuff(e.g. 
non-empty lists) eval to true + if is_bool(error_bad_lines): + if error_bad_lines: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + else: + if warn_bad_lines is not None: + # This is the case where error_bad_lines is False + # We can only warn/skip if error_bad_lines is False + # None doesn't work because backwards-compatibility reasons + if warn_bad_lines is True: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif warn_bad_lines is False: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + else: + raise ValueError("warn_bad_lines must be a boolean") + else: + # Backwards compat, when only error_bad_lines = false, we warn + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + else: + raise ValueError("error_bad_lines must be a boolean") + else: + # Everything None -> Error + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + # print(kwds["on_bad_lines"]) return kwds From e724d0b6289d4a40d3db127674a0a221dbf9359c Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Sat, 22 May 2021 20:00:11 -0700 Subject: [PATCH 15/21] Clarify behavior --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 050f8799a84a9..9f4ffd74ccf03 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -365,7 +365,7 @@ on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Allowed values are : - 'error', raise an ParserError when a bad line is encountered. - - 'warn', raise a when a bad line is encountered and skip that line. + - 'warn', print a warning when a bad line is encountered and skip that line. - 'skip', skip bad lines without raising or warning when they are encountered. .. 
versionadded:: 1.3 From 9b8468a778f2f03a0eaa9fe31504f39747c73a98 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 23 May 2021 11:03:38 -0700 Subject: [PATCH 16/21] typing --- pandas/io/parsers/readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 589b246c2e974..bfe0d60b8c957 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1221,8 +1221,8 @@ def _refine_defaults_read( delim_whitespace: bool, engine: str, sep: Union[str, object], - error_bad_lines: Optional[str], - warn_bad_lines: Optional[str], + error_bad_lines: Optional[bool], + warn_bad_lines: Optional[bool], on_bad_lines: Optional[str], names: Union[Optional[ArrayLike], object], prefix: Union[Optional[str], object], From a6af9aa9e1a7bfaeb31a4389de67af3ed45c85be Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 24 May 2021 16:03:13 -0700 Subject: [PATCH 17/21] Fix failed test --- pandas/io/parsers/base_parser.py | 1 + pandas/io/parsers/readers.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5dde1ed8bff65..2a86ff13a2edc 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -109,6 +109,7 @@ "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", + "on_bad_lines": "error", } diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index bfe0d60b8c957..5f9c25a5b105f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -402,7 +402,6 @@ "memory_map": False, "error_bad_lines": None, "warn_bad_lines": None, - "on_bad_lines": "error", "float_precision": None, } From 2f70edc5e4c2e0791a0328e9ecb6fbfabf817102 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 24 May 2021 16:36:45 -0700 Subject: [PATCH 18/21] Clean code --- pandas/io/parsers/readers.py 
| 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 5f9c25a5b105f..9bc1a2ccbf314 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -31,9 +31,9 @@ ParserWarning, ) from pandas.util._decorators import Appender +from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - is_bool, is_file_like, is_float, is_integer, @@ -1348,29 +1348,25 @@ def _refine_defaults_read( else: if error_bad_lines is not None: # Must check is_bool, because other stuff(e.g. non-empty lists) eval to true - if is_bool(error_bad_lines): - if error_bad_lines: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - else: - if warn_bad_lines is not None: - # This is the case where error_bad_lines is False - # We can only warn/skip if error_bad_lines is False - # None doesn't work because backwards-compatibility reasons - if warn_bad_lines is True: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - elif warn_bad_lines is False: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP - else: - raise ValueError("warn_bad_lines must be a boolean") - else: - # Backwards compat, when only error_bad_lines = false, we warn - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + validate_bool_kwarg(error_bad_lines, "error_bad_lines") + if error_bad_lines: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR else: - raise ValueError("error_bad_lines must be a boolean") + if warn_bad_lines is not None: + # This is the case where error_bad_lines is False + # We can only warn/skip if error_bad_lines is False + # None doesn't work because backwards-compatibility reasons + validate_bool_kwarg(warn_bad_lines, "warn_bad_lines") + if warn_bad_lines: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + else: + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + else: + # 
Backwards compat, when only error_bad_lines = false, we warn + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN else: # Everything None -> Error kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - # print(kwds["on_bad_lines"]) return kwds From cf3201c52dd8005a7de70110af81400f633f250d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 27 May 2021 16:08:58 -0700 Subject: [PATCH 19/21] Update v1.3.0.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2360c2a1909b6..76784fbef7a0d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -669,7 +669,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) -- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`pd.read_csv` in favor of ``on_bad_lines`` (:issue:`15122`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}``
instead (:issue:`40363`) - Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) From 4911b27e3827719c57f8b777a20ebbe1435f837d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 27 May 2021 16:56:15 -0700 Subject: [PATCH 20/21] Update readers.py --- pandas/io/parsers/readers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 1f6f8ffc16efc..8be3f0e167615 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -34,6 +34,7 @@ Appender, deprecate_nonkeyword_arguments, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -978,7 +979,7 @@ def _clean_options(self, options, engine): f"The {arg} argument has been deprecated and will be " "removed in a future version.\n\n" ) - warnings.warn(msg, FutureWarning, stacklevel=6) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) else: result[arg] = parser_default From f28316bc12017db68d933ce6cf8f0a89e5d6682a Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 27 May 2021 17:22:47 -0700 Subject: [PATCH 21/21] Fix stacklevel --- pandas/io/parsers/readers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8be3f0e167615..8bf1ab1260b8e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -34,7 +34,6 @@ Appender, deprecate_nonkeyword_arguments, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( @@ -979,7 +978,7 @@ def _clean_options(self, options, engine): f"The {arg} argument has been 
deprecated and will be " "removed in a future version.\n\n" ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + warnings.warn(msg, FutureWarning, stacklevel=7) else: result[arg] = parser_default