From 2867c0a46ccfcfc8b153126f92f80a790e5aa96b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 29 Sep 2022 10:56:08 +0200 Subject: [PATCH 1/5] DEP: Enforce deprecations of read_csv keywords --- doc/source/user_guide/io.rst | 69 ------- pandas/io/parsers/arrow_parser_wrapper.py | 4 +- pandas/io/parsers/base_parser.py | 10 -- pandas/io/parsers/c_parser_wrapper.py | 14 +- pandas/io/parsers/python_parser.py | 5 +- pandas/io/parsers/readers.py | 210 ++-------------------- 6 files changed, 17 insertions(+), 295 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 15b3b894c68b6..7053e8a6d9354 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -154,25 +154,6 @@ usecols : list-like or callable, default ``None`` Using this parameter results in much faster parsing time and lower memory usage when using the c engine. The Python engine loads the data first before deciding which columns to drop. -squeeze : boolean, default ``False`` - If the parsed data only contains one column then return a ``Series``. - - .. deprecated:: 1.4.0 - Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze - the data. -prefix : str, default ``None`` - Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... - - .. deprecated:: 1.4.0 - Use a list comprehension on the DataFrame's columns after calling ``read_csv``. - - .. ipython:: python - - data = "col1,col2,col3\na,b,1" - - df = pd.read_csv(StringIO(data)) - df.columns = [f"pre_{col}" for col in df.columns] - df mangle_dupe_cols : boolean, default ``True`` Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'. @@ -387,23 +368,6 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, optional, default ``None`` - Lines with too many fields (e.g. a csv line with too many commas) will by - default cause an exception to be raised, and no ``DataFrame`` will be - returned. If ``False``, then these "bad lines" will dropped from the - ``DataFrame`` that is returned. See :ref:`bad lines ` - below. - - .. deprecated:: 1.3.0 - The ``on_bad_lines`` parameter should be used instead to specify behavior upon - encountering a bad line instead. -warn_bad_lines : boolean, optional, default ``None`` - If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for - each "bad line" will be output. - - .. deprecated:: 1.3.0 - The ``on_bad_lines`` parameter should be used instead to specify behavior upon - encountering a bad line instead. on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : @@ -1213,37 +1177,6 @@ Infinity ``inf`` like values will be parsed as ``np.inf`` (positive infinity), and ``-inf`` as ``-np.inf`` (negative infinity). These will ignore the case of the value, meaning ``Inf``, will also be parsed as ``np.inf``. - -Returning Series -'''''''''''''''' - -Using the ``squeeze`` keyword, the parser will return output with a single column -as a ``Series``: - -.. deprecated:: 1.4.0 - Users should append ``.squeeze("columns")`` to the DataFrame returned by - ``read_csv`` instead. - -.. ipython:: python - :okwarning: - - data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018" - - with open("tmp.csv", "w") as fh: - fh.write(data) - - print(open("tmp.csv").read()) - - output = pd.read_csv("tmp.csv", squeeze=True) - output - - type(output) - -.. ipython:: python - :suppress: - - os.remove("tmp.csv") - .. _io.boolean: Boolean values @@ -1700,8 +1633,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the * ``thousands`` * ``memory_map`` * ``dialect`` -* ``warn_bad_lines`` -* ``error_bad_lines`` * ``on_bad_lines`` * ``delim_whitespace`` * ``quoting`` diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 49b71efbfb6ec..6cc56bb1c8840 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -95,9 +95,7 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: multi_index_named = True if self.header is None: if self.names is None: - if self.prefix is not None: - self.names = [f"{self.prefix}{i}" for i in range(num_cols)] - elif self.header is None: + if self.header is None: self.names = range(num_cols) if len(self.names) != num_cols: # usecols is passed through to pyarrow, we only handle index col here diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index f90a0549a4320..77514750dd53f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -90,7 +90,6 @@ def __init__(self, kwds) -> None: self.names = kwds.get("names") self.orig_names: list | None = None - self.prefix = kwds.pop("prefix", None) self.index_col = kwds.get("index_col", None) self.unnamed_cols: set = set() @@ -147,11 +146,6 @@ def __init__(self, kwds) -> None: "index_col must only contain row numbers " "when specifying a multi-index header" ) - elif self.header is not None and self.prefix is not None: - # GH 27394 - raise ValueError( - "Argument prefix must be None if argument header is not None" - ) self._name_processed = False @@ -1113,7 +1107,6 @@ def converter(*date_cols): "header": "infer", "index_col": None, "names": None, - "prefix": None, "skiprows": None, "skipfooter": 0, "nrows": None, @@ -1137,15 +1130,12 @@ def converter(*date_cols): "chunksize": None, "verbose": False, "encoding": None, - "squeeze": None, "compression": None, "mangle_dupe_cols": True, "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, - "error_bad_lines": None, - "warn_bad_lines": None, } diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index dc104b3020f14..a95212de727a5 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -72,8 +72,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: "encoding", "memory_map", "compression", - "error_bad_lines", - "warn_bad_lines", ): kwds.pop(key, None) @@ -103,16 +101,8 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # error: Cannot determine type of 'names' if self.names is None: # type: ignore[has-type] - if self.prefix: - # error: Cannot determine type of 'names' - self.names = [ # type: ignore[has-type] - f"{self.prefix}{i}" for i in range(self._reader.table_width) - ] - else: - # error: Cannot determine type of 'names' - self.names = list( # type: ignore[has-type] - range(self._reader.table_width) - ) + # error: Cannot determine type of 'names' + self.names = list(range(self._reader.table_width)) # type: ignore[has-type] # gh-9755 # diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 5d5b497a04c04..b900e4bda1aea 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -537,10 +537,7 @@ def _infer_columns( num_original_columns = ncols if not names: - if self.prefix: - columns = [[f"{self.prefix}{i}" for i in range(ncols)]] - else: - columns = [list(range(ncols))] + columns = [list(range(ncols))] columns = self._handle_usecols( columns, columns[0], num_original_columns ) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index bacdbbecc3011..d7aeaaad23f82 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -44,7 +44,6 @@ deprecate_nonkeyword_arguments, ) from pandas.util._exceptions import find_stack_level -from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( is_file_like, @@ -151,17 +150,6 @@ example of a valid callable argument would be ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster parsing time and lower memory usage. -squeeze : bool, default False - If the parsed data only contains one column then return a Series. - - .. deprecated:: 1.4.0 - Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze - the data. -prefix : str, optional - Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... - - .. deprecated:: 1.4.0 - Use a list comprehension on the DataFrame's columns after calling ``read_csv``. mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there @@ -363,22 +351,6 @@ `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. -error_bad_lines : bool, optional, default ``None`` - Lines with too many fields (e.g. a csv line with too many commas) will by - default cause an exception to be raised, and no DataFrame will be returned. - If False, then these "bad lines" will be dropped from the DataFrame that is - returned. - - .. deprecated:: 1.3.0 - The ``on_bad_lines`` parameter should be used instead to specify behavior upon - encountering a bad line instead. -warn_bad_lines : bool, optional, default ``None`` - If error_bad_lines is False, and warn_bad_lines is True, a warning for each - "bad line" will be output. - - .. deprecated:: 1.3.0 - The ``on_bad_lines`` parameter should be used instead to specify behavior upon - encountering a bad line instead. on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : @@ -467,8 +439,6 @@ "thousands", "memory_map", "dialect", - "warn_bad_lines", - "error_bad_lines", "on_bad_lines", "delim_whitespace", "quoting", @@ -489,18 +459,6 @@ class _DeprecationConfig(NamedTuple): msg: str | None -_deprecated_defaults: dict[str, _DeprecationConfig] = { - "error_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."), - "warn_bad_lines": _DeprecationConfig(None, "Use on_bad_lines in the future."), - "squeeze": _DeprecationConfig( - None, 'Append .squeeze("columns") to the call to squeeze.' - ), - "prefix": _DeprecationConfig( - None, "Use a list comprehension on the column names in the future." - ), -} - - @overload def validate_integer(name, val: None, min_val: int = ...) -> None: ... @@ -624,8 +582,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -661,8 +617,6 @@ def read_csv( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -684,8 +638,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -721,8 +673,6 @@ def read_csv( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -744,8 +694,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -781,8 +729,6 @@ def read_csv( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -804,8 +750,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -841,8 +785,6 @@ def read_csv( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -874,8 +816,6 @@ def read_csv( names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, usecols=None, - squeeze: bool | None = None, - prefix: str | lib.NoDefault = lib.no_default, mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, @@ -917,8 +857,6 @@ def read_csv( encoding_errors: str | None = "strict", dialect: str | csv.Dialect | None = None, # Error Handling - error_bad_lines: bool | None = None, - warn_bad_lines: bool | None = None, # TODO(2.0): set on_bad_lines to "error". # See _refine_defaults_read comment for why we do this. on_bad_lines=None, @@ -940,11 +878,8 @@ def read_csv( delim_whitespace, engine, sep, - error_bad_lines, - warn_bad_lines, on_bad_lines, names, - prefix, defaults={"delimiter": ","}, ) kwds.update(kwds_defaults) @@ -963,8 +898,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -1000,8 +933,6 @@ def read_table( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -1023,8 +954,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -1060,8 +989,6 @@ def read_table( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -1083,8 +1010,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -1120,8 +1045,6 @@ def read_table( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -1143,8 +1066,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., usecols=..., - squeeze: bool | None = ..., - prefix: str | lib.NoDefault = ..., mangle_dupe_cols: bool = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., @@ -1180,8 +1101,6 @@ def read_table( encoding: str | None = ..., encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., - error_bad_lines: bool | None = ..., - warn_bad_lines: bool | None = ..., on_bad_lines=..., delim_whitespace: bool = ..., low_memory=..., @@ -1213,8 +1132,6 @@ def read_table( names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, usecols=None, - squeeze: bool | None = None, - prefix: str | lib.NoDefault = lib.no_default, mangle_dupe_cols: bool = True, # General Parsing Configuration dtype: DtypeArg | None = None, @@ -1256,8 +1173,6 @@ def read_table( encoding_errors: str | None = "strict", dialect: str | csv.Dialect | None = None, # Error Handling - error_bad_lines: bool | None = None, - warn_bad_lines: bool | None = None, # TODO(2.0): set on_bad_lines to "error". # See _refine_defaults_read comment for why we do this. on_bad_lines=None, @@ -1279,11 +1194,8 @@ def read_table( delim_whitespace, engine, sep, - error_bad_lines, - warn_bad_lines, on_bad_lines, names, - prefix, defaults={"delimiter": "\t"}, ) kwds.update(kwds_defaults) @@ -1435,8 +1347,6 @@ def __init__( self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) - self.squeeze = self.options.pop("squeeze", False) - if "has_index_names" in kwds: self.options["has_index_names"] = kwds["has_index_names"] @@ -1464,16 +1374,6 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: and value != default and value != getattr(value, "value", default) ): - if ( - argname == "on_bad_lines" - and kwds.get("error_bad_lines") is not None - ): - argname = "error_bad_lines" - elif ( - argname == "on_bad_lines" and kwds.get("warn_bad_lines") is not None - ): - argname = "warn_bad_lines" - raise ValueError( f"The {repr(argname)} option is not supported with the " f"'pyarrow' engine" @@ -1485,28 +1385,16 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: options[argname] = value for argname, default in _c_parser_defaults.items(): - if argname in kwds: - value = kwds[argname] - - if engine != "c" and value != default: - if "python" in engine and argname not in _python_unsupported: - pass - elif ( - value - == _deprecated_defaults.get( - argname, _DeprecationConfig(default, None) - ).default_value - ): - pass - else: - raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" - ) - else: - value = _deprecated_defaults.get( - argname, _DeprecationConfig(default, None) - ).default_value + value = kwds[argname] + + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: + pass + else: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) options[argname] = value if engine == "python-fwf": @@ -1630,22 +1518,6 @@ def _clean_options( validate_header_arg(options["header"]) - for arg in _deprecated_defaults.keys(): - parser_default = _c_parser_defaults.get(arg, parser_defaults[arg]) - depr_default = _deprecated_defaults[arg] - if result.get(arg, depr_default) != depr_default.default_value: - msg = ( - f"The {arg} argument has been deprecated and will be " - f"removed in a future version. {depr_default.msg}\n\n" - ) - warnings.warn( - msg, - FutureWarning, - stacklevel=find_stack_level(inspect.currentframe()), - ) - else: - result[arg] = parser_default - if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if is_index_col(index_col): @@ -1692,10 +1564,6 @@ def _clean_options( result["na_values"] = na_values result["na_fvalues"] = na_fvalues result["skiprows"] = skiprows - # Default for squeeze is none since we need to check - # if user sets it. We then set to False to preserve - # previous behavior. - result["squeeze"] = False if options["squeeze"] is None else options["squeeze"] return result, engine @@ -1801,9 +1669,6 @@ def read(self, nrows: int | None = None) -> DataFrame: df = DataFrame(col_dict, columns=columns, index=index) self._currow += new_rows - - if self.squeeze and len(df.columns) == 1: - return df.squeeze("columns").copy() return df def get_chunk(self, size: int | None = None) -> DataFrame: @@ -1869,8 +1734,6 @@ def TextParser(*args, **kwds) -> TextFileReader: transformed content. encoding : str, optional Encoding to use for UTF when reading/writing (ex. 'utf-8') - squeeze : bool, default False - returns Series if only one column. infer_datetime_format: bool, default False If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. If the format @@ -1968,11 +1831,8 @@ def _refine_defaults_read( delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, - error_bad_lines: bool | None, - warn_bad_lines: bool | None, on_bad_lines: str | Callable | None, names: Sequence[Hashable] | None | lib.NoDefault, - prefix: str | None | lib.NoDefault, defaults: dict[str, Any], ): """Validate/refine default values of input parameters of read_csv, read_table. @@ -1998,18 +1858,12 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. - error_bad_lines : str or None - Whether to error on a bad line or not. - warn_bad_lines : str or None - Whether to warn on a bad line or not. on_bad_lines : str, callable or None An option for handling bad lines or a sentinel value(None). names : array-like, optional List of column names to use. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. - prefix : str, optional - Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... defaults: dict Default values of input parameters. @@ -2023,8 +1877,6 @@ def _refine_defaults_read( ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and ``delim_whitespace=True``. - If on_bad_lines is specified(not ``None``) and ``error_bad_lines``/ - ``warn_bad_lines`` is True. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -2049,16 +1901,7 @@ def _refine_defaults_read( if delimiter and (sep is not lib.no_default): raise ValueError("Specified a sep and a delimiter; you can only specify one.") - if ( - names is not None - and names is not lib.no_default - and prefix is not None - and prefix is not lib.no_default - ): - raise ValueError("Specified named and prefix; you can only specify one.") - kwds["names"] = None if names is lib.no_default else names - kwds["prefix"] = None if prefix is lib.no_default else prefix # Alias sep -> delimiter. if delimiter is None: @@ -2089,17 +1932,10 @@ def _refine_defaults_read( kwds["engine"] = "c" kwds["engine_specified"] = False - # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines - # aren't specified at the same time. If so, raise. Otherwise, - # alias on_bad_lines to "error" if error/warn_bad_lines not set - # and on_bad_lines is not set. on_bad_lines is defaulted to None + # Alias on_bad_lines to "error" on_bad_lines is not set. + # on_bad_lines is defaulted to None # so we can tell if it is set (this is why this hack exists). if on_bad_lines is not None: - if error_bad_lines is not None or warn_bad_lines is not None: - raise ValueError( - "Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " - "Please only set on_bad_lines." - ) if on_bad_lines == "error": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR elif on_bad_lines == "warn": @@ -2115,27 +1951,7 @@ def _refine_defaults_read( else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") else: - if error_bad_lines is not None: - # Must check is_bool, because other stuff(e.g. non-empty lists) eval to true - validate_bool_kwarg(error_bad_lines, "error_bad_lines") - if error_bad_lines: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - else: - if warn_bad_lines is not None: - # This is the case where error_bad_lines is False - # We can only warn/skip if error_bad_lines is False - # None doesn't work because backwards-compatibility reasons - validate_bool_kwarg(warn_bad_lines, "warn_bad_lines") - if warn_bad_lines: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - else: - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP - else: - # Backwards compat, when only error_bad_lines = false, we warn - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - else: - # Everything None -> Error - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR return kwds From 790e44ac6ea3a8343c48d331ef3a4ec559e060e4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 29 Sep 2022 11:12:22 +0200 Subject: [PATCH 2/5] Remove tests --- pandas/io/parsers/readers.py | 21 ++--- .../io/parser/common/test_common_basic.py | 80 ------------------- .../io/parser/common/test_read_errors.py | 72 ++--------------- pandas/tests/io/parser/test_header.py | 22 +---- pandas/tests/io/parser/test_parse_dates.py | 46 +++-------- 5 files changed, 34 insertions(+), 207 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d7aeaaad23f82..0a0497520cbfd 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1385,16 +1385,19 @@ def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]: options[argname] = value for argname, default in _c_parser_defaults.items(): - value = kwds[argname] + if argname in kwds: + value = kwds[argname] - if engine != "c" and value != default: - if "python" in engine and argname not in _python_unsupported: - pass - else: - raise ValueError( - f"The {repr(argname)} option is not supported with the " - f"{repr(engine)} engine" - ) + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: + pass + else: + raise ValueError( + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" + ) + else: + value = default options[argname] = value if engine == "python-fwf": diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 359b059252556..05d92517a7951 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -21,7 +21,6 @@ from pandas import ( DataFrame, Index, - Series, Timestamp, compat, ) @@ -128,39 +127,6 @@ def test_1000_sep(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("squeeze", [True, False]) -def test_squeeze(all_parsers, squeeze): - data = """\ -a,1 -b,2 -c,3 -""" - parser = all_parsers - index = Index(["a", "b", "c"], name=0) - expected = Series([1, 2, 3], name=1, index=index) - - result = parser.read_csv_check_warnings( - FutureWarning, - "The squeeze argument has been deprecated " - "and will be removed in a future version. " - 'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n', - StringIO(data), - index_col=0, - header=None, - squeeze=squeeze, - ) - if not squeeze: - expected = DataFrame(expected) - tm.assert_frame_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - - # see gh-8217 - # - # Series should not be a view. - assert not result._is_view - - @xfail_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, @@ -832,36 +798,6 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@pytest.mark.parametrize("func", ["read_csv", "read_table"]) -def test_names_and_prefix_not_None_raises(all_parsers, func): - # GH#39123 - f = StringIO("a,b\n1,2") - parser = all_parsers - msg = "Specified named and prefix; you can only specify one." - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning): - getattr(parser, func)(f, names=["a", "b"], prefix="x") - - -@pytest.mark.parametrize("func", ["read_csv", "read_table"]) -@pytest.mark.parametrize("prefix, names", [(None, ["x0", "x1"]), ("x", None)]) -def test_names_and_prefix_explicit_None(all_parsers, names, prefix, func): - # GH42387 - f = StringIO("a,b\n1,2") - expected = DataFrame({"x0": ["a", "1"], "x1": ["b", "2"]}) - parser = all_parsers - if prefix is not None: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = getattr(parser, func)( - f, names=names, sep=",", prefix=prefix, header=None - ) - else: - result = getattr(parser, func)( - f, names=names, sep=",", prefix=prefix, header=None - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 @@ -894,22 +830,6 @@ def test_encoding_surrogatepass(all_parsers): parser.read_csv(path) -@xfail_pyarrow -@pytest.mark.parametrize("on_bad_lines", ["error", "warn"]) -def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines): - # GH 15122 - parser = all_parsers - kwds = {f"{on_bad_lines}_bad_lines": False} - parser.read_csv_check_warnings( - FutureWarning, - f"The {on_bad_lines}_bad_lines argument has been deprecated " - "and will be removed in a future version. " - "Use on_bad_lines in the future.\n\n", - csv1, - **kwds, - ) - - def test_malformed_second_line(all_parsers): # see GH14782 parser = all_parsers diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index fc30ebff0d93a..aec0d57bc0fc4 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -121,18 +121,6 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) -def test_read_csv_raises_on_header_prefix(all_parsers): - # gh-27394 - parser = all_parsers - msg = "Argument prefix must be None if argument header is not None" - - s = StringIO("0,1\n2,3") - - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning): - parser.read_csv(s, header=0, prefix="_X") - - def test_unexpected_keyword_parameter_exception(all_parsers): # GH-34976 parser = all_parsers @@ -144,66 +132,36 @@ def test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -@pytest.mark.parametrize( - "kwargs", - [ - pytest.param( - {"error_bad_lines": False, "warn_bad_lines": False}, - marks=pytest.mark.filterwarnings("ignore"), - ), - {"on_bad_lines": "skip"}, - ], -) -def test_suppress_error_output(all_parsers, capsys, kwargs): +def test_suppress_error_output(all_parsers, capsys): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), **kwargs) + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert captured.err == "" -@pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize( - "kwargs", - [{}, {"error_bad_lines": True}], # Default is True. # Explicitly pass in. -) -@pytest.mark.parametrize( - "warn_kwargs", - [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}], -) -def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): +def test_error_bad_lines(all_parsers): # see gh-15925 parser = all_parsers - kwargs.update(**warn_kwargs) data = "a\n1\n1,2,3\n4\n5,6,7" msg = "Expected 1 fields in line 3, saw 3" with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), **kwargs) - - -@pytest.mark.parametrize( - "kwargs", - [ - pytest.param( - {"error_bad_lines": False, "warn_bad_lines": True}, - marks=pytest.mark.filterwarnings("ignore"), - ), - {"on_bad_lines": "warn"}, - ], -) -def test_warn_bad_lines(all_parsers, capsys, kwargs): + parser.read_csv(StringIO(data), on_bad_lines="error") + + +def test_warn_bad_lines(all_parsers, capsys): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), **kwargs) + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) captured = capsys.readouterr() @@ -278,20 +236,6 @@ def test_invalid_on_bad_line(all_parsers): parser.read_csv(StringIO(data), on_bad_lines="abc") -@pytest.mark.parametrize("error_bad_lines", [True, False]) -@pytest.mark.parametrize("warn_bad_lines", [True, False]) -def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines): - parser = all_parsers - data = "a\n1\n1,2,3\n4\n5,6,7" - kwds = {"error_bad_lines": error_bad_lines, "warn_bad_lines": warn_bad_lines} - with pytest.raises( - ValueError, - match="Both on_bad_lines and error_bad_lines/warn_bad_lines are set. " - "Please only set on_bad_lines.", - ): - parser.read_csv(StringIO(data), on_bad_lines="error", **kwds) - - def test_bad_header_uniform_error(all_parsers): parser = all_parsers data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 4ded70db8bae7..5cb54bb4e2916 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -76,21 +76,6 @@ def test_bool_header_arg(all_parsers, header): parser.read_csv(StringIO(data), header=header) -def test_no_header_prefix(all_parsers): - parser = all_parsers - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = parser.read_csv(StringIO(data), prefix="Field", header=None) - expected = DataFrame( - [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], - columns=["Field0", "Field1", "Field2", "Field3", "Field4"], - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow def test_header_with_index_col(all_parsers): parser = all_parsers @@ -442,7 +427,6 @@ def test_read_only_header_no_rows(all_parsers, kwargs): "kwargs,names", [ ({}, [0, 1, 2, 3, 4]), - ({"prefix": "X"}, ["X0", "X1", "X2", "X3", "X4"]), ( {"names": ["foo", "bar", "baz", "quux", "panda"]}, ["foo", "bar", "baz", "quux", "panda"], @@ -458,11 +442,7 @@ def test_no_header(all_parsers, kwargs, names): expected = DataFrame( [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names ) - if "prefix" in kwargs.keys(): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = parser.read_csv(StringIO(data), header=None, **kwargs) - else: - result = parser.read_csv(StringIO(data), header=None, **kwargs) + result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9c8809b6099f9..08b3c5421bcdd 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -170,17 +170,11 @@ def date_parser(*date_cols): kwds = { "header": None, "date_parser": date_parser, - "prefix": "X", "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, "keep_date_col": keep_date_col, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } - result = parser.read_csv_check_warnings( - FutureWarning, - "The prefix argument has been deprecated " - "and will be removed in a future version. .*\n\n", - StringIO(data), - **kwds, - ) + result = parser.read_csv(StringIO(data), **kwds) expected = DataFrame( [ @@ -314,17 +308,11 @@ def test_multiple_date_col(all_parsers, keep_date_col): parser = all_parsers kwds = { "header": None, - "prefix": "X", "parse_dates": [[1, 2], [1, 3]], "keep_date_col": keep_date_col, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } - result = parser.read_csv_check_warnings( - FutureWarning, - "The prefix argument has been deprecated " - "and will be removed in a future version. .*\n\n", - StringIO(data), - **kwds, - ) + result = parser.read_csv(StringIO(data), **kwds) expected = DataFrame( [ @@ -437,14 +425,13 @@ def test_date_col_as_index_col(all_parsers): KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 """ parser = all_parsers - kwds = {"header": None, "prefix": "X", "parse_dates": [1], "index_col": 1} - result = parser.read_csv_check_warnings( - FutureWarning, - "The prefix argument has been deprecated " - "and will be removed in a future version. .*\n\n", - StringIO(data), - **kwds, - ) + kwds = { + "header": None, + "parse_dates": [1], + "index_col": 1, + "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"], + } + result = parser.read_csv(StringIO(data), **kwds) index = Index( [ @@ -494,17 +481,10 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): kwds = { "header": None, - "prefix": "X", "parse_dates": parse_dates, "date_parser": date_parser, } - result = parser.read_csv_check_warnings( - FutureWarning, - "The prefix argument has been deprecated " - "and will be removed in a future version. .*\n\n", - StringIO(data), - **kwds, - ) + result = parser.read_csv(StringIO(data), **kwds) expected = DataFrame( [ @@ -535,7 +515,7 @@ def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): -0.59, ], ], - columns=["actual", "nominal", "X0", "X4"], + columns=["actual", "nominal", 0, 4], ) # Python can sometimes be flaky about how From 422bd49c4fb9119a9a7f650cc147353da48e882e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 29 Sep 2022 13:17:26 +0200 Subject: [PATCH 3/5] Fix --- pandas/tests/io/excel/test_readers.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fa1d6bbfd5a7e..eb778c5ea864d 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1250,31 +1250,6 @@ def test_read_excel_nrows_params( ) tm.assert_frame_equal(actual, expected) - def test_read_excel_squeeze(self, read_ext): - # GH 12157 - f = "test_squeeze" + read_ext - - with tm.assert_produces_warning( - FutureWarning, - match="The squeeze argument has been deprecated " - "and will be removed in a future version. " - 'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n', - ): - actual = pd.read_excel( - f, sheet_name="two_columns", index_col=0, squeeze=True - ) - expected = Series([2, 3, 4], [4, 5, 6], name="b") - expected.index.name = "a" - tm.assert_series_equal(actual, expected) - - actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True) - expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(f, sheet_name="one_column", squeeze=True) - expected = Series([1, 2, 3], name="a") - tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, read_ext): with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False): pd.read_excel("test1" + read_ext, "Sheet1", 0) From cea3c7b2b7a8e8dcd623a14a2990415a44a9c5d8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 10:52:53 +0100 Subject: [PATCH 4/5] Adress review --- pandas/io/parsers/readers.py | 46 +++++++++++++++--------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index a010f665f2f02..f2b466b06e062 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1,5 +1,7 @@ """ Module contains tools for processing files into DataFrames or other objects + +GH#48849 provides a convenient way of deprecating keyword arguments """ from __future__ import annotations @@ -866,9 +868,7 @@ def read_csv( encoding_errors: str | None = "strict", dialect: str | csv.Dialect | None = None, # Error Handling - # TODO(2.0): set on_bad_lines to "error". - # See _refine_defaults_read comment for why we do this. - on_bad_lines=None, + on_bad_lines: str = "error", # Internal delim_whitespace: bool = False, low_memory=_c_parser_defaults["low_memory"], @@ -1187,9 +1187,7 @@ def read_table( encoding_errors: str | None = "strict", dialect: str | csv.Dialect | None = None, # Error Handling - # TODO(2.0): set on_bad_lines to "error". - # See _refine_defaults_read comment for why we do this. - on_bad_lines=None, + on_bad_lines: str = "error", # Internal delim_whitespace: bool = False, low_memory=_c_parser_defaults["low_memory"], @@ -1849,7 +1847,7 @@ def _refine_defaults_read( delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, - on_bad_lines: str | Callable | None, + on_bad_lines: str | Callable, names: Sequence[Hashable] | None | lib.NoDefault, defaults: dict[str, Any], ): @@ -1876,7 +1874,7 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. - on_bad_lines : str, callable or None + on_bad_lines : str, callable An option for handling bad lines or a sentinel value(None). names : array-like, optional List of column names to use. If the file contains a header row, @@ -1950,26 +1948,20 @@ def _refine_defaults_read( kwds["engine"] = "c" kwds["engine_specified"] = False - # Alias on_bad_lines to "error" on_bad_lines is not set. - # on_bad_lines is defaulted to None - # so we can tell if it is set (this is why this hack exists). - if on_bad_lines is not None: - if on_bad_lines == "error": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR - elif on_bad_lines == "warn": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN - elif on_bad_lines == "skip": - kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP - elif callable(on_bad_lines): - if engine != "python": - raise ValueError( - "on_bad_line can only be a callable function if engine='python'" - ) - kwds["on_bad_lines"] = on_bad_lines - else: - raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") - else: + if on_bad_lines == "error": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR + elif on_bad_lines == "warn": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN + elif on_bad_lines == "skip": + kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + elif callable(on_bad_lines): + if engine != "python": + raise ValueError( + "on_bad_line can only be a callable function if engine='python'" + ) + kwds["on_bad_lines"] = on_bad_lines + else: + raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") return kwds From fcdd379350047dd449b0feb9f222d3f27ceeb4e4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 31 Oct 2022 19:42:59 +0100 Subject: [PATCH 5/5] Add whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5614b7a2c0846..61dc8e6c33bb0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -193,6 +193,7 @@ Removal of prior version deprecations/changes - Removed argument ``sort_columns`` in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`) - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`) - Removed argument ``kind`` from :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs` (:issue:`41378`) +- Removed arguments ``prefix``, ``squeeze``, ``error_bad_lines`` and ``warn_bad_lines`` from :func:`read_csv` (:issue:`40413`, :issue:`43427`) - Disallow subclass-specific keywords (e.g. "freq", "tz", "names", "closed") in the :class:`Index` constructor (:issue:`38597`) - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`) - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)