diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 92bd239d51ae4..b4bf3ef024d4c 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -154,25 +154,6 @@ usecols : list-like or callable, default ``None``
     Using this parameter results in much faster parsing time and lower memory
     usage when using the c engine. The Python engine loads the data first
     before deciding which columns to drop.
-squeeze : boolean, default ``False``
-    If the parsed data only contains one column then return a ``Series``.
-
-    .. deprecated:: 1.4.0
-        Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
-        the data.
-prefix : str, default ``None``
-    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
-
-    .. deprecated:: 1.4.0
-        Use a list comprehension on the DataFrame's columns after calling ``read_csv``.
-
-        .. ipython:: python
-
-            data = "col1,col2,col3\na,b,1"
-
-            df = pd.read_csv(StringIO(data))
-            df.columns = [f"pre_{col}" for col in df.columns]
-            df
 mangle_dupe_cols : boolean, default ``True``
     Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than
     'X'...'X'.
@@ -395,23 +376,6 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None``
 Error handling
 ++++++++++++++

-error_bad_lines : boolean, optional, default ``None``
-    Lines with too many fields (e.g. a csv line with too many commas) will by
-    default cause an exception to be raised, and no ``DataFrame`` will be
-    returned. If ``False``, then these "bad lines" will dropped from the
-    ``DataFrame`` that is returned. See :ref:`bad lines <io.bad_lines>`
-    below.
-
-    .. deprecated:: 1.3.0
-       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
-       encountering a bad line instead.
-warn_bad_lines : boolean, optional, default ``None``
-    If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for
-    each "bad line" will be output.
-
-    .. deprecated:: 1.3.0
-       The ``on_bad_lines`` parameter should be used instead to specify behavior upon
-       encountering a bad line instead.
 on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error'
     Specifies what to do upon encountering a bad line (a line with too many fields).
     Allowed values are :
@@ -1221,37 +1185,6 @@ Infinity
 ``inf`` like values will be parsed as ``np.inf`` (positive infinity), and
 ``-inf`` as ``-np.inf`` (negative infinity). These will ignore the case of the
 value, meaning ``Inf``, will also be parsed as ``np.inf``.
-
-Returning Series
-''''''''''''''''
-
-Using the ``squeeze`` keyword, the parser will return output with a single column
-as a ``Series``:
-
-.. deprecated:: 1.4.0
-   Users should append ``.squeeze("columns")`` to the DataFrame returned by
-   ``read_csv`` instead.
-
-.. ipython:: python
-   :okwarning:
-
-   data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018"
-
-   with open("tmp.csv", "w") as fh:
-       fh.write(data)
-
-   print(open("tmp.csv").read())
-
-   output = pd.read_csv("tmp.csv", squeeze=True)
-   output
-
-   type(output)
-
-.. ipython:: python
-   :suppress:
-
-   os.remove("tmp.csv")
-
 .. _io.boolean:

 Boolean values
@@ -1708,8 +1641,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the
 * ``thousands``
 * ``memory_map``
 * ``dialect``
-* ``warn_bad_lines``
-* ``error_bad_lines``
 * ``on_bad_lines``
 * ``delim_whitespace``
 * ``quoting``
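The removed ``squeeze`` and ``prefix`` keywords both reduce to one-line post-processing steps. A minimal sketch of the replacements recommended by the deprecation notes above (the ``X`` prefix and the inline data are illustrative only):

```python
from io import StringIO

import pandas as pd

# squeeze=True replacement: squeeze the single-column result explicitly
ser = pd.read_csv(StringIO("level\n1\n2\n3")).squeeze("columns")

# prefix="X" replacement: rename the default integer columns after parsing
df = pd.read_csv(StringIO("1,2,3\n4,5,6"), header=None)
df.columns = [f"X{col}" for col in df.columns]
```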
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index d71160cdbc369..cfad5e2ffee9c 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -213,6 +213,7 @@ Removal of prior version deprecations/changes
 - Removed argument ``sort_columns`` in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`)
 - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`)
 - Removed argument ``kind`` from :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs` (:issue:`41378`)
+- Removed arguments ``prefix``, ``squeeze``, ``error_bad_lines`` and ``warn_bad_lines`` from :func:`read_csv` (:issue:`40413`, :issue:`43427`)
 - Removed argument ``datetime_is_numeric`` from :meth:`DataFrame.describe` and :meth:`Series.describe` as datetime data will always be summarized as numeric data (:issue:`34798`)
 - Disallow subclass-specific keywords (e.g. "freq", "tz", "names", "closed") in the :class:`Index` constructor (:issue:`38597`)
 - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
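For callers migrating off the removed flags, the old combinations map one-to-one onto ``on_bad_lines`` values; the mapping below mirrors the compatibility shim deleted from ``readers.py`` further down in this diff. A minimal sketch with illustrative data:

```python
from io import StringIO

import pandas as pd

# error_bad_lines=True                        -> on_bad_lines="error" (the new default)
# error_bad_lines=False, warn_bad_lines=True  -> on_bad_lines="warn"
# error_bad_lines=False, warn_bad_lines=False -> on_bad_lines="skip"
data = "a\n1\n1,2,3\n4\n5,6,7"
df = pd.read_csv(StringIO(data), on_bad_lines="skip")
print(df)  # only the rows with the expected field count remain: 1 and 4
```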
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 49b71efbfb6ec..6cc56bb1c8840 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -95,9 +95,7 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
         multi_index_named = True
         if self.header is None:
             if self.names is None:
-                if self.prefix is not None:
-                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
-                elif self.header is None:
+                if self.header is None:
                     self.names = range(num_cols)
             if len(self.names) != num_cols:
                 # usecols is passed through to pyarrow, we only handle index col here
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 63c7e8047407d..dd4e801af5894 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -97,7 +97,6 @@ def __init__(self, kwds) -> None:

         self.names = kwds.get("names")
         self.orig_names: list | None = None
-        self.prefix = kwds.pop("prefix", None)

         self.index_col = kwds.get("index_col", None)
         self.unnamed_cols: set = set()
@@ -155,11 +154,6 @@ def __init__(self, kwds) -> None:
                     "index_col must only contain row numbers "
                     "when specifying a multi-index header"
                 )
-        elif self.header is not None and self.prefix is not None:
-            # GH 27394
-            raise ValueError(
-                "Argument prefix must be None if argument header is not None"
-            )

         self._name_processed = False

@@ -1161,7 +1155,6 @@ def converter(*date_cols):
     "header": "infer",
     "index_col": None,
     "names": None,
-    "prefix": None,
     "skiprows": None,
     "skipfooter": 0,
     "nrows": None,
@@ -1185,15 +1178,12 @@ def converter(*date_cols):
     "chunksize": None,
     "verbose": False,
     "encoding": None,
-    "squeeze": None,
     "compression": None,
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
     "encoding_errors": "strict",
     "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
-    "error_bad_lines": None,
-    "warn_bad_lines": None,
     "use_nullable_dtypes": False,
 }
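Since the keywords are removed from the signatures and the ``parser_defaults`` dict outright rather than deprecated, passing them now fails fast. A sketch of the expected failure mode; the exact message comes from Python's own argument checking, not pandas:

```python
from io import StringIO

import pandas as pd

try:
    pd.read_csv(StringIO("a\n1"), squeeze=True)
except TypeError as err:
    print(err)  # e.g. "read_csv() got an unexpected keyword argument 'squeeze'"
```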
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 8ce671bcb03a2..c1f2e6ddb2388 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -71,8 +71,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
             "encoding",
             "memory_map",
             "compression",
-            "error_bad_lines",
-            "warn_bad_lines",
         ):
             kwds.pop(key, None)

@@ -102,16 +100,8 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:

         # error: Cannot determine type of 'names'
         if self.names is None:  # type: ignore[has-type]
-            if self.prefix:
-                # error: Cannot determine type of 'names'
-                self.names = [  # type: ignore[has-type]
-                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
-                ]
-            else:
-                # error: Cannot determine type of 'names'
-                self.names = list(  # type: ignore[has-type]
-                    range(self._reader.table_width)
-                )
+            # error: Cannot determine type of 'names'
+            self.names = list(range(self._reader.table_width))  # type: ignore[has-type]

         # gh-9755
         #
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 4f15443ed5610..e0d0bd50c09c8 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -536,10 +536,7 @@ def _infer_columns(
                 num_original_columns = ncols

         if not names:
-            if self.prefix:
-                columns = [[f"{self.prefix}{i}" for i in range(ncols)]]
-            else:
-                columns = [list(range(ncols))]
+            columns = [list(range(ncols))]
             columns = self._handle_usecols(
                 columns, columns[0], num_original_columns
             )
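With ``prefix`` gone, the pyarrow, C, and Python wrappers above all take the same branch for a header-less file without explicit ``names``: column labels are plain integers. A quick check with the default engine:

```python
from io import StringIO

import pandas as pd

df = pd.read_csv(StringIO("1,2\n3,4"), header=None)
print(list(df.columns))  # [0, 1] -- integer labels; "X0"-style names need explicit names=
```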
- result["squeeze"] = False if options["squeeze"] is None else options["squeeze"] return result, engine @@ -1811,9 +1685,6 @@ def read(self, nrows: int | None = None) -> DataFrame: df = DataFrame(col_dict, columns=columns, index=index) self._currow += new_rows - - if self.squeeze and len(df.columns) == 1: - return df.squeeze("columns").copy() return df def get_chunk(self, size: int | None = None) -> DataFrame: @@ -1879,8 +1750,6 @@ def TextParser(*args, **kwds) -> TextFileReader: transformed content. encoding : str, optional Encoding to use for UTF when reading/writing (ex. 'utf-8') - squeeze : bool, default False - returns Series if only one column. infer_datetime_format: bool, default False If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. If the format @@ -1978,11 +1847,8 @@ def _refine_defaults_read( delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, - error_bad_lines: bool | None, - warn_bad_lines: bool | None, - on_bad_lines: str | Callable | None, + on_bad_lines: str | Callable, names: Sequence[Hashable] | None | lib.NoDefault, - prefix: str | None | lib.NoDefault, defaults: dict[str, Any], ): """Validate/refine default values of input parameters of read_csv, read_table. @@ -2008,18 +1874,12 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. - error_bad_lines : str or None - Whether to error on a bad line or not. - warn_bad_lines : str or None - Whether to warn on a bad line or not. - on_bad_lines : str, callable or None + on_bad_lines : str, callable An option for handling bad lines or a sentinel value(None). names : array-like, optional List of column names to use. If the file contains a header row, then you should explicitly pass ``header=0`` to override the column names. Duplicates in this list are not allowed. - prefix : str, optional - Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... defaults: dict Default values of input parameters. @@ -2033,8 +1893,6 @@ def _refine_defaults_read( ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and ``delim_whitespace=True``. - If on_bad_lines is specified(not ``None``) and ``error_bad_lines``/ - ``warn_bad_lines`` is True. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -2059,16 +1917,7 @@ def _refine_defaults_read( if delimiter and (sep is not lib.no_default): raise ValueError("Specified a sep and a delimiter; you can only specify one.") - if ( - names is not None - and names is not lib.no_default - and prefix is not None - and prefix is not lib.no_default - ): - raise ValueError("Specified named and prefix; you can only specify one.") - kwds["names"] = None if names is lib.no_default else names - kwds["prefix"] = None if prefix is lib.no_default else prefix # Alias sep -> delimiter. if delimiter is None: @@ -2099,53 +1948,20 @@ def _refine_defaults_read( kwds["engine"] = "c" kwds["engine_specified"] = False - # Ensure that on_bad_lines and error_bad_lines/warn_bad_lines - # aren't specified at the same time. If so, raise. Otherwise, - # alias on_bad_lines to "error" if error/warn_bad_lines not set - # and on_bad_lines is not set. on_bad_lines is defaulted to None - # so we can tell if it is set (this is why this hack exists). 
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 876799c49e138..ee2a8f518cd56 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1308,31 +1308,6 @@ def test_read_excel_nrows_params(
         )
         tm.assert_frame_equal(actual, expected)

-    def test_read_excel_squeeze(self, read_ext):
-        # GH 12157
-        f = "test_squeeze" + read_ext
-
-        with tm.assert_produces_warning(
-            FutureWarning,
-            match="The squeeze argument has been deprecated "
-            "and will be removed in a future version. "
-            'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n',
-        ):
-            actual = pd.read_excel(
-                f, sheet_name="two_columns", index_col=0, squeeze=True
-            )
-            expected = Series([2, 3, 4], [4, 5, 6], name="b")
-            expected.index.name = "a"
-            tm.assert_series_equal(actual, expected)
-
-            actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
-            expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
-            tm.assert_frame_equal(actual, expected)
-
-            actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
-            expected = Series([1, 2, 3], name="a")
-            tm.assert_series_equal(actual, expected)
-
     def test_deprecated_kwargs(self, read_ext):
         with pytest.raises(TypeError, match="but 3 positional arguments"):
             pd.read_excel("test1" + read_ext, "Sheet1", 0)
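The deleted Excel test has the same replacement story as CSV: squeeze after reading. A hedged sketch; ``one_column.xlsx`` is a hypothetical single-column workbook, and an Excel reader such as ``openpyxl`` is assumed to be installed:

```python
import pandas as pd

# Was: pd.read_excel("one_column.xlsx", squeeze=True)
ser = pd.read_excel("one_column.xlsx").squeeze("columns")  # hypothetical file
```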
" - 'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n', - ): - actual = pd.read_excel( - f, sheet_name="two_columns", index_col=0, squeeze=True - ) - expected = Series([2, 3, 4], [4, 5, 6], name="b") - expected.index.name = "a" - tm.assert_series_equal(actual, expected) - - actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True) - expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(f, sheet_name="one_column", squeeze=True) - expected = Series([1, 2, 3], name="a") - tm.assert_series_equal(actual, expected) - def test_deprecated_kwargs(self, read_ext): with pytest.raises(TypeError, match="but 3 positional arguments"): pd.read_excel("test1" + read_ext, "Sheet1", 0) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 8997fc83eb5cf..24b18c8657546 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -21,7 +21,6 @@ from pandas import ( DataFrame, Index, - Series, Timestamp, compat, ) @@ -128,39 +127,6 @@ def test_1000_sep(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("squeeze", [True, False]) -def test_squeeze(all_parsers, squeeze): - data = """\ -a,1 -b,2 -c,3 -""" - parser = all_parsers - index = Index(["a", "b", "c"], name=0) - expected = Series([1, 2, 3], name=1, index=index) - - result = parser.read_csv_check_warnings( - FutureWarning, - "The squeeze argument has been deprecated " - "and will be removed in a future version. " - 'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n', - StringIO(data), - index_col=0, - header=None, - squeeze=squeeze, - ) - if not squeeze: - expected = DataFrame(expected) - tm.assert_frame_equal(result, expected) - else: - tm.assert_series_equal(result, expected) - - # see gh-8217 - # - # Series should not be a view. - assert not result._is_view - - @xfail_pyarrow def test_unnamed_columns(all_parsers): data = """A,B,C,, @@ -821,36 +787,6 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@pytest.mark.parametrize("func", ["read_csv", "read_table"]) -def test_names_and_prefix_not_None_raises(all_parsers, func): - # GH#39123 - f = StringIO("a,b\n1,2") - parser = all_parsers - msg = "Specified named and prefix; you can only specify one." 
@@ -821,36 +787,6 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
         parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


-@pytest.mark.parametrize("func", ["read_csv", "read_table"])
-def test_names_and_prefix_not_None_raises(all_parsers, func):
-    # GH#39123
-    f = StringIO("a,b\n1,2")
-    parser = all_parsers
-    msg = "Specified named and prefix; you can only specify one."
-    with pytest.raises(ValueError, match=msg):
-        with tm.assert_produces_warning(FutureWarning):
-            getattr(parser, func)(f, names=["a", "b"], prefix="x")
-
-
-@pytest.mark.parametrize("func", ["read_csv", "read_table"])
-@pytest.mark.parametrize("prefix, names", [(None, ["x0", "x1"]), ("x", None)])
-def test_names_and_prefix_explicit_None(all_parsers, names, prefix, func):
-    # GH42387
-    f = StringIO("a,b\n1,2")
-    expected = DataFrame({"x0": ["a", "1"], "x1": ["b", "2"]})
-    parser = all_parsers
-    if prefix is not None:
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            result = getattr(parser, func)(
-                f, names=names, sep=",", prefix=prefix, header=None
-            )
-    else:
-        result = getattr(parser, func)(
-            f, names=names, sep=",", prefix=prefix, header=None
-        )
-    tm.assert_frame_equal(result, expected)
-
-
 @xfail_pyarrow
 def test_dict_keys_as_names(all_parsers):
     # GH: 36928
@@ -883,22 +819,6 @@ def test_encoding_surrogatepass(all_parsers):
             parser.read_csv(path)


-@xfail_pyarrow
-@pytest.mark.parametrize("on_bad_lines", ["error", "warn"])
-def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines):
-    # GH 15122
-    parser = all_parsers
-    kwds = {f"{on_bad_lines}_bad_lines": False}
-    parser.read_csv_check_warnings(
-        FutureWarning,
-        f"The {on_bad_lines}_bad_lines argument has been deprecated "
-        "and will be removed in a future version. "
-        "Use on_bad_lines in the future.\n\n",
-        csv1,
-        **kwds,
-    )
-
-
 def test_malformed_second_line(all_parsers):
     # see GH14782
     parser = all_parsers
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index fc30ebff0d93a..aec0d57bc0fc4 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -121,18 +121,6 @@ def test_raise_on_no_columns(all_parsers, nrows):
         parser.read_csv(StringIO(data))


-def test_read_csv_raises_on_header_prefix(all_parsers):
-    # gh-27394
-    parser = all_parsers
-    msg = "Argument prefix must be None if argument header is not None"
-
-    s = StringIO("0,1\n2,3")
-
-    with pytest.raises(ValueError, match=msg):
-        with tm.assert_produces_warning(FutureWarning):
-            parser.read_csv(s, header=0, prefix="_X")
-
-
 def test_unexpected_keyword_parameter_exception(all_parsers):
     # GH-34976
     parser = all_parsers
@@ -144,66 +132,36 @@ def test_unexpected_keyword_parameter_exception(all_parsers):
         parser.read_table("foo.tsv", foo=1)


-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        pytest.param(
-            {"error_bad_lines": False, "warn_bad_lines": False},
-            marks=pytest.mark.filterwarnings("ignore"),
-        ),
-        {"on_bad_lines": "skip"},
-    ],
-)
-def test_suppress_error_output(all_parsers, capsys, kwargs):
+def test_suppress_error_output(all_parsers, capsys):
     # see gh-15925
     parser = all_parsers
     data = "a\n1\n1,2,3\n4\n5,6,7"
     expected = DataFrame({"a": [1, 4]})

-    result = parser.read_csv(StringIO(data), **kwargs)
+    result = parser.read_csv(StringIO(data), on_bad_lines="skip")
     tm.assert_frame_equal(result, expected)

     captured = capsys.readouterr()
     assert captured.err == ""


-@pytest.mark.filterwarnings("ignore")
-@pytest.mark.parametrize(
-    "kwargs",
-    [{}, {"error_bad_lines": True}],  # Default is True.  # Explicitly pass in.
-)
-@pytest.mark.parametrize(
-    "warn_kwargs",
-    [{}, {"warn_bad_lines": True}, {"warn_bad_lines": False}],
-)
-def test_error_bad_lines(all_parsers, kwargs, warn_kwargs):
+def test_error_bad_lines(all_parsers):
     # see gh-15925
     parser = all_parsers
-    kwargs.update(**warn_kwargs)
     data = "a\n1\n1,2,3\n4\n5,6,7"
     msg = "Expected 1 fields in line 3, saw 3"

     with pytest.raises(ParserError, match=msg):
-        parser.read_csv(StringIO(data), **kwargs)
-
-
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        pytest.param(
-            {"error_bad_lines": False, "warn_bad_lines": True},
-            marks=pytest.mark.filterwarnings("ignore"),
-        ),
-        {"on_bad_lines": "warn"},
-    ],
-)
-def test_warn_bad_lines(all_parsers, capsys, kwargs):
+        parser.read_csv(StringIO(data), on_bad_lines="error")
+
+
+def test_warn_bad_lines(all_parsers, capsys):
     # see gh-15925
     parser = all_parsers
     data = "a\n1\n1,2,3\n4\n5,6,7"
     expected = DataFrame({"a": [1, 4]})

-    result = parser.read_csv(StringIO(data), **kwargs)
+    result = parser.read_csv(StringIO(data), on_bad_lines="warn")
     tm.assert_frame_equal(result, expected)

     captured = capsys.readouterr()
@@ -278,20 +236,6 @@ def test_invalid_on_bad_line(all_parsers):
         parser.read_csv(StringIO(data), on_bad_lines="abc")


-@pytest.mark.parametrize("error_bad_lines", [True, False])
-@pytest.mark.parametrize("warn_bad_lines", [True, False])
-def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines):
-    parser = all_parsers
-    data = "a\n1\n1,2,3\n4\n5,6,7"
-    kwds = {"error_bad_lines": error_bad_lines, "warn_bad_lines": warn_bad_lines}
-    with pytest.raises(
-        ValueError,
-        match="Both on_bad_lines and error_bad_lines/warn_bad_lines are set. "
-        "Please only set on_bad_lines.",
-    ):
-        parser.read_csv(StringIO(data), on_bad_lines="error", **kwds)
-
-
 def test_bad_header_uniform_error(all_parsers):
     parser = all_parsers
     data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"
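As the rewritten tests assert, ``on_bad_lines="warn"`` keeps the parseable rows and reports each offending line on stderr instead of raising. A sketch (the exact message wording may vary by engine):

```python
from io import StringIO

import pandas as pd

data = "a\n1\n1,2,3\n4\n5,6,7"
df = pd.read_csv(StringIO(data), on_bad_lines="warn")
# stderr: e.g. "Skipping line 3: expected 1 fields, saw 3"
print(df)  # a -> [1, 4]
```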
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
index 4ded70db8bae7..5cb54bb4e2916 100644
--- a/pandas/tests/io/parser/test_header.py
+++ b/pandas/tests/io/parser/test_header.py
@@ -76,21 +76,6 @@ def test_bool_header_arg(all_parsers, header):
         parser.read_csv(StringIO(data), header=header)


-def test_no_header_prefix(all_parsers):
-    parser = all_parsers
-    data = """1,2,3,4,5
-6,7,8,9,10
-11,12,13,14,15
-"""
-    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-        result = parser.read_csv(StringIO(data), prefix="Field", header=None)
-    expected = DataFrame(
-        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
-        columns=["Field0", "Field1", "Field2", "Field3", "Field4"],
-    )
-    tm.assert_frame_equal(result, expected)
-
-
 @skip_pyarrow
 def test_header_with_index_col(all_parsers):
     parser = all_parsers
@@ -442,7 +427,6 @@ def test_read_only_header_no_rows(all_parsers, kwargs):
     "kwargs,names",
     [
         ({}, [0, 1, 2, 3, 4]),
-        ({"prefix": "X"}, ["X0", "X1", "X2", "X3", "X4"]),
         (
             {"names": ["foo", "bar", "baz", "quux", "panda"]},
             ["foo", "bar", "baz", "quux", "panda"],
@@ -458,11 +442,7 @@ def test_no_header(all_parsers, kwargs, names):
     expected = DataFrame(
         [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
     )
-    if "prefix" in kwargs.keys():
-        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
-            result = parser.read_csv(StringIO(data), header=None, **kwargs)
-    else:
-        result = parser.read_csv(StringIO(data), header=None, **kwargs)
+    result = parser.read_csv(StringIO(data), header=None, **kwargs)
     tm.assert_frame_equal(result, expected)
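The updated header tests show the idiomatic ``prefix`` replacement when the old ``Field0``-style labels are wanted up front: pass explicit ``names``. A minimal sketch:

```python
from io import StringIO

import pandas as pd

data = "1,2,3,4,5\n6,7,8,9,10\n11,12,13,14,15\n"
df = pd.read_csv(
    StringIO(data),
    header=None,
    names=[f"Field{i}" for i in range(5)],  # was: prefix="Field"
)
```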
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 890819d42841e..d81c1449bea63 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -169,17 +169,11 @@ def date_parser(*date_cols):
     kwds = {
         "header": None,
         "date_parser": date_parser,
-        "prefix": "X",
         "parse_dates": {"actual": [1, 2], "nominal": [1, 3]},
         "keep_date_col": keep_date_col,
+        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
     }
-    result = parser.read_csv_check_warnings(
-        FutureWarning,
-        "The prefix argument has been deprecated "
-        "and will be removed in a future version. .*\n\n",
-        StringIO(data),
-        **kwds,
-    )
+    result = parser.read_csv(StringIO(data), **kwds)

     expected = DataFrame(
         [
@@ -313,17 +307,11 @@ def test_multiple_date_col(all_parsers, keep_date_col):
     parser = all_parsers
     kwds = {
         "header": None,
-        "prefix": "X",
         "parse_dates": [[1, 2], [1, 3]],
         "keep_date_col": keep_date_col,
+        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
     }
-    result = parser.read_csv_check_warnings(
-        FutureWarning,
-        "The prefix argument has been deprecated "
-        "and will be removed in a future version. .*\n\n",
-        StringIO(data),
-        **kwds,
-    )
+    result = parser.read_csv(StringIO(data), **kwds)

     expected = DataFrame(
         [
@@ -436,14 +424,13 @@ def test_date_col_as_index_col(all_parsers):
 KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
 """
     parser = all_parsers
-    kwds = {"header": None, "prefix": "X", "parse_dates": [1], "index_col": 1}
-    result = parser.read_csv_check_warnings(
-        FutureWarning,
-        "The prefix argument has been deprecated "
-        "and will be removed in a future version. .*\n\n",
-        StringIO(data),
-        **kwds,
-    )
+    kwds = {
+        "header": None,
+        "parse_dates": [1],
+        "index_col": 1,
+        "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7"],
+    }
+    result = parser.read_csv(StringIO(data), **kwds)

     index = Index(
         [
@@ -489,17 +476,10 @@ def test_multiple_date_cols_int_cast(all_parsers):

     kwds = {
         "header": None,
-        "prefix": "X",
         "parse_dates": parse_dates,
         "date_parser": pd.to_datetime,
     }
-    result = parser.read_csv_check_warnings(
-        FutureWarning,
-        "The prefix argument has been deprecated "
-        "and will be removed in a future version. .*\n\n",
-        StringIO(data),
-        **kwds,
-    )
+    result = parser.read_csv(StringIO(data), **kwds)

     expected = DataFrame(
         [
@@ -530,7 +510,7 @@ def test_multiple_date_cols_int_cast(all_parsers):
                 -0.59,
             ],
         ],
-        columns=["actual", "nominal", "X0", "X4"],
+        columns=["actual", "nominal", 0, 4],
     )

     # Python can sometimes be flaky about how