diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 0398d88da1b20..dc06dd9620c24 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1511,7 +1511,6 @@ Currently, options unsupported by the C and pyarrow engines include: * ``sep`` other than a single character (e.g. regex separators) * ``skipfooter`` -* ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. @@ -1526,7 +1525,6 @@ Options that are unsupported by the pyarrow engine which are not covered by the * ``memory_map`` * ``dialect`` * ``on_bad_lines`` -* ``delim_whitespace`` * ``quoting`` * ``lineterminator`` * ``converters`` diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ce44050a99a79..5691531f6ef5b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -242,6 +242,7 @@ Removal of prior version deprecations/changes - Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`) - Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`) - Removed the "closed" and "normalize" keywords in :meth:`DatetimeIndex.__new__` (:issue:`52628`) +- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`) - Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`) - Stopped performing dtype inference with in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`) - Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index f01fe8ecef930..c8863e1b39c94 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -229,7 +229,6 @@ class ParserWarning(Warning): 1. `sep` other than a single character (e.g. regex separators) 2. `skipfooter` higher than 0 - 3. `sep=None` with `delim_whitespace=False` The warning can be avoided by adding `engine='python'` as a parameter in `pd.read_csv` and `pd.read_table` methods. diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index aa20ec237e968..5a0a8c321e629 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -113,9 +113,8 @@ def read_clipboard( if index_length != 0: kwargs.setdefault("index_col", list(range(index_length))) - # Edge case where sep is specified to be None, return to default - if sep is None and kwargs.get("delim_whitespace") is None: - sep = r"\s+" + elif not isinstance(sep, str): + raise ValueError(f"{sep=} must be a string") # Regex separator currently only works with python engine. # Default to python if separator is multi-character (regex) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c87912dc6f24a..413f15f389105 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -133,7 +133,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): encoding_errors: str | None dialect: str | csv.Dialect | None on_bad_lines: str - delim_whitespace: bool | lib.NoDefault low_memory: bool memory_map: bool float_precision: Literal["high", "legacy", "round_trip"] | None @@ -425,14 +424,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Callable for ``engine='pyarrow'`` -delim_whitespace : bool, default False - Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be - used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option - is set to ``True``, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed @@ -558,7 +549,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] na_filter: Literal[True] low_memory: Literal[True] memory_map: Literal[False] @@ -566,7 +556,6 @@ class _C_Parser_Defaults(TypedDict): _c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, "na_filter": True, "low_memory": True, "memory_map": False, @@ -592,7 +581,6 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "delim_whitespace", "quoting", "lineterminator", "converters", @@ -818,24 +806,12 @@ def read_csv( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -844,7 +820,6 @@ def read_csv( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -963,24 +938,12 @@ def read_table( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -989,7 +952,6 @@ def read_table( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, @@ -1296,17 +1258,10 @@ def _clean_options( engine = "python" sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: - if engine in ("c", "pyarrow"): - fallback_reason = ( - f"the '{engine}' engine does not support " - "sep=None with delim_whitespace=False" - ) - engine = "python" - elif sep is not None and len(sep) > 1: + if sep is not None and len(sep) > 1: if engine == "c" and sep == r"\s+": + # delim_whitespace passed on to pandas._libs.parsers.TextReader result["delim_whitespace"] = True del result["delimiter"] elif engine not in ("python", "python-fwf"): @@ -1317,9 +1272,6 @@ def _clean_options( r"different from '\s+' are interpreted as regex)" ) engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" elif sep is not None: encodeable = True encoding = sys.getfilesystemencoding() or "utf-8" @@ -1730,7 +1682,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: def _refine_defaults_read( dialect: str | csv.Dialect | None, delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, on_bad_lines: str | Callable, @@ -1750,14 +1701,6 @@ def _refine_defaults_read( documentation for more details. delimiter : str or object Alias for sep. - delim_whitespace : bool - Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\\s+'``. If this option - is set to True, nothing should be passed in for the ``delimiter`` - parameter. - - .. deprecated:: 2.2.0 - Use ``sep="\\s+"`` instead. engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -1777,12 +1720,6 @@ def _refine_defaults_read( ------- kwds : dict Input parameters with correct values. - - Raises - ------ - ValueError : - If a delimiter was specified with ``sep`` (or ``delimiter``) and - ``delim_whitespace=True``. """ # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] @@ -1813,12 +1750,6 @@ def _refine_defaults_read( if delimiter is None: delimiter = sep - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - if delimiter == "\n": raise ValueError( r"Specified \n as separator or delimiter. This forces the python engine " diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index d79e0c34edaab..df76b46cc6a7b 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -422,65 +422,43 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): @pytest.mark.parametrize( - "kwargs,expected", + "kwargs,expected_data", [ # gh-8661, gh-8679: this should ignore six lines, including # lines with trailing whitespace and blank lines. ( { "header": None, - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [0, 1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + [[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]], ), # gh-8983: test skipping set of rows after a row with trailing spaces. ( { - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [1, 2, 3, 5, 6], "skip_blank_lines": True, }, - DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + {"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}, ), ], ) -def test_trailing_spaces(all_parsers, kwargs, expected): +def test_trailing_spaces(all_parsers, kwargs, expected_data): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + with pytest.raises(ValueError, match="the 'pyarrow' engine does not support"): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + expected = DataFrame(expected_data) + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) -def test_raise_on_sep_with_delim_whitespace(all_parsers): - # see gh-6607 - data = "a b c\n1 2 3" - parser = all_parsers - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with pytest.raises(ValueError, match="you can only specify one"): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) - - def test_read_filepath_or_buffer(all_parsers): # see gh-43366 parser = all_parsers @@ -489,8 +467,7 @@ def test_read_filepath_or_buffer(all_parsers): parser.read_csv(filepath_or_buffer=b"input") -@pytest.mark.parametrize("delim_whitespace", [True, False]) -def test_single_char_leading_whitespace(all_parsers, delim_whitespace): +def test_single_char_leading_whitespace(all_parsers): # see gh-9710 parser = all_parsers data = """\ @@ -500,28 +477,16 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): a b\n""" - expected = DataFrame({"MyColumn": list("abab")}) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), - skipinitialspace=True, - delim_whitespace=delim_whitespace, - ) + parser.read_csv( + StringIO(data), + skipinitialspace=True, + ) return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv(StringIO(data), skipinitialspace=True, sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -764,49 +729,6 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) -def test_read_table_delim_whitespace_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_table(f, delim_whitespace=True) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_table(f, delim_whitespace=True) - expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) - - def test_read_csv_delimiter_and_sep_no_default(all_parsers): # GH#39823 f = StringIO("a,b\n1,2") @@ -832,26 +754,6 @@ def test_read_csv_line_break_as_separator(kwargs, all_parsers): parser.read_csv(StringIO(data), **kwargs) -@pytest.mark.parametrize("delimiter", [",", "\t"]) -def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) - - @skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index bd47e045417ce..ed2e729430b01 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -250,19 +250,17 @@ def test_null_byte_char(request, all_parsers): @pytest.mark.filterwarnings("always::ResourceWarning") -def test_open_file(request, all_parsers): +def test_open_file(all_parsers): # GH 39024 parser = all_parsers msg = "Could not determine delimiter" err = csv.Error if parser.engine == "c": - msg = "the 'c' engine does not support sep=None with delim_whitespace=False" - err = ValueError + msg = "object of type 'NoneType' has no len" + err = TypeError elif parser.engine == "pyarrow": - msg = ( - "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" - ) + msg = "'utf-8' codec can't decode byte 0xe4" err = ValueError with tm.ensure_clean() as path: diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index ab2e1ee138315..39718ca2ec134 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -53,11 +53,7 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + df = parser.read_csv(StringIO(data), lineterminator="~", sep=r"\s+") expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index ca8df520b171e..ba27b170aecdc 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -31,10 +31,8 @@ def test_comment(all_parsers, na_values): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}] -) -def test_line_comment(all_parsers, read_kwargs, request): +@pytest.mark.parametrize("read_kwargs", [{}, {"lineterminator": "*"}, {"sep": r"\s+"}]) +def test_line_comment(all_parsers, read_kwargs): parser = all_parsers data = """# empty A,B,C @@ -42,12 +40,8 @@ def test_line_comment(all_parsers, read_kwargs, request): #ignore this line 5.,NaN,10.0 """ - warn = None - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if read_kwargs.get("delim_whitespace"): + if read_kwargs.get("sep"): data = data.replace(",", " ") - warn = FutureWarning elif read_kwargs.get("lineterminator"): data = data.replace("\n", read_kwargs.get("lineterminator")) @@ -60,23 +54,15 @@ def test_line_comment(all_parsers, read_kwargs, request): else: msg = "The 'comment' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return elif parser.engine == "python" and read_kwargs.get("lineterminator"): msg = r"Custom line terminators not supported in python parser \(yet\)" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - warn, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), **read_kwargs) + parser.read_csv(StringIO(data), **read_kwargs) return - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - result = parser.read_csv(StringIO(data), **read_kwargs) - + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 85ce55b3bcf83..b7e3a13ec28b8 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -682,7 +682,7 @@ def test_header_missing_rows(all_parsers): parser.read_csv(StringIO(data), header=[0, 1, 2]) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 @@ -695,7 +695,7 @@ def test_header_multiple_whitespaces(all_parsers): tm.assert_frame_equal(result, expected) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +# ValueError: the 'pyarrow' engine does not support regex separators @xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 @@ -704,12 +704,7 @@ def test_header_delim_whitespace(all_parsers): 1,2 3,4 """ - - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), delim_whitespace=True) + result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 547afb1b25a04..0a9f6bd83e0d9 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -580,10 +580,7 @@ def test_skiprows_inference(): """.strip() skiprows = 2 - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -598,10 +595,7 @@ def test_skiprows_by_index_inference(): """.strip() skiprows = [0, 2] - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) - + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 3cd2351f84c7a..17a806d05fe28 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -187,7 +187,7 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported +@xfail_pyarrow # ValueError: the 'pyarrow' engine does not support regex separators @pytest.mark.parametrize( "lineterminator", ["\n", "\r\n", "\r"], # "LF" # "CRLF" # "CR" @@ -218,16 +218,12 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request): data = data.replace("\n", lineterminator) - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) + result = parser.read_csv( + StringIO(data), + skiprows=1, + sep=r"\s+", + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 44a55cf3be240..07f84466e3ac2 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -44,12 +44,7 @@ def test_c_engine(self): data = "a b c\n1 2 3" msg = "does not support" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - # specify C engine with unsupported options (raise) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): @@ -58,8 +53,6 @@ def test_c_engine(self): read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): - read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=r"\s") with tm.assert_produces_warning(parsers.ParserWarning): @@ -154,14 +147,8 @@ def test_pyarrow_engine(self): elif default == "on_bad_lines": kwargs[default] = "warn" - warn = None - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if "delim_whitespace" in kwargs: - warn = FutureWarning - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg): - read_csv(StringIO(data), engine="pyarrow", **kwargs) + read_csv(StringIO(data), engine="pyarrow", **kwargs) def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d55066d2d70bb..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -158,7 +158,8 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@skip_pyarrow # CSV parse error in one case, AttributeError in another +# ArrowKeyError: Column 'a' in include_columns does not exist in CSV file +@skip_pyarrow @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -254,29 +255,12 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@skip_pyarrow # Column 'a' in include_columns does not exist in CSV file def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) - return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected)