From 0ee992e8e27d45b53e4b9cc63f67f4119dc7a6d3 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 16 Apr 2024 18:47:27 +0200 Subject: [PATCH 1/3] remove delim_whitespace from read_table, read_csv --- pandas/io/parsers/readers.py | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 70f9a68244164..7f015d5d671c3 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -783,7 +783,6 @@ def read_csv( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, @@ -833,17 +832,6 @@ def read_csv( stacklevel=find_stack_level(), ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -852,12 +840,11 @@ def read_csv( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, names, - defaults={"delimiter": ","}, + defaults={"delimiter": ",", "delim_whitespace": False}, dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) @@ -974,7 +961,6 @@ def read_table( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy", "round_trip"] | None = None, @@ -1015,17 +1001,6 @@ def read_table( stacklevel=find_stack_level(), ) - if delim_whitespace is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_table is deprecated and " - "will be removed in a future version. Use ``sep='\\s+'`` instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - delim_whitespace = False - # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1034,12 +1009,11 @@ def read_table( kwds_defaults = _refine_defaults_read( dialect, delimiter, - delim_whitespace, engine, sep, on_bad_lines, names, - defaults={"delimiter": "\t"}, + defaults={"delimiter": "\t", "delim_whitespace": False}, dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) From 107a6c3f66bb78c4a4eaad6c74fb55da2fac235f Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Wed, 17 Apr 2024 14:00:00 +0200 Subject: [PATCH 2/3] remove delim_whitespace from _read_shared, _refine_defaults_read, _clean_options, _C_Parser_Defaults --- pandas/io/parsers/readers.py | 26 ++++--------- .../io/parser/common/test_common_basic.py | 37 ++++--------------- pandas/tests/io/parser/test_header.py | 8 +--- pandas/tests/io/parser/test_read_fwf.py | 8 +--- .../io/parser/usecols/test_usecols_basic.py | 22 ++--------- 5 files changed, 21 insertions(+), 80 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7f015d5d671c3..fd4c8ac8197bd 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -137,7 +137,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): encoding_errors: str | None dialect: str | csv.Dialect | None on_bad_lines: str - delim_whitespace: bool | lib.NoDefault low_memory: bool memory_map: bool float_precision: Literal["high", "legacy", "round_trip"] | None @@ -517,7 +516,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): class _C_Parser_Defaults(TypedDict): - delim_whitespace: Literal[False] na_filter: Literal[True] low_memory: Literal[True] memory_map: Literal[False] @@ -525,7 +523,6 @@ class _C_Parser_Defaults(TypedDict): _c_parser_defaults: _C_Parser_Defaults = { - "delim_whitespace": False, "na_filter": True, "low_memory": True, "memory_map": False, @@ -551,7 +548,6 @@ class _Fwf_Defaults(TypedDict): "thousands", "memory_map", "dialect", - "delim_whitespace", "quoting", "lineterminator", "converters", @@ -844,7 +840,7 @@ def read_csv( sep, on_bad_lines, names, - defaults={"delimiter": ",", "delim_whitespace": False}, + defaults={"delimiter": ","}, dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) @@ -1013,7 +1009,7 @@ def read_table( sep, on_bad_lines, names, - defaults={"delimiter": "\t", "delim_whitespace": False}, + defaults={"delimiter": "\t"}, dtype_backend=dtype_backend, ) kwds.update(kwds_defaults) @@ -1315,9 +1311,8 @@ def _clean_options( engine = "python" sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - if sep is None and not delim_whitespace: + if sep is None: if engine in ("c", "pyarrow"): fallback_reason = ( f"the '{engine}' engine does not support " @@ -1326,7 +1321,6 @@ def _clean_options( engine = "python" elif sep is not None and len(sep) > 1: if engine == "c" and sep == r"\s+": - result["delim_whitespace"] = True del result["delimiter"] elif engine not in ("python", "python-fwf"): # wait until regex engine integrated @@ -1336,9 +1330,6 @@ def _clean_options( r"different from '\s+' are interpreted as regex)" ) engine = "python" - elif delim_whitespace: - if "python" in engine: - result["delimiter"] = r"\s+" elif sep is not None: encodeable = True encoding = sys.getfilesystemencoding() or "utf-8" @@ -1753,7 +1744,6 @@ def _stringify_na_values(na_values, floatify: bool) -> set[str | float]: def _refine_defaults_read( dialect: str | csv.Dialect | None, delimiter: str | None | lib.NoDefault, - delim_whitespace: bool, engine: CSVEngine | None, sep: str | None | lib.NoDefault, on_bad_lines: str | Callable, @@ -1836,11 +1826,11 @@ def _refine_defaults_read( if delimiter is None: delimiter = sep - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) + # if delim_whitespace and (delimiter is not lib.no_default): + # raise ValueError( + # "Specified a delimiter with both sep and " + # "delim_whitespace=True; you can only specify one." + # ) if delimiter == "\n": raise ValueError( diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 485680d9de48c..a5185f63ef174 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -480,7 +480,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): ( { "header": None, - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [0, 1, 2, 3, 5, 6], "skip_blank_lines": True, }, @@ -489,7 +489,7 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): # gh-8983: test skipping set of rows after a row with trailing spaces. ( { - "delim_whitespace": True, + "sep": r"\s+", "skiprows": [1, 2, 3, 5, 6], "skip_blank_lines": True, }, @@ -501,22 +501,11 @@ def test_trailing_spaces(all_parsers, kwargs, expected): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - return + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - tm.assert_frame_equal(result, expected) + # result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + # tm.assert_frame_equal(result, expected) def test_raise_on_sep_with_delim_whitespace(all_parsers): @@ -815,25 +804,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser.read_csv(StringIO(data), names=set("QAZ")) +@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers - depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_table(f, delim_whitespace=True) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_table(f, delim_whitespace=True) + result = parser.read_table(f, sep=r"\s+") expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 85ce55b3bcf83..3aadc3cd5f15f 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -682,7 +682,6 @@ def test_header_missing_rows(all_parsers): parser.read_csv(StringIO(data), header=[0, 1, 2]) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine @xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 @@ -695,7 +694,6 @@ def test_header_multiple_whitespaces(all_parsers): tm.assert_frame_equal(result, expected) -# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine @xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 @@ -705,11 +703,7 @@ def test_header_delim_whitespace(all_parsers): 3,4 """ - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), delim_whitespace=True) + result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index b62fcc04c375c..d46f7d0004935 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -593,9 +593,7 @@ def test_skiprows_inference(): """.strip() skiprows = 2 - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -611,9 +609,7 @@ def test_skiprows_by_index_inference(): """.strip() skiprows = [0, 2] - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+") result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d55066d2d70bb..19249cace5f9d 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -254,30 +254,14 @@ def test_usecols_regex_sep(all_parsers): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" - - if parser.engine == "pyarrow": - msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) - return - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), delim_whitespace=True, usecols=("a", "b") - ) + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) + print(result) tm.assert_frame_equal(result, expected) From 3b9e24da579b41d22da8c56cb9b0ac24ecae4afb Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Wed, 17 Apr 2024 18:33:16 +0200 Subject: [PATCH 3/3] remove temporary comments --- pandas/io/parsers/readers.py | 6 ------ pandas/tests/io/parser/common/test_common_basic.py | 4 ++-- pandas/tests/io/parser/usecols/test_usecols_basic.py | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index fd4c8ac8197bd..df66fefc58903 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1826,12 +1826,6 @@ def _refine_defaults_read( if delimiter is None: delimiter = sep - # if delim_whitespace and (delimiter is not lib.no_default): - # raise ValueError( - # "Specified a delimiter with both sep and " - # "delim_whitespace=True; you can only specify one." - # ) - if delimiter == "\n": raise ValueError( r"Specified \n as separator or delimiter. This forces the python engine " diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index a5185f63ef174..8f826caa800b0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -504,8 +504,8 @@ def test_trailing_spaces(all_parsers, kwargs, expected): if parser.engine == "pyarrow": parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - # result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) - # tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) def test_raise_on_sep_with_delim_whitespace(all_parsers): diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 19249cace5f9d..3e08be9664fa1 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -261,7 +261,6 @@ def test_usecols_with_whitespace(all_parsers): result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) - print(result) tm.assert_frame_equal(result, expected)