From 8094e0b4606b41f75a26eee814252c8127ad179d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 19:05:16 -0800 Subject: [PATCH 1/2] CI: Skip pyarrow csv tests where parsing fails --- .github/workflows/unit-tests.yml | 6 +++--- .../tests/io/parser/common/test_file_buffer_url.py | 6 ++++-- pandas/tests/io/parser/test_encoding.py | 4 ++-- pandas/tests/io/parser/test_header.py | 4 ++-- pandas/tests/io/parser/test_parse_dates.py | 12 +++++++----- pandas/tests/io/parser/usecols/test_usecols_basic.py | 12 ++++-------- 6 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 9785b81ae9e0b..d4de92c987574 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -23,7 +23,7 @@ defaults: jobs: ubuntu: runs-on: ubuntu-22.04 - timeout-minutes: 180 + timeout-minutes: 120 strategy: matrix: env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] @@ -177,7 +177,7 @@ jobs: if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} macos-windows: - timeout-minutes: 180 + timeout-minutes: 120 strategy: matrix: os: [macos-latest, windows-latest] @@ -322,7 +322,7 @@ jobs: matrix: os: [ubuntu-22.04, macOS-latest, windows-latest] - timeout-minutes: 180 + timeout-minutes: 120 concurrency: #https://github.community/t/concurrecy-not-work-for-push/183068/7 diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index a6e68cb984ef4..69c39fdf4cdbe 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -222,8 +222,10 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request): return if parser.engine == "pyarrow" and "\r" not in data: - mark = pytest.mark.xfail(reason="Mismatched exception type/message") - request.applymarker(mark) + # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: + # ValueError: skiprows argument must be an integer when using engine='pyarrow' + # AssertionError: Regex pattern did not match. + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") if expected is None: with pytest.raises(ParserError, match=msg): diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 3580c040688d8..36d7a19cf6781 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -130,8 +130,8 @@ def _encode_data_with_bom(_data): and data == "\n1" and kwargs.get("skip_blank_lines", True) ): - # Manually xfail, since we don't have mechanism to xfail specific version - request.applymarker(pytest.mark.xfail(reason="Pyarrow can't read blank lines")) + # CSV parse error: Empty CSV file or block: cannot infer number of columns + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index f55f8497f318c..86162bf90db8b 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -411,7 +411,7 @@ def test_header_names_backward_compat(all_parsers, data, header, request): parser = all_parsers if parser.engine == "pyarrow" and header is not None: - mark = pytest.mark.xfail(reason="mismatched index") + mark = pytest.mark.xfail(reason="DataFrame.columns are different") request.applymarker(mark) expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) @@ -635,7 +635,7 @@ def test_header_none_and_implicit_index(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " +@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " def test_header_none_and_implicit_index_in_second_row(all_parsers): # GH#22144 parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 70d9171fa3c22..30580423a3099 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1753,7 +1753,7 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # pandas.errors.ParserError: CSV parse error +@skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1787,8 +1787,8 @@ def test_parse_delimited_date_swap_no_warning( expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") if parser.engine == "pyarrow": if not dayfirst: - mark = pytest.mark.xfail(reason="CSV parse error: Empty CSV file or block") - request.applymarker(mark) + # "CSV parse error: Empty CSV file or block" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv( @@ -1802,7 +1802,8 @@ def test_parse_delimited_date_swap_no_warning( tm.assert_frame_equal(result, expected) -@xfail_pyarrow +# ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns +@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1887,7 +1888,8 @@ def test_hypothesis_delimited_date( assert result == expected -@xfail_pyarrow # KeyErrors +# ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file +@skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", [ diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 055be81d2996d..ded6b91a26eca 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -95,10 +95,8 @@ def test_usecols_relative_to_names(all_parsers, names, usecols, request): 10,11,12""" parser = all_parsers if parser.engine == "pyarrow" and not isinstance(usecols[0], int): - mark = pytest.mark.xfail( - reason="ArrowKeyError: Column 'fb' in include_columns does not exist" - ) - request.applymarker(mark) + # ArrowKeyError: Column 'fb' in include_columns does not exist + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) @@ -438,10 +436,8 @@ def test_raises_on_usecols_names_mismatch( usecols is not None and expected is not None ): # everything but the first case - mark = pytest.mark.xfail( - reason="e.g. Column 'f' in include_columns does not exist in CSV file" - ) - request.applymarker(mark) + # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") if expected is None: with pytest.raises(ValueError, match=msg): From 7ebda531dc34a2e55311b98919e409f106e2e1fe Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 19:09:28 -0800 Subject: [PATCH 2/2] Use 90 instead --- .github/workflows/unit-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d4de92c987574..e156b1e0aeca2 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -23,7 +23,7 @@ defaults: jobs: ubuntu: runs-on: ubuntu-22.04 - timeout-minutes: 120 + timeout-minutes: 90 strategy: matrix: env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] @@ -177,7 +177,7 @@ jobs: if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} macos-windows: - timeout-minutes: 120 + timeout-minutes: 90 strategy: matrix: os: [macos-latest, windows-latest] @@ -322,7 +322,7 @@ jobs: matrix: os: [ubuntu-22.04, macOS-latest, windows-latest] - timeout-minutes: 120 + timeout-minutes: 90 concurrency: #https://github.community/t/concurrecy-not-work-for-push/183068/7