CI/TST: Skip pyarrow csv tests where parsing fails (#56015)

mroeschke · web-flow · commit 089481f2246c · 2023-11-17T07:20:05.000-08:00
* CI: Skip pyarrow csv tests where parsing fails

* Use a 90-minute CI job timeout instead of 180
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -23,7 +23,7 @@ defaults:
 jobs:
   ubuntu:
     runs-on: ubuntu-22.04
-    timeout-minutes: 180
+    timeout-minutes: 90
     strategy:
       matrix:
         env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml]
@@ -177,7 +177,7 @@ jobs:
       if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}}
 
   macos-windows:
-    timeout-minutes: 180
+    timeout-minutes: 90
     strategy:
       matrix:
         os: [macos-latest, windows-latest]
@@ -322,7 +322,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, macOS-latest, windows-latest]
 
-    timeout-minutes: 180
+    timeout-minutes: 90
 
     concurrency:
       #https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -222,8 +222,10 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
         return
 
     if parser.engine == "pyarrow" and "\r" not in data:
-        mark = pytest.mark.xfail(reason="Mismatched exception type/message")
-        request.applymarker(mark)
+        # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1:
+        # ValueError: skiprows argument must be an integer when using engine='pyarrow'
+        # AssertionError: Regex pattern did not match.
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
 
     if expected is None:
         with pytest.raises(ParserError, match=msg):
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -130,8 +130,8 @@ def _encode_data_with_bom(_data):
         and data == "\n1"
         and kwargs.get("skip_blank_lines", True)
     ):
-        # Manually xfail, since we don't have mechanism to xfail specific version
-        request.applymarker(pytest.mark.xfail(reason="Pyarrow can't read blank lines"))
+        # CSV parse error: Empty CSV file or block: cannot infer number of columns
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
 
     result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py
@@ -411,7 +411,7 @@ def test_header_names_backward_compat(all_parsers, data, header, request):
     parser = all_parsers
 
     if parser.engine == "pyarrow" and header is not None:
-        mark = pytest.mark.xfail(reason="mismatched index")
+        mark = pytest.mark.xfail(reason="DataFrame.columns are different")
         request.applymarker(mark)
 
     expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
@@ -635,7 +635,7 @@ def test_header_none_and_implicit_index(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # regex mismatch "CSV parse error: Expected 2 columns, got "
+@skip_pyarrow  # regex mismatch "CSV parse error: Expected 2 columns, got "
 def test_header_none_and_implicit_index_in_second_row(all_parsers):
     # GH#22144
     parser = all_parsers
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -1753,7 +1753,7 @@ def test_parse_timezone(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow  # pandas.errors.ParserError: CSV parse error
+@skip_pyarrow  # pandas.errors.ParserError: CSV parse error
 @pytest.mark.parametrize(
     "date_string",
     ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
@@ -1787,8 +1787,8 @@ def test_parse_delimited_date_swap_no_warning(
     expected = DataFrame({0: [expected]}, dtype="datetime64[ns]")
     if parser.engine == "pyarrow":
         if not dayfirst:
-            mark = pytest.mark.xfail(reason="CSV parse error: Empty CSV file or block")
-            request.applymarker(mark)
+            # "CSV parse error: Empty CSV file or block"
+            pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
         msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine"
         with pytest.raises(ValueError, match=msg):
             parser.read_csv(
@@ -1802,7 +1802,8 @@ def test_parse_delimited_date_swap_no_warning(
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
+# ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns
+@skip_pyarrow
 @pytest.mark.parametrize(
     "date_string,dayfirst,expected",
     [
@@ -1887,7 +1888,8 @@ def test_hypothesis_delimited_date(
     assert result == expected
 
 
-@xfail_pyarrow  # KeyErrors
+# ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file
+@skip_pyarrow
 @pytest.mark.parametrize(
     "names, usecols, parse_dates, missing_cols",
     [
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -95,10 +95,8 @@ def test_usecols_relative_to_names(all_parsers, names, usecols, request):
 10,11,12"""
     parser = all_parsers
     if parser.engine == "pyarrow" and not isinstance(usecols[0], int):
-        mark = pytest.mark.xfail(
-            reason="ArrowKeyError: Column 'fb' in include_columns does not exist"
-        )
-        request.applymarker(mark)
+        # ArrowKeyError: Column 'fb' in include_columns does not exist
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
 
     result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols)
 
@@ -438,10 +436,8 @@ def test_raises_on_usecols_names_mismatch(
         usecols is not None and expected is not None
     ):
         # everything but the first case
-        mark = pytest.mark.xfail(
-            reason="e.g. Column 'f' in include_columns does not exist in CSV file"
-        )
-        request.applymarker(mark)
+        # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file
+        pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
 
     if expected is None:
         with pytest.raises(ValueError, match=msg):