TST: Check network URL statuses in tests (pandas-dev#45949)

mroeschke · yehoshuadimarsky · commit b9dcd4410f39 · 2022-07-13T10:17:54.000-04:00
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
@@ -3,6 +3,7 @@
 import bz2
 from functools import wraps
 import gzip
+import socket
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -73,7 +74,13 @@ def _get_default_network_errors():
     import http.client
     import urllib.error
 
-    return (OSError, http.client.HTTPException, TimeoutError, urllib.error.URLError)
+    return (
+        OSError,
+        http.client.HTTPException,
+        TimeoutError,
+        urllib.error.URLError,
+        socket.timeout,
+    )
 
 
 def optional_args(decorator):
@@ -264,8 +271,10 @@ def can_connect(url, error_classes=None):
         error_classes = _get_default_network_errors()
 
     try:
-        with urlopen(url):
-            pass
+        with urlopen(url, timeout=20) as response:
+            # The timeout keeps this check from hanging if rate-limiting is applied
+            if response.status != 200:
+                return False
     except error_classes:
         return False
     else:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -764,7 +764,13 @@ def test_corrupt_bytes_raises(self, engine):
             pd.read_excel(bad_stream)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
+            "pandas/tests/io/data/excel/test1.xlsx"
+        ),
+        check_before_test=True,
+    )
     def test_read_from_http_url(self, read_ext):
         url = (
             "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -994,7 +994,10 @@ def test_round_trip_exception_(self, datapath):
         tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5",
+        check_before_test=True,
+    )
     @pytest.mark.parametrize(
         "field,dtype",
         [
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -27,7 +27,13 @@
 
 
 @pytest.mark.network
-@tm.network
+@tm.network(
+    url=(
+        "https://raw.github.com/pandas-dev/pandas/main/"
+        "pandas/tests/io/parser/data/salaries.csv"
+    ),
+    check_before_test=True,
+)
 def test_url(all_parsers, csv_dir_path):
     parser = all_parsers
     kwargs = {"sep": "\t"}
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
@@ -23,7 +23,13 @@
 
 
 @pytest.mark.network
-@tm.network
+@tm.network(
+    url=(
+        "https://github.com/pandas-dev/pandas/raw/main/"
+        "pandas/tests/io/parser/data/salaries.csv"
+    ),
+    check_before_test=True,
+)
 @pytest.mark.parametrize("mode", ["explicit", "infer"])
 @pytest.mark.parametrize("engine", ["python", "c"])
 def test_compressed_urls(salaries_table, mode, engine, compression_only):
@@ -45,7 +51,13 @@ def test_compressed_urls(salaries_table, mode, engine, compression_only):
 
 
 @pytest.mark.network
-@tm.network
+@tm.network(
+    url=(
+        "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
+        "pandas/tests/io/parser/data/unicode_series.csv"
+    ),
+    check_before_test=True,
+)
 def test_url_encoding_csv():
     """
     read_csv should honor the requested encoding for URLs.
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -182,7 +182,13 @@ def test_passthrough_keywords(self):
         self.check_round_trip(df, write_kwargs={"version": 1})
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
+            "pandas/tests/io/data/feather/feather-0_3_1.feather"
+        ),
+        check_before_test=True,
+    )
     def test_http_path(self, feather_file):
         # GH 29055
         url = (
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -133,9 +133,15 @@ def test_to_html_compat(self):
         tm.assert_frame_equal(res, df)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://www.fdic.gov/resources/resolutions/"
+            "bank-failures/failed-bank-list/index.html"
+        ),
+        check_before_test=True,
+    )
     def test_banklist_url_positional_match(self):
-        url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html"  # noqa E501
+        url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html"  # noqa E501
         # Passing match argument as positional should cause a FutureWarning.
         with tm.assert_produces_warning(FutureWarning):
             df1 = self.read_html(
@@ -153,9 +159,15 @@ def test_banklist_url_positional_match(self):
         assert_framelist_equal(df1, df2)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://www.fdic.gov/resources/resolutions/"
+            "bank-failures/failed-bank-list/index.html"
+        ),
+        check_before_test=True,
+    )
     def test_banklist_url(self):
-        url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html"  # noqa E501
+        url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html"  # noqa E501
         df1 = self.read_html(
             # lxml cannot find attrs leave out for now
             url,
@@ -170,7 +182,13 @@ def test_banklist_url(self):
         assert_framelist_equal(df1, df2)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
+            "pandas/tests/io/data/html/spam.html"
+        ),
+        check_before_test=True,
+    )
     def test_spam_url(self):
         url = (
             "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
@@ -406,14 +424,14 @@ def test_negative_skiprows(self, spam_data):
             self.read_html(spam_data, match="Water", skiprows=-1)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(url="https://docs.python.org/2/", check_before_test=True)
     def test_multiple_matches(self):
         url = "https://docs.python.org/2/"
         dfs = self.read_html(url, match="Python")
         assert len(dfs) > 1
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(url="https://docs.python.org/2/", check_before_test=True)
     def test_python_docs_table(self):
         url = "https://docs.python.org/2/"
         dfs = self.read_html(url, match="Python")
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -381,7 +381,13 @@ def check_external_error_on_write(self, df, engine, exc):
                 to_parquet(df, path, engine, compression=None)
 
     @pytest.mark.network
-    @tm.network
+    @tm.network(
+        url=(
+            "https://raw.githubusercontent.com/pandas-dev/pandas/"
+            "main/pandas/tests/io/data/parquet/simple.parquet"
+        ),
+        check_before_test=True,
+    )
     def test_parquet_read_from_url(self, df_compat, engine):
         if engine != "auto":
             pytest.importorskip(engine)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -258,7 +258,13 @@ def test_parser_consistency_file(datapath):
 @pytest.mark.network
 @pytest.mark.slow
 @td.skip_if_no("lxml")
-@tm.network
+@tm.network(
+    url=(
+        "https://data.cityofchicago.org/api/views/"
+        "8pix-ypme/rows.xml?accessType=DOWNLOAD"
+    ),
+    check_before_test=True,
+)
 def test_parser_consistency_url():
     url = (
         "https://data.cityofchicago.org/api/views/"
@@ -404,7 +410,10 @@ def test_wrong_file_path_etree():
 
 
 @pytest.mark.network
-@tm.network
+@tm.network(
+    url="https://www.w3schools.com/xml/books.xml",
+    check_before_test=True,
+)
 @td.skip_if_no("lxml")
 def test_url():
     url = "https://www.w3schools.com/xml/books.xml"
@@ -425,7 +434,7 @@ def test_url():
 
 
 @pytest.mark.network
-@tm.network
+@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True)
 def test_wrong_url(parser):
     with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")):
         url = "https://www.w3schools.com/xml/python.xml"
@@ -1022,7 +1031,9 @@ def test_empty_stylesheet(val):
 
 @pytest.mark.network
 @td.skip_if_no("lxml")
-@tm.network
+@tm.network(
+    url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True
+)
 def test_online_stylesheet():
     xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml"
     xsl = "https://www.w3schools.com/xml/cdcatalog.xsl"