diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index f4654582277df..4c7e669f94734 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -3,6 +3,7 @@ import bz2 from functools import wraps import gzip +import socket from typing import ( TYPE_CHECKING, Any, @@ -73,7 +74,13 @@ def _get_default_network_errors(): import http.client import urllib.error - return (OSError, http.client.HTTPException, TimeoutError, urllib.error.URLError) + return ( + OSError, + http.client.HTTPException, + TimeoutError, + urllib.error.URLError, + socket.timeout, + ) def optional_args(decorator): @@ -264,8 +271,10 @@ def can_connect(url, error_classes=None): error_classes = _get_default_network_errors() try: - with urlopen(url): - pass + with urlopen(url, timeout=20) as response: + # Timeout just in case rate-limiting is applied + if response.status != 200: + return False except error_classes: return False else: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 2ed2d57de2048..1e0f74ea41453 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -764,7 +764,13 @@ def test_corrupt_bytes_raises(self, engine): pd.read_excel(bad_stream) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://raw.githubusercontent.com/pandas-dev/pandas/main/" + "pandas/tests/io/data/excel/test1.xlsx" + ), + check_before_test=True, + ) def test_read_from_http_url(self, read_ext): url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/main/" diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7a546c13e0318..e08b7592c4d82 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -994,7 +994,10 @@ def test_round_trip_exception_(self, datapath): tm.assert_frame_equal(result.reindex(index=df.index, columns=df.columns), df) @pytest.mark.network - @tm.network + @tm.network( + url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5", + check_before_test=True, + ) @pytest.mark.parametrize( "field,dtype", [ diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index b5c3e98a1821d..b9ddec0a37c11 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -27,7 +27,13 @@ @pytest.mark.network -@tm.network +@tm.network( + url=( + "https://raw.github.com/pandas-dev/pandas/main/" + "pandas/tests/io/parser/data/salaries.csv" + ), + check_before_test=True, +) def test_url(all_parsers, csv_dir_path): parser = all_parsers kwargs = {"sep": "\t"} diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index bdcea07108938..93924c9b670c2 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -23,7 +23,13 @@ @pytest.mark.network -@tm.network +@tm.network( + url=( + "https://github.com/pandas-dev/pandas/raw/main/" + "pandas/tests/io/parser/data/salaries.csv" + ), + check_before_test=True, +) @pytest.mark.parametrize("mode", ["explicit", "infer"]) @pytest.mark.parametrize("engine", ["python", "c"]) def test_compressed_urls(salaries_table, mode, engine, compression_only): @@ -45,7 +51,13 @@ def test_compressed_urls(salaries_table, mode, engine, compression_only): @pytest.mark.network -@tm.network +@tm.network( + url=( + "https://raw.githubusercontent.com/pandas-dev/pandas/main/" + "pandas/tests/io/parser/data/unicode_series.csv" + ), + check_before_test=True, +) def test_url_encoding_csv(): """ read_csv should honor the requested encoding for URLs. diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 7c2ce37d8aa70..236a7f9e1a9c1 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -182,7 +182,13 @@ def test_passthrough_keywords(self): self.check_round_trip(df, write_kwargs={"version": 1}) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://raw.githubusercontent.com/pandas-dev/pandas/main/" + "pandas/tests/io/data/feather/feather-0_3_1.feather" + ), + check_before_test=True, + ) def test_http_path(self, feather_file): # GH 29055 url = ( diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index df373cf42590d..59a968bc4719a 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -133,9 +133,15 @@ def test_to_html_compat(self): tm.assert_frame_equal(res, df) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://www.fdic.gov/resources/resolutions/" + "bank-failures/failed-bank-list/index.html" + ), + check_before_test=True, + ) def test_banklist_url_positional_match(self): - url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 # Passing match argument as positional should cause a FutureWarning. with tm.assert_produces_warning(FutureWarning): df1 = self.read_html( @@ -153,9 +159,15 @@ def test_banklist_url_positional_match(self): assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://www.fdic.gov/resources/resolutions/" + "bank-failures/failed-bank-list/index.html" + ), + check_before_test=True, + ) def test_banklist_url(self): - url = "http://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa E501 df1 = self.read_html( # lxml cannot find attrs leave out for now url, @@ -170,7 +182,13 @@ def test_banklist_url(self): assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://raw.githubusercontent.com/pandas-dev/pandas/main/" + "pandas/tests/io/data/html/spam.html" + ), + check_before_test=True, + ) def test_spam_url(self): url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/main/" @@ -406,14 +424,14 @@ def test_negative_skiprows(self, spam_data): self.read_html(spam_data, match="Water", skiprows=-1) @pytest.mark.network - @tm.network + @tm.network(url="https://docs.python.org/2/", check_before_test=True) def test_multiple_matches(self): url = "https://docs.python.org/2/" dfs = self.read_html(url, match="Python") assert len(dfs) > 1 @pytest.mark.network - @tm.network + @tm.network(url="https://docs.python.org/2/", check_before_test=True) def test_python_docs_table(self): url = "https://docs.python.org/2/" dfs = self.read_html(url, match="Python") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 459d5701434d8..85f1cbf8c7707 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -381,7 +381,13 @@ def check_external_error_on_write(self, df, engine, exc): to_parquet(df, path, engine, compression=None) @pytest.mark.network - @tm.network + @tm.network( + url=( + "https://raw.githubusercontent.com/pandas-dev/pandas/" + "main/pandas/tests/io/data/parquet/simple.parquet" + ), + check_before_test=True, + ) def test_parquet_read_from_url(self, df_compat, engine): if engine != "auto": pytest.importorskip(engine) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 379b0898afa27..f0fd500bb443c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -258,7 +258,13 @@ def test_parser_consistency_file(datapath): @pytest.mark.network @pytest.mark.slow @td.skip_if_no("lxml") -@tm.network +@tm.network( + url=( + "https://data.cityofchicago.org/api/views/" + "8pix-ypme/rows.xml?accessType=DOWNLOAD" + ), + check_before_test=True, +) def test_parser_consistency_url(): url = ( "https://data.cityofchicago.org/api/views/" @@ -404,7 +410,10 @@ def test_wrong_file_path_etree(): @pytest.mark.network -@tm.network +@tm.network( + url="https://www.w3schools.com/xml/books.xml", + check_before_test=True, +) @td.skip_if_no("lxml") def test_url(): url = "https://www.w3schools.com/xml/books.xml" @@ -425,7 +434,7 @@ def test_url(): @pytest.mark.network -@tm.network +@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True) def test_wrong_url(parser): with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): url = "https://www.w3schools.com/xml/python.xml" @@ -1022,7 +1031,9 @@ def test_empty_stylesheet(val): @pytest.mark.network @td.skip_if_no("lxml") -@tm.network +@tm.network( + url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True +) def test_online_stylesheet(): xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml" xsl = "https://www.w3schools.com/xml/cdcatalog.xsl"