From 3bb72c7dc0af881fbefec789b00ba3274deb26ff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:31:53 -0700 Subject: [PATCH 1/7] Use pytest-localserver instead of making network connections --- ci/deps/actions-310.yaml | 1 + ci/deps/actions-311-downstream_compat.yaml | 1 + ci/deps/actions-311.yaml | 1 + ci/deps/actions-39-minimum_versions.yaml | 1 + ci/deps/actions-39.yaml | 1 + ci/deps/circle-310-arm64.yaml | 1 + environment.yml | 1 + pandas/tests/io/conftest.py | 15 +- pandas/tests/io/excel/test_readers.py | 18 +- pandas/tests/io/json/test_pandas.py | 12 +- .../io/parser/common/test_file_buffer_url.py | 21 +- pandas/tests/io/parser/test_network.py | 44 +-- pandas/tests/io/parser/test_read_fwf.py | 56 +-- pandas/tests/io/test_feather.py | 18 +- pandas/tests/io/test_html.py | 137 ++++--- pandas/tests/io/test_parquet.py | 18 +- pandas/tests/io/test_s3.py | 15 +- pandas/tests/io/xml/test_xml.py | 371 +++++++++++++----- pandas/tests/test_downstream.py | 23 +- requirements-dev.txt | 1 + 20 files changed, 448 insertions(+), 308 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0923594f2c840..ffa7732c604a0 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 51c7a97ad6500..596f3476c9c4e 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -16,6 +16,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 66b8650116854..9d60d734db5b3 100644 --- a/ci/deps/actions-311.yaml +++ 
b/ci/deps/actions-311.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index e1b4fdfb1d897..91961e4af2d1c 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 8ff47dbb9cc95..6ea0d41b947dc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index ca9860fc20742..df4e8e285bd02 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/environment.yml b/environment.yml index 6178fe896760f..8fd97e6fcc0e1 100644 --- a/environment.yml +++ b/environment.yml @@ -17,6 +17,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - coverage # required dependencies diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index c9890032f408a..170e2f61e7d4a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -45,6 +45,11 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") +@pytest.fixture +def xml_file(datapath): + return datapath("io", "data", "xml", "books.xml") + + @pytest.fixture def s3so(worker_id): if is_ci_environment(): @@ -141,7 +146,9 @@ def 
s3_public_bucket(s3_resource): @pytest.fixture -def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file): +def s3_public_bucket_with_data( + s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file +): """ The following datasets are loaded. @@ -158,6 +165,7 @@ def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_ ("tips.csv.bz2", tips_file + ".bz2"), ("items.jsonl", jsonl_file), ("simple_dataset.feather", feather_file), + ("books.xml", xml_file), ] for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: @@ -175,7 +183,9 @@ def s3_private_bucket(s3_resource): @pytest.fixture -def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file): +def s3_private_bucket_with_data( + s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file +): """ The following datasets are loaded. @@ -192,6 +202,7 @@ def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feathe ("tips.csv.bz2", tips_file + ".bz2"), ("items.jsonl", jsonl_file), ("simple_dataset.feather", feather_file), + ("books.xml", xml_file), ] for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 88f55145b599a..f507314928784 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -876,19 +876,11 @@ def test_corrupt_bytes_raises(self, engine): pd.read_excel(bad_stream) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/excel/test1.xlsx" - ), - check_before_test=True, - ) - def test_read_from_http_url(self, read_ext): - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/excel/test1" + read_ext - ) - url_table = pd.read_excel(url) + @pytest.mark.single_cpu + def test_read_from_http_url(self, httpserver, read_ext): + with 
open("test1" + read_ext, "rb") as f: + httpserver.serve_content(content=f.read()) + url_table = pd.read_excel(httpserver.url) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 90c48012ccac9..b6b21f9962876 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -998,10 +998,7 @@ def test_round_trip_exception(self, datapath): tm.assert_frame_equal(res, df) @pytest.mark.network - @tm.network( - url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5", - check_before_test=True, - ) + @pytest.mark.single_cpu @pytest.mark.parametrize( "field,dtype", [ @@ -1010,9 +1007,10 @@ def test_round_trip_exception(self, datapath): ["updated_at", pd.DatetimeTZDtype(tz="UTC")], ], ) - def test_url(self, field, dtype): - url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" - result = read_json(url, convert_dates=True) + def test_url(self, field, dtype, httpserver): + data = '{"created_at": ["2023-06-23T18:21:36Z"], "closed_at": ["2023-06-23T18:21:36"], "updated_at": ["2023-06-23T18:21:36Z"]}\n' # noqa: E501 + httpserver.serve_content(content=data) + result = read_json(httpserver.url, convert_dates=True) assert result[field].dtype == dtype def test_timedelta(self): diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index ba196a532adf6..c0f5c1a203e94 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -28,24 +28,17 @@ @pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) -def test_url(all_parsers, csv_dir_path): +@pytest.mark.single_cpu +def test_url(all_parsers, csv_dir_path, httpserver): parser = 
all_parsers kwargs = {"sep": "\t"} - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) - url_result = parser.read_csv(url, **kwargs) - local_path = os.path.join(csv_dir_path, "salaries.csv") + with open(local_path) as f: + httpserver.serve_content(content=f.read()) + + url_result = parser.read_csv(httpserver.url, **kwargs) + local_result = parser.read_csv(local_path, **kwargs) tm.assert_frame_equal(url_result, local_result) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index d8c58649984fa..184d9338d4c1f 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -22,27 +22,25 @@ @pytest.mark.network -@tm.network( - url=( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) +@pytest.mark.single_cpu @pytest.mark.parametrize("mode", ["explicit", "infer"]) @pytest.mark.parametrize("engine", ["python", "c"]) def test_compressed_urls( - salaries_table, mode, engine, compression_only, compression_to_extension + httpserver, + datapath, + salaries_table, + mode, + engine, + compression_only, + compression_to_extension, ): # test reading compressed urls with various engines and # extension inference - extension = compression_to_extension[compression_only] - base_url = ( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) + compression_to_extension[compression_only] + with open(datapath("io", "parser", "data", "salaries.csv")) as f: + httpserver.serve_content(content=f.read()) - url = base_url + extension + url = httpserver.url if mode != "explicit": compression_only = mode @@ -52,24 +50,16 @@ def test_compressed_urls( @pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ), - 
check_before_test=True, -) -def test_url_encoding_csv(): +@pytest.mark.single_cpu +def test_url_encoding_csv(httpserver, datapath): """ read_csv should honor the requested encoding for URLs. GH 10424 """ - path = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ) - df = read_csv(path, encoding="latin-1", header=None) + with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f: + httpserver.serve_content(content=f.read()) + df = read_csv(httpserver.url, encoding="latin-1", header=None) assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index c682963c462cc..7f622295472e4 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1013,47 +1013,19 @@ def test_invalid_dtype_backend(): @pytest.mark.network -@tm.network( - url="ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt", - check_before_test=True, -) -def test_url_urlopen(): - expected = pd.Index( - [ - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ], - dtype="object", - ) - url = "ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt" - with urlopen(url) as f: - result = read_fwf( - f, - widths=(2, 1, 3, 5, 9, 10, 7, 4, 30, 5, 5, 7), - names=( - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ), - ).columns +@pytest.mark.single_cpu +def test_url_urlopen(httpserver): + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + httpserver.serve_content(content=data) + expected = pd.Index(list("ABCD")) + with 
urlopen(httpserver.url) as f: + result = read_fwf(f).columns tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 7b139dc45624e..9de097fe8c0e6 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -142,21 +142,13 @@ def test_passthrough_keywords(self): self.check_round_trip(df, write_kwargs={"version": 1}) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/feather/feather-0_3_1.feather" - ), - check_before_test=True, - ) - def test_http_path(self, feather_file): + @pytest.mark.single_cpu + def test_http_path(self, feather_file, httpserver): # GH 29055 - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/feather/feather-0_3_1.feather" - ) expected = read_feather(feather_file) - res = read_feather(url) + with open(feather_file, "rb") as f: + httpserver.serve_content(content=f.read()) + res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 5c6c33de5ac5f..dd0ce417e1f15 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -35,7 +35,6 @@ ) from pandas.io.common import file_path_to_url -import pandas.io.html @pytest.fixture( @@ -193,43 +192,30 @@ def test_dtype_backend(self, string_storage, dtype_backend): tm.assert_frame_equal(result, expected) @pytest.mark.network - @tm.network( - url=( - "https://www.fdic.gov/resources/resolutions/" - "bank-failures/failed-bank-list/index.html" - ), - check_before_test=True, - ) - def test_banklist_url(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa: E501 - df1 = self.read_html( + @pytest.mark.single_cpu + def test_banklist_url(self, httpserver, 
banklist_data): + with open(banklist_data) as f: + httpserver.serve_content(content=f.read()) + df1 = self.read_html( + # lxml cannot find attrs leave out for now + httpserver.url, + match="First Federal Bank of Florida", # attrs={"class": "dataTable"} + ) # lxml cannot find attrs leave out for now - url, - match="First Federal Bank of Florida", # attrs={"class": "dataTable"} - ) - # lxml cannot find attrs leave out for now - df2 = self.read_html( - url, - match="Metcalf Bank", - ) # attrs={"class": "dataTable"}) + df2 = self.read_html( + httpserver.url, + match="Metcalf Bank", + ) # attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/html/spam.html" - ), - check_before_test=True, - ) - def test_spam_url(self): - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/html/spam.html" - ) - df1 = self.read_html(url, match=".*Water.*") - df2 = self.read_html(url, match="Unit") + @pytest.mark.single_cpu + def test_spam_url(self, httpserver, spam_data): + with open(spam_data) as f: + httpserver.serve_content(content=f.read()) + df1 = self.read_html(httpserver.url, match=".*Water.*") + df2 = self.read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @@ -366,21 +352,19 @@ def test_file_like(self, spam_data): assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network - def test_bad_url_protocol(self): + @pytest.mark.single_cpu + def test_bad_url_protocol(self, httpserver): + httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): self.read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network - @tm.network - def test_invalid_url(self): - msg = ( - "Name or service not known|Temporary failure in name resolution|" - "No tables found" - ) - 
with pytest.raises((URLError, ValueError), match=msg): - self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") + @pytest.mark.single_cpu + def test_invalid_url(self, httpserver): + httpserver.serve_content("Name or service not known", code=404) + with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): + self.read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow def test_file_url(self, banklist_data): @@ -454,20 +438,69 @@ def test_negative_skiprows(self, spam_data): with pytest.raises(ValueError, match=msg): self.read_html(spam_data, match="Water", skiprows=-1) + @pytest.fixture + def python_docs(self): + return """ + + +
+ + + + + + + + + + + + +
+ +

Indices and tables:

+ + +
+ + + + + + +
+ """ # noqa: E501 + @pytest.mark.network - @tm.network(url="https://docs.python.org/2/", check_before_test=True) - def test_multiple_matches(self): - url = "https://docs.python.org/2/" - dfs = self.read_html(url, match="Python") + @pytest.mark.single_cpu + def test_multiple_matches(self, python_docs, httpserver): + httpserver.serve_content(content=python_docs) + dfs = self.read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network - @tm.network(url="https://docs.python.org/2/", check_before_test=True) - def test_python_docs_table(self): - url = "https://docs.python.org/2/" - dfs = self.read_html(url, match="Python") + @pytest.mark.single_cpu + def test_python_docs_table(self, python_docs, httpserver): + httpserver.serve_content(content=python_docs) + dfs = self.read_html(httpserver.url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] - assert sorted(zz) == sorted(["Repo", "What"]) + assert sorted(zz) == ["Pyth", "What"] def test_empty_tables(self): """ diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f2ff526a58f99..35bf75d3928f8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -376,21 +376,13 @@ def check_external_error_on_write(self, df, engine, exc): to_parquet(df, path, engine, compression=None) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/" - "main/pandas/tests/io/data/parquet/simple.parquet" - ), - check_before_test=True, - ) - def test_parquet_read_from_url(self, df_compat, engine): + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): if engine != "auto": pytest.importorskip(engine) - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/" - "main/pandas/tests/io/data/parquet/simple.parquet" - ) - df = read_parquet(url) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + 
httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url) tm.assert_frame_equal(df, df_compat) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5171ec04b0bcf..5c23000270c99 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -5,7 +5,6 @@ import pandas.util._test_decorators as td from pandas import read_csv -import pandas._testing as tm def test_streaming_s3_objects(): @@ -21,28 +20,24 @@ def test_streaming_s3_objects(): @td.skip_if_no("s3fs") -@pytest.mark.network -@tm.network -def test_read_without_creds_from_pub_bucket(): +def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data): # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt - result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) + result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3) assert len(result) == 3 @td.skip_if_no("s3fs") -@pytest.mark.network -@tm.network -def test_read_with_creds_from_pub_bucket(monkeypatch): +@pytest.mark.single_cpu +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch): # Ensure we can read from a public bucket with credentials # GH 34626 - # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None + f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None ) assert len(df) == 5 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b0e806caecc80..80c63d0e17de1 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,7 +14,6 @@ import numpy as np import pytest -from pandas.compat 
import is_ci_environment from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -297,53 +296,16 @@ def test_parser_consistency_file(xml_books): @pytest.mark.network -@pytest.mark.slow -@tm.network( - url=( - "https://data.cityofchicago.org/api/views/" - "8pix-ypme/rows.xml?accessType=DOWNLOAD" - ), - check_before_test=True, -) -def test_parser_consistency_url(parser): - url = ( - "https://data.cityofchicago.org/api/views/" - "8pix-ypme/rows.xml?accessType=DOWNLOAD" - ) - - with tm.ensure_clean(filename="cta.xml") as path: - (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) +@pytest.mark.single_cpu +def test_parser_consistency_url(parser, httpserver): + httpserver.serve_content(content=xml_default_nmsp) - df_xpath = read_xml(path, parser=parser) - df_iter = read_xml( - path, - parser=parser, - iterparse={ - "row": [ - "_id", - "_uuid", - "_position", - "_address", - "stop_id", - "direction_id", - "stop_name", - "station_name", - "station_descriptive_name", - "map_id", - "ada", - "red", - "blue", - "g", - "brn", - "p", - "pexp", - "y", - "pnk", - "o", - "location", - ] - }, - ) + df_xpath = read_xml(xml_default_nmsp, parser=parser) + df_iter = read_xml( + BytesIO(xml_default_nmsp.encode()), + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) tm.assert_frame_equal(df_xpath, df_iter) @@ -520,14 +482,11 @@ def test_wrong_file_path_etree(): @pytest.mark.network -@tm.network( - url="https://www.w3schools.com/xml/books.xml", - check_before_test=True, -) @td.skip_if_no("lxml") -def test_url(): - url = "https://www.w3schools.com/xml/books.xml" - df_url = read_xml(url, xpath=".//book[count(*)=4]") +def test_url(httpserver, xml_file): + with open(xml_file) as f: + httpserver.serve_content(content=f.read()) + df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]") df_expected = DataFrame( { @@ -536,7 +495,6 @@ def test_url(): "author": ["Giada De Laurentiis", "J K. 
Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], - "cover": [None, None, "paperback"], } ) @@ -544,11 +502,11 @@ def test_url(): @pytest.mark.network -@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True) -def test_wrong_url(parser): - with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): - url = "https://www.w3schools.com/xml/python.xml" - read_xml(url, xpath=".//book[count(*)=4]", parser=parser) +@pytest.mark.single_cpu +def test_wrong_url(parser, httpserver): + httpserver.serve_content("NOT FOUND", code=404) + with pytest.raises(HTTPError, match=("HTTP Error 404: NOT FOUND")): + read_xml(httpserver.url, xpath=".//book[count(*)=4]", parser=parser) # XPATH @@ -1429,17 +1387,18 @@ def test_file_io_iterparse(xml_books, parser, mode): @pytest.mark.network -@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True) -def test_url_path_error(parser): - url = "https://www.w3schools.com/xml/books.xml" - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - url, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) +@pytest.mark.single_cpu +def test_url_path_error(parser, httpserver, xml_file): + with open(xml_file) as f: + httpserver.serve_content(content=f.read()) + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + httpserver.url, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) def test_compression_error(parser, compression_only): @@ -1641,14 +1600,245 @@ def test_empty_data(xml_books, parser): ) -@pytest.mark.network @td.skip_if_no("lxml") -@tm.network( - url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True -) def test_online_stylesheet(): - xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml" - xsl = "https://www.w3schools.com/xml/cdcatalog.xsl" + xml = """\ + + + 
+ Empire Burlesque + Bob Dylan + USA + Columbia + 10.90 + 1985 + + + Hide your heart + Bonnie Tyler + UK + CBS Records + 9.90 + 1988 + + + Greatest Hits + Dolly Parton + USA + RCA + 9.90 + 1982 + + + Still got the blues + Gary Moore + UK + Virgin records + 10.20 + 1990 + + + Eros + Eros Ramazzotti + EU + BMG + 9.90 + 1997 + + + One night only + Bee Gees + UK + Polydor + 10.90 + 1998 + + + Sylvias Mother + Dr.Hook + UK + CBS + 8.10 + 1973 + + + Maggie May + Rod Stewart + UK + Pickwick + 8.50 + 1990 + + + Romanza + Andrea Bocelli + EU + Polydor + 10.80 + 1996 + + + When a man loves a woman + Percy Sledge + USA + Atlantic + 8.70 + 1987 + + + Black angel + Savage Rose + EU + Mega + 10.90 + 1995 + + + 1999 Grammy Nominees + Many + USA + Grammy + 10.20 + 1999 + + + For the good times + Kenny Rogers + UK + Mucik Master + 8.70 + 1995 + + + Big Willie style + Will Smith + USA + Columbia + 9.90 + 1997 + + + Tupelo Honey + Van Morrison + UK + Polydor + 8.20 + 1971 + + + Soulsville + Jorn Hoel + Norway + WEA + 7.90 + 1996 + + + The very best of + Cat Stevens + UK + Island + 8.90 + 1990 + + + Stop + Sam Brown + UK + A and M + 8.90 + 1988 + + + Bridge of Spies + T`Pau + UK + Siren + 7.90 + 1987 + + + Private Dancer + Tina Turner + UK + Capitol + 8.90 + 1983 + + + Midt om natten + Kim Larsen + EU + Medley + 7.80 + 1983 + + + Pavarotti Gala Concert + Luciano Pavarotti + UK + DECCA + 9.90 + 1991 + + + The dock of the bay + Otis Redding + USA + Stax Records + 7.90 + 1968 + + + Picture book + Simply Red + EU + Elektra + 7.20 + 1985 + + + Red + The Communards + UK + London + 7.80 + 1987 + + + Unchain my heart + Joe Cocker + USA + EMI + 8.20 + 1987 + + +""" + xsl = """\ + + + + + +

My CD Collection

+ + + + + + + + + + + +
TitleArtist
+ + +
+
+""" df_xsl = read_xml( xml, @@ -1740,32 +1930,15 @@ def test_unsuported_compression(parser): @pytest.mark.network +@pytest.mark.single_cpu @td.skip_if_no("s3fs") @td.skip_if_no("lxml") -@pytest.mark.skipif( - is_ci_environment(), - reason="2022.1.17: Hanging on the CI min versions build.", -) -@tm.network -def test_s3_parser_consistency(): - # Python Software Foundation (2019 IRS-990 RETURN) - s3 = "s3://irs-form-990/201923199349319487_public.xml" +def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): + s3 = f"s3://{s3_public_bucket_with_data.name}/books.xml" - df_lxml = read_xml( - s3, - xpath=".//irs:Form990PartVIISectionAGrp", - namespaces={"irs": "http://www.irs.gov/efile"}, - parser="lxml", - storage_options={"anon": True}, - ) + df_lxml = read_xml(s3, parser="lxml", storage_options=s3so) - df_etree = read_xml( - s3, - xpath=".//irs:Form990PartVIISectionAGrp", - namespaces={"irs": "http://www.irs.gov/efile"}, - parser="etree", - storage_options={"anon": True}, - ) + df_etree = read_xml(s3, parser="etree", storage_options=s3so) tm.assert_frame_equal(df_lxml, df_etree) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 0f5fdbefd13d2..7354e313e24f4 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -139,14 +139,13 @@ def test_oo_optimized_datetime_index_unpickle(): ) -@pytest.mark.network -@tm.network def test_statsmodels(): statsmodels = import_module("statsmodels") # noqa: F841 - import statsmodels.api as sm import statsmodels.formula.api as smf - df = sm.datasets.get_rdataset("Guerry", "HistData").data + df = DataFrame( + {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)} + ) smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit() @@ -163,11 +162,11 @@ def test_scikit_learn(): clf.predict(digits.data[-1:]) -@pytest.mark.network -@tm.network def test_seaborn(): seaborn = import_module("seaborn") - tips = seaborn.load_dataset("tips") + tips = DataFrame( 
+ {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)} + ) seaborn.stripplot(x="day", y="total_bill", data=tips) @@ -177,16 +176,8 @@ def test_pandas_gbq(): pandas_gbq = import_module("pandas_gbq") # noqa: F841 -@pytest.mark.network -@tm.network -@pytest.mark.xfail( - raises=ValueError, - reason="The Quandl API key must be provided either through the api_key " - "variable or through the environmental variable QUANDL_API_KEY", -) def test_pandas_datareader(): - pandas_datareader = import_module("pandas_datareader") - pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") + pandas_datareader = import_module("pandas_datareader") # noqa: F841 def test_pyarrow(df): diff --git a/requirements-dev.txt b/requirements-dev.txt index 38a2ce7f66aa3..b1d8ce1cf2143 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,6 +10,7 @@ pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 pytest-asyncio>=0.17.0 +pytest-localserver>=0.7.1 coverage python-dateutil numpy From c02deb9742027b9bed346b70f205b36625ba0c20 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:16:09 -0700 Subject: [PATCH 2/7] Fix test, remove network function --- .../development/contributing_codebase.rst | 19 +- pandas/_testing/_io.py | 253 ------------------ pandas/tests/io/parser/test_network.py | 9 +- pandas/tests/io/xml/test_xml.py | 1 + 4 files changed, 13 insertions(+), 269 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 311120fc527d4..17d33bcb306bc 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -613,22 +613,15 @@ Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and -lack of ownership of the server that is 
being connected to. If network connectivity is absolutely required, use the -``tm.network`` decorator. +lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock +the network connection using the ``httpserver`` fixture from the +`pytest-localserver plugin. `_ .. code-block:: python - @tm.network # noqa - def test_network(): - result = package.call_to_internet() - -If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator. - -.. code-block:: python - - @tm.network("https://www.somespecificsite.com", check_before_test=True) - def test_network(): - result = pd.read_html("https://www.somespecificsite.com") + def test_network(httpserver): + httpserver.serve_content(content="content") + result = pd.read_html(httpserver.url) Example ^^^^^^^ diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index d79968a580e40..fa0bc58a132d4 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -1,10 +1,8 @@ from __future__ import annotations import bz2 -from functools import wraps import gzip import io -import socket import tarfile from typing import ( TYPE_CHECKING, @@ -20,8 +18,6 @@ from pandas._testing._random import rands from pandas._testing.contexts import ensure_clean -from pandas.io.common import urlopen - if TYPE_CHECKING: from pandas._typing import ( FilePath, @@ -33,255 +29,6 @@ Series, ) -# skip tests on exceptions with these messages -_network_error_messages = ( - # 'urlopen error timed out', - # 'timeout: timed out', - # 'socket.timeout: timed out', - "timed out", - "Server Hangup", - "HTTP Error 503: Service Unavailable", - "502: Proxy Error", - "HTTP Error 502: internal error", - "HTTP Error 502", - "HTTP Error 503", - "HTTP Error 403", - "HTTP Error 400", - "Temporary failure in name resolution", - "Name or service not known", - "Connection refused", - "certificate verify", -) - -# or this e.errno/e.reason.errno -_network_errno_vals = ( 
- 101, # Network is unreachable - 111, # Connection refused - 110, # Connection timed out - 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out -) - -# Both of the above shouldn't mask real issues such as 404's -# or refused connections (changed DNS). -# But some tests (test_data yahoo) contact incredibly flakey -# servers. - -# and conditionally raise on exception types in _get_default_network_errors - - -def _get_default_network_errors(): - # Lazy import for http.client & urllib.error - # because it imports many things from the stdlib - import http.client - import urllib.error - - return ( - OSError, - http.client.HTTPException, - TimeoutError, - urllib.error.URLError, - socket.timeout, - ) - - -def optional_args(decorator): - """ - allows a decorator to take optional positional and keyword arguments. - Assumes that taking a single, callable, positional argument means that - it is decorating a function, i.e. something like this:: - - @my_decorator - def function(): pass - - Calls decorator with decorator(f, *args, **kwargs) - """ - - @wraps(decorator) - def wrapper(*args, **kwargs): - def dec(f): - return decorator(f, *args, **kwargs) - - is_decorating = not kwargs and len(args) == 1 and callable(args[0]) - if is_decorating: - f = args[0] - args = () - return dec(f) - else: - return dec - - return wrapper - - -# error: Untyped decorator makes function "network" untyped -@optional_args # type: ignore[misc] -def network( - t, - url: str = "https://www.google.com", - raise_on_error: bool = False, - check_before_test: bool = False, - error_classes=None, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, -): - """ - Label a test as requiring network connection and, if an error is - encountered, only raise if it does not find a network connection. 
- - In comparison to ``network``, this assumes an added contract to your test: - you must assert that, under normal conditions, your test will ONLY fail if - it does not have network connectivity. - - You can call this in 3 ways: as a standard decorator, with keyword - arguments, or with a positional argument that is the url to check. - - Parameters - ---------- - t : callable - The test requiring network connectivity. - url : path - The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'https://www.google.com'. - raise_on_error : bool - If True, never catches errors. - check_before_test : bool - If True, checks connectivity before running the test case. - error_classes : tuple or Exception - error classes to ignore. If not in ``error_classes``, raises the error. - defaults to OSError. Be careful about changing the error classes here. - skip_errnos : iterable of int - Any exception that has .errno or .reason.erno set to one - of these values will be skipped with an appropriate - message. - _skip_on_messages: iterable of string - any exception e for which one of the strings is - a substring of str(e) will be skipped with an appropriate - message. Intended to suppress errors where an errno isn't available. - - Notes - ----- - * ``raise_on_error`` supersedes ``check_before_test`` - - Returns - ------- - t : callable - The decorated test ``t``, with checks for connectivity errors. - - Example - ------- - - Tests decorated with @network will fail if it's possible to make a network - connection to another URL (defaults to google.com):: - - >>> from pandas import _testing as tm - >>> @tm.network - ... def test_network(): - ... with pd.io.common.urlopen("rabbit://bonanza.com"): - ... pass - >>> test_network() # doctest: +SKIP - Traceback - ... - URLError: - - You can specify alternative URLs:: - - >>> @tm.network("https://www.yahoo.com") - ... def test_something_with_yahoo(): - ... 
raise OSError("Failure Message") - >>> test_something_with_yahoo() # doctest: +SKIP - Traceback (most recent call last): - ... - OSError: Failure Message - - If you set check_before_test, it will check the url first and not run the - test on failure:: - - >>> @tm.network("failing://url.blaher", check_before_test=True) - ... def test_something(): - ... print("I ran!") - ... raise ValueError("Failure") - >>> test_something() # doctest: +SKIP - Traceback (most recent call last): - ... - - Errors not related to networking will always be raised. - """ - import pytest - - if error_classes is None: - error_classes = _get_default_network_errors() - - t.network = True - - @wraps(t) - def wrapper(*args, **kwargs): - if ( - check_before_test - and not raise_on_error - and not can_connect(url, error_classes) - ): - pytest.skip( - f"May not have network connectivity because cannot connect to {url}" - ) - try: - return t(*args, **kwargs) - except Exception as err: - errno = getattr(err, "errno", None) - if not errno and hasattr(errno, "reason"): - # error: "Exception" has no attribute "reason" - errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined] - - if errno in skip_errnos: - pytest.skip(f"Skipping test due to known errno and error {err}") - - e_str = str(err) - - if any(m.lower() in e_str.lower() for m in _skip_on_messages): - pytest.skip( - f"Skipping test because exception message is known and error {err}" - ) - - if not isinstance(err, error_classes) or raise_on_error: - raise - pytest.skip(f"Skipping test due to lack of connectivity and error {err}") - - return wrapper - - -def can_connect(url, error_classes=None) -> bool: - """ - Try to connect to the given url. 
True if succeeds, False if OSError - raised - - Parameters - ---------- - url : basestring - The URL to try to connect to - - Returns - ------- - connectable : bool - Return True if no OSError (unable to connect) or URLError (bad url) was - raised - """ - if error_classes is None: - error_classes = _get_default_network_errors() - - try: - with urlopen(url, timeout=20) as response: - # Timeout just in case rate-limiting is applied - if ( - response.info().get("Content-type") == "text/html" - and response.status != 200 - ): - return False - except error_classes: - return False - else: - return True - - # ------------------------------------------------------------------ # File-IO diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 184d9338d4c1f..ba0307cf5111e 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -36,11 +36,14 @@ def test_compressed_urls( ): # test reading compressed urls with various engines and # extension inference - compression_to_extension[compression_only] - with open(datapath("io", "parser", "data", "salaries.csv")) as f: + if compression_only == "tar": + pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data") + + extension = compression_to_extension[compression_only] + with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f: httpserver.serve_content(content=f.read()) - url = httpserver.url + url = httpserver.url + "/salaries.csv" + extension if mode != "explicit": compression_only = mode diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 80c63d0e17de1..40609598916a6 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -482,6 +482,7 @@ def test_wrong_file_path_etree(): @pytest.mark.network +@pytest.mark.single_cpu @td.skip_if_no("lxml") def test_url(httpserver, xml_file): with open(xml_file) as f: From c958362fa529c3975c7185e8dd9b67013ed775fa Mon 
Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:39:14 -0700 Subject: [PATCH 3/7] remove network from init --- pandas/_testing/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index de3dd58d3b716..fbbdfa4b8a5bf 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -51,7 +51,6 @@ ) from pandas._testing._io import ( close, - network, round_trip_localpath, round_trip_pathlib, round_trip_pickle, @@ -1150,7 +1149,6 @@ def shares_memory(left, right) -> bool: "makeUIntIndex", "maybe_produces_warning", "NARROW_NP_DTYPES", - "network", "NP_NAT_OBJECTS", "NULL_OBJECTS", "OBJECT_DTYPES", From 693d7170ebfdc957d110f11bce28b1520c45e85d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 21:17:56 -0700 Subject: [PATCH 4/7] Ignore distutils from datareader, s3so --- pandas/tests/io/test_s3.py | 17 ++++++++++++----- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5c23000270c99..35250f1dd3081 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -20,16 +20,20 @@ def test_streaming_s3_objects(): @td.skip_if_no("s3fs") -def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data): +@pytest.mark.single_cpu +def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # GH 34626 - # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt - result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3) + result = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=3, + storage_options=s3so, + ) assert len(result) == 3 @td.skip_if_no("s3fs") @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch): +def
test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 @@ -38,6 +42,9 @@ def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=5, + header=None, + storage_options=s3so, ) assert len(df) == 5 diff --git a/pyproject.toml b/pyproject.toml index 0d1bca886a638..ef257b3143598 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -483,6 +483,7 @@ filterwarnings = [ "ignore:a closed node found in the registry:UserWarning:tables", "ignore:`np.object` is a deprecated:DeprecationWarning:tables", "ignore:tostring:DeprecationWarning:tables", + "ignore:distutils Version classes are deprecated:DeprecationWarning:pandas_datareader", "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", From 08549b4c26bc89ab5b2ecfced6f8614914862d76 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:15:27 -0700 Subject: [PATCH 5/7] specify encoding --- pandas/tests/io/parser/common/test_file_buffer_url.py | 2 +- pandas/tests/io/test_html.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c0f5c1a203e94..5ee629947db48 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -34,7 +34,7 @@ def test_url(all_parsers, 
csv_dir_path, httpserver): kwargs = {"sep": "\t"} local_path = os.path.join(csv_dir_path, "salaries.csv") - with open(local_path) as f: + with open(local_path, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) url_result = parser.read_csv(httpserver.url, **kwargs) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index dd0ce417e1f15..d17e4b08b5a4d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -194,7 +194,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): @pytest.mark.network @pytest.mark.single_cpu def test_banklist_url(self, httpserver, banklist_data): - with open(banklist_data) as f: + with open(banklist_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = self.read_html( # lxml cannot find attrs leave out for now @@ -212,7 +212,7 @@ def test_banklist_url(self, httpserver, banklist_data): @pytest.mark.network @pytest.mark.single_cpu def test_spam_url(self, httpserver, spam_data): - with open(spam_data) as f: + with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = self.read_html(httpserver.url, match=".*Water.*") df2 = self.read_html(httpserver.url, match="Unit") diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 40609598916a6..49fb43bf40006 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1390,7 +1390,7 @@ def test_file_io_iterparse(xml_books, parser, mode): @pytest.mark.network @pytest.mark.single_cpu def test_url_path_error(parser, httpserver, xml_file): - with open(xml_file) as f: + with open(xml_file, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) with pytest.raises( ParserError, match=("iterparse is designed for large XML files") From 6b09451a583a78bcbdbf0b8c98da045869757d7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:46:02 -0700 Subject: 
[PATCH 6/7] Specify encoding --- pandas/tests/io/xml/test_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 49fb43bf40006..a3a1646bc4748 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -485,7 +485,7 @@ def test_wrong_file_path_etree(): @pytest.mark.single_cpu @td.skip_if_no("lxml") def test_url(httpserver, xml_file): - with open(xml_file) as f: + with open(xml_file, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]") From 82acdc1ded645aa674fe2f858ca479e9917700e3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 26 Jun 2023 15:32:24 -0700 Subject: [PATCH 7/7] Clarify contributing doc --- doc/source/development/contributing_codebase.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 17d33bcb306bc..00f9fd74e01ca 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -612,13 +612,14 @@ deleted when the context block is exited. Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and -lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock -the network connection using the ``httpserver`` fixture from the -`pytest-localserver plugin. <https://github.com/pytest-dev/pytest-localserver>`_ +A unit test should not access a public data set over the internet due to flakiness of network connections and +lack of ownership of the server that is being connected to. To mock this interaction, use the ``httpserver`` fixture from the
+`pytest-localserver plugin. <https://github.com/pytest-dev/pytest-localserver>`_ with synthetic data. ..
code-block:: python + @pytest.mark.network + @pytest.mark.single_cpu def test_network(httpserver): httpserver.serve_content(content="content") result = pd.read_html(httpserver.url)