From 3bb72c7dc0af881fbefec789b00ba3274deb26ff Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 23 Jun 2023 18:31:53 -0700
Subject: [PATCH 1/7] Use pytest-localserver instead of making network
connections
---
ci/deps/actions-310.yaml | 1 +
ci/deps/actions-311-downstream_compat.yaml | 1 +
ci/deps/actions-311.yaml | 1 +
ci/deps/actions-39-minimum_versions.yaml | 1 +
ci/deps/actions-39.yaml | 1 +
ci/deps/circle-310-arm64.yaml | 1 +
environment.yml | 1 +
pandas/tests/io/conftest.py | 15 +-
pandas/tests/io/excel/test_readers.py | 18 +-
pandas/tests/io/json/test_pandas.py | 12 +-
.../io/parser/common/test_file_buffer_url.py | 21 +-
pandas/tests/io/parser/test_network.py | 44 +--
pandas/tests/io/parser/test_read_fwf.py | 56 +--
pandas/tests/io/test_feather.py | 18 +-
pandas/tests/io/test_html.py | 137 ++++---
pandas/tests/io/test_parquet.py | 18 +-
pandas/tests/io/test_s3.py | 15 +-
pandas/tests/io/xml/test_xml.py | 371 +++++++++++++-----
pandas/tests/test_downstream.py | 23 +-
requirements-dev.txt | 1 +
20 files changed, 448 insertions(+), 308 deletions(-)
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 0923594f2c840..ffa7732c604a0 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 51c7a97ad6500..596f3476c9c4e 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -16,6 +16,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index 66b8650116854..9d60d734db5b3 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml
index e1b4fdfb1d897..91961e4af2d1c 100644
--- a/ci/deps/actions-39-minimum_versions.yaml
+++ b/ci/deps/actions-39-minimum_versions.yaml
@@ -17,6 +17,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 8ff47dbb9cc95..6ea0d41b947dc 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml
index ca9860fc20742..df4e8e285bd02 100644
--- a/ci/deps/circle-310-arm64.yaml
+++ b/ci/deps/circle-310-arm64.yaml
@@ -15,6 +15,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- boto3
# required dependencies
diff --git a/environment.yml b/environment.yml
index 6178fe896760f..8fd97e6fcc0e1 100644
--- a/environment.yml
+++ b/environment.yml
@@ -17,6 +17,7 @@ dependencies:
- pytest-cov
- pytest-xdist>=2.2.0
- pytest-asyncio>=0.17.0
+ - pytest-localserver>=0.7.1
- coverage
# required dependencies
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index c9890032f408a..170e2f61e7d4a 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -45,6 +45,11 @@ def feather_file(datapath):
return datapath("io", "data", "feather", "feather-0_3_1.feather")
+@pytest.fixture
+def xml_file(datapath):
+ return datapath("io", "data", "xml", "books.xml")
+
+
@pytest.fixture
def s3so(worker_id):
if is_ci_environment():
@@ -141,7 +146,9 @@ def s3_public_bucket(s3_resource):
@pytest.fixture
-def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file):
+def s3_public_bucket_with_data(
+ s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file
+):
"""
The following datasets
are loaded.
@@ -158,6 +165,7 @@ def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_
("tips.csv.bz2", tips_file + ".bz2"),
("items.jsonl", jsonl_file),
("simple_dataset.feather", feather_file),
+ ("books.xml", xml_file),
]
for s3_key, file_name in test_s3_files:
with open(file_name, "rb") as f:
@@ -175,7 +183,9 @@ def s3_private_bucket(s3_resource):
@pytest.fixture
-def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file):
+def s3_private_bucket_with_data(
+ s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file
+):
"""
The following datasets
are loaded.
@@ -192,6 +202,7 @@ def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feathe
("tips.csv.bz2", tips_file + ".bz2"),
("items.jsonl", jsonl_file),
("simple_dataset.feather", feather_file),
+ ("books.xml", xml_file),
]
for s3_key, file_name in test_s3_files:
with open(file_name, "rb") as f:
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 88f55145b599a..f507314928784 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -876,19 +876,11 @@ def test_corrupt_bytes_raises(self, engine):
pd.read_excel(bad_stream)
@pytest.mark.network
- @tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/excel/test1.xlsx"
- ),
- check_before_test=True,
- )
- def test_read_from_http_url(self, read_ext):
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/excel/test1" + read_ext
- )
- url_table = pd.read_excel(url)
+ @pytest.mark.single_cpu
+ def test_read_from_http_url(self, httpserver, read_ext):
+ with open("test1" + read_ext, "rb") as f:
+ httpserver.serve_content(content=f.read())
+ url_table = pd.read_excel(httpserver.url)
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 90c48012ccac9..b6b21f9962876 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -998,10 +998,7 @@ def test_round_trip_exception(self, datapath):
tm.assert_frame_equal(res, df)
@pytest.mark.network
- @tm.network(
- url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5",
- check_before_test=True,
- )
+ @pytest.mark.single_cpu
@pytest.mark.parametrize(
"field,dtype",
[
@@ -1010,9 +1007,10 @@ def test_round_trip_exception(self, datapath):
["updated_at", pd.DatetimeTZDtype(tz="UTC")],
],
)
- def test_url(self, field, dtype):
- url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5"
- result = read_json(url, convert_dates=True)
+ def test_url(self, field, dtype, httpserver):
+ data = '{"created_at": ["2023-06-23T18:21:36Z"], "closed_at": ["2023-06-23T18:21:36"], "updated_at": ["2023-06-23T18:21:36Z"]}\n' # noqa: E501
+ httpserver.serve_content(content=data)
+ result = read_json(httpserver.url, convert_dates=True)
assert result[field].dtype == dtype
def test_timedelta(self):
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index ba196a532adf6..c0f5c1a203e94 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -28,24 +28,17 @@
@pytest.mark.network
-@tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/parser/data/salaries.csv"
- ),
- check_before_test=True,
-)
-def test_url(all_parsers, csv_dir_path):
+@pytest.mark.single_cpu
+def test_url(all_parsers, csv_dir_path, httpserver):
parser = all_parsers
kwargs = {"sep": "\t"}
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/parser/data/salaries.csv"
- )
- url_result = parser.read_csv(url, **kwargs)
-
local_path = os.path.join(csv_dir_path, "salaries.csv")
+ with open(local_path) as f:
+ httpserver.serve_content(content=f.read())
+
+ url_result = parser.read_csv(httpserver.url, **kwargs)
+
local_result = parser.read_csv(local_path, **kwargs)
tm.assert_frame_equal(url_result, local_result)
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index d8c58649984fa..184d9338d4c1f 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -22,27 +22,25 @@
@pytest.mark.network
-@tm.network(
- url=(
-        "https://github.com/pandas-dev/pandas/raw/main/"
- "pandas/tests/io/parser/data/salaries.csv"
- ),
- check_before_test=True,
-)
+@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
- salaries_table, mode, engine, compression_only, compression_to_extension
+ httpserver,
+ datapath,
+ salaries_table,
+ mode,
+ engine,
+ compression_only,
+ compression_to_extension,
):
# test reading compressed urls with various engines and
# extension inference
- extension = compression_to_extension[compression_only]
- base_url = (
-        "https://github.com/pandas-dev/pandas/raw/main/"
- "pandas/tests/io/parser/data/salaries.csv"
- )
+ compression_to_extension[compression_only]
+ with open(datapath("io", "parser", "data", "salaries.csv")) as f:
+ httpserver.serve_content(content=f.read())
- url = base_url + extension
+ url = httpserver.url
if mode != "explicit":
compression_only = mode
@@ -52,24 +50,16 @@ def test_compressed_urls(
@pytest.mark.network
-@tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/parser/data/unicode_series.csv"
- ),
- check_before_test=True,
-)
-def test_url_encoding_csv():
+@pytest.mark.single_cpu
+def test_url_encoding_csv(httpserver, datapath):
"""
read_csv should honor the requested encoding for URLs.
GH 10424
"""
- path = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/parser/data/unicode_series.csv"
- )
- df = read_csv(path, encoding="latin-1", header=None)
+ with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f:
+ httpserver.serve_content(content=f.read())
+ df = read_csv(httpserver.url, encoding="latin-1", header=None)
assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index c682963c462cc..7f622295472e4 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -1013,47 +1013,19 @@ def test_invalid_dtype_backend():
@pytest.mark.network
-@tm.network(
- url="ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt",
- check_before_test=True,
-)
-def test_url_urlopen():
- expected = pd.Index(
- [
- "CC",
- "Network",
- "Code",
- "StationId",
- "Latitude",
- "Longitude",
- "Elev",
- "dummy",
- "StationName",
- "From",
- "To",
- "Nrec",
- ],
- dtype="object",
- )
- url = "ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt"
- with urlopen(url) as f:
- result = read_fwf(
- f,
- widths=(2, 1, 3, 5, 9, 10, 7, 4, 30, 5, 5, 7),
- names=(
- "CC",
- "Network",
- "Code",
- "StationId",
- "Latitude",
- "Longitude",
- "Elev",
- "dummy",
- "StationName",
- "From",
- "To",
- "Nrec",
- ),
- ).columns
+@pytest.mark.single_cpu
+def test_url_urlopen(httpserver):
+ data = """\
+A B C D
+201158 360.242940 149.910199 11950.7
+201159 444.953632 166.985655 11788.4
+201160 364.136849 183.628767 11806.2
+201161 413.836124 184.375703 11916.8
+201162 502.953953 173.237159 12468.3
+"""
+ httpserver.serve_content(content=data)
+ expected = pd.Index(list("ABCD"))
+ with urlopen(httpserver.url) as f:
+ result = read_fwf(f).columns
tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 7b139dc45624e..9de097fe8c0e6 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -142,21 +142,13 @@ def test_passthrough_keywords(self):
self.check_round_trip(df, write_kwargs={"version": 1})
@pytest.mark.network
- @tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/feather/feather-0_3_1.feather"
- ),
- check_before_test=True,
- )
- def test_http_path(self, feather_file):
+ @pytest.mark.single_cpu
+ def test_http_path(self, feather_file, httpserver):
# GH 29055
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/feather/feather-0_3_1.feather"
- )
expected = read_feather(feather_file)
- res = read_feather(url)
+ with open(feather_file, "rb") as f:
+ httpserver.serve_content(content=f.read())
+ res = read_feather(httpserver.url)
tm.assert_frame_equal(expected, res)
def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 5c6c33de5ac5f..dd0ce417e1f15 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -35,7 +35,6 @@
)
from pandas.io.common import file_path_to_url
-import pandas.io.html
@pytest.fixture(
@@ -193,43 +192,30 @@ def test_dtype_backend(self, string_storage, dtype_backend):
tm.assert_frame_equal(result, expected)
@pytest.mark.network
- @tm.network(
- url=(
- "https://www.fdic.gov/resources/resolutions/"
- "bank-failures/failed-bank-list/index.html"
- ),
- check_before_test=True,
- )
- def test_banklist_url(self):
- url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa: E501
- df1 = self.read_html(
+ @pytest.mark.single_cpu
+ def test_banklist_url(self, httpserver, banklist_data):
+ with open(banklist_data) as f:
+ httpserver.serve_content(content=f.read())
+ df1 = self.read_html(
+ # lxml cannot find attrs leave out for now
+ httpserver.url,
+ match="First Federal Bank of Florida", # attrs={"class": "dataTable"}
+ )
# lxml cannot find attrs leave out for now
- url,
- match="First Federal Bank of Florida", # attrs={"class": "dataTable"}
- )
- # lxml cannot find attrs leave out for now
- df2 = self.read_html(
- url,
- match="Metcalf Bank",
- ) # attrs={"class": "dataTable"})
+ df2 = self.read_html(
+ httpserver.url,
+ match="Metcalf Bank",
+ ) # attrs={"class": "dataTable"})
assert_framelist_equal(df1, df2)
@pytest.mark.network
- @tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/html/spam.html"
- ),
- check_before_test=True,
- )
- def test_spam_url(self):
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/html/spam.html"
- )
- df1 = self.read_html(url, match=".*Water.*")
- df2 = self.read_html(url, match="Unit")
+ @pytest.mark.single_cpu
+ def test_spam_url(self, httpserver, spam_data):
+ with open(spam_data) as f:
+ httpserver.serve_content(content=f.read())
+ df1 = self.read_html(httpserver.url, match=".*Water.*")
+ df2 = self.read_html(httpserver.url, match="Unit")
assert_framelist_equal(df1, df2)
@@ -366,21 +352,19 @@ def test_file_like(self, spam_data):
assert_framelist_equal(df1, df2)
@pytest.mark.network
- @tm.network
- def test_bad_url_protocol(self):
+ @pytest.mark.single_cpu
+ def test_bad_url_protocol(self, httpserver):
+ httpserver.serve_content("urlopen error unknown url type: git", code=404)
with pytest.raises(URLError, match="urlopen error unknown url type: git"):
             self.read_html("git://github.com", match=".*Water.*")
@pytest.mark.slow
@pytest.mark.network
- @tm.network
- def test_invalid_url(self):
- msg = (
- "Name or service not known|Temporary failure in name resolution|"
- "No tables found"
- )
- with pytest.raises((URLError, ValueError), match=msg):
- self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*")
+ @pytest.mark.single_cpu
+ def test_invalid_url(self, httpserver):
+ httpserver.serve_content("Name or service not known", code=404)
+ with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"):
+ self.read_html(httpserver.url, match=".*Water.*")
@pytest.mark.slow
def test_file_url(self, banklist_data):
@@ -454,20 +438,69 @@ def test_negative_skiprows(self, spam_data):
with pytest.raises(ValueError, match=msg):
self.read_html(spam_data, match="Water", skiprows=-1)
+ @pytest.fixture
+ def python_docs(self):
+ return """
+
+
+ What's new in Python 2.7?
+ or all "What's new" documents since 2.0
+ Tutorial
+ start here
+ Library Reference
+ keep this under your pillow
+ Language Reference
+ describes syntax and language elements
+ Python Setup and Usage
+ how to use Python on different platforms
+ Python HOWTOs
+ in-depth documents on specific topics
+ |
+ Installing Python Modules
+ installing from the Python Package Index & other sources
+ Distributing Python Modules
+ publishing modules for installation by others
+ Extending and Embedding
+ tutorial for C/C++ programmers
+ Python/C API
+ reference for C/C++ programmers
+ FAQs
+ frequently asked questions (with answers!)
+ |
+
+
+ Indices and tables:
+
+
+ Python Global Module Index
+ quick access to all modules
+ General Index
+ all functions, classes, terms
+ Glossary
+ the most important terms explained
+ |
+ Search page
+ search this documentation
+ Complete Table of Contents
+ lists all sections and subsections
+ |
+
+ """ # noqa: E501
+
@pytest.mark.network
- @tm.network(url="https://docs.python.org/2/", check_before_test=True)
- def test_multiple_matches(self):
- url = "https://docs.python.org/2/"
- dfs = self.read_html(url, match="Python")
+ @pytest.mark.single_cpu
+ def test_multiple_matches(self, python_docs, httpserver):
+ httpserver.serve_content(content=python_docs)
+ dfs = self.read_html(httpserver.url, match="Python")
assert len(dfs) > 1
@pytest.mark.network
- @tm.network(url="https://docs.python.org/2/", check_before_test=True)
- def test_python_docs_table(self):
- url = "https://docs.python.org/2/"
- dfs = self.read_html(url, match="Python")
+ @pytest.mark.single_cpu
+ def test_python_docs_table(self, python_docs, httpserver):
+ httpserver.serve_content(content=python_docs)
+ dfs = self.read_html(httpserver.url, match="Python")
zz = [df.iloc[0, 0][0:4] for df in dfs]
- assert sorted(zz) == sorted(["Repo", "What"])
+ assert sorted(zz) == ["Pyth", "What"]
def test_empty_tables(self):
"""
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index f2ff526a58f99..35bf75d3928f8 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -376,21 +376,13 @@ def check_external_error_on_write(self, df, engine, exc):
to_parquet(df, path, engine, compression=None)
@pytest.mark.network
- @tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/"
- "main/pandas/tests/io/data/parquet/simple.parquet"
- ),
- check_before_test=True,
- )
- def test_parquet_read_from_url(self, df_compat, engine):
+ @pytest.mark.single_cpu
+ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
if engine != "auto":
pytest.importorskip(engine)
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/"
- "main/pandas/tests/io/data/parquet/simple.parquet"
- )
- df = read_parquet(url)
+ with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
+ httpserver.serve_content(content=f.read())
+ df = read_parquet(httpserver.url)
tm.assert_frame_equal(df, df_compat)
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
index 5171ec04b0bcf..5c23000270c99 100644
--- a/pandas/tests/io/test_s3.py
+++ b/pandas/tests/io/test_s3.py
@@ -5,7 +5,6 @@
import pandas.util._test_decorators as td
from pandas import read_csv
-import pandas._testing as tm
def test_streaming_s3_objects():
@@ -21,28 +20,24 @@ def test_streaming_s3_objects():
@td.skip_if_no("s3fs")
-@pytest.mark.network
-@tm.network
-def test_read_without_creds_from_pub_bucket():
+def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data):
# GH 34626
# Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
- result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3)
+ result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3)
assert len(result) == 3
@td.skip_if_no("s3fs")
-@pytest.mark.network
-@tm.network
-def test_read_with_creds_from_pub_bucket(monkeypatch):
+@pytest.mark.single_cpu
+def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch):
# Ensure we can read from a public bucket with credentials
# GH 34626
- # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
# temporary workaround as moto fails for botocore >= 1.11 otherwise,
# see https://github.com/spulec/moto/issues/1924 & 1952
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
df = read_csv(
- "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None
+ f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None
)
assert len(df) == 5
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index b0e806caecc80..80c63d0e17de1 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -14,7 +14,6 @@
import numpy as np
import pytest
-from pandas.compat import is_ci_environment
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
EmptyDataError,
@@ -297,53 +296,16 @@ def test_parser_consistency_file(xml_books):
@pytest.mark.network
-@pytest.mark.slow
-@tm.network(
- url=(
- "https://data.cityofchicago.org/api/views/"
- "8pix-ypme/rows.xml?accessType=DOWNLOAD"
- ),
- check_before_test=True,
-)
-def test_parser_consistency_url(parser):
- url = (
- "https://data.cityofchicago.org/api/views/"
- "8pix-ypme/rows.xml?accessType=DOWNLOAD"
- )
-
- with tm.ensure_clean(filename="cta.xml") as path:
- (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False))
+@pytest.mark.single_cpu
+def test_parser_consistency_url(parser, httpserver):
+ httpserver.serve_content(content=xml_default_nmsp)
- df_xpath = read_xml(path, parser=parser)
- df_iter = read_xml(
- path,
- parser=parser,
- iterparse={
- "row": [
- "_id",
- "_uuid",
- "_position",
- "_address",
- "stop_id",
- "direction_id",
- "stop_name",
- "station_name",
- "station_descriptive_name",
- "map_id",
- "ada",
- "red",
- "blue",
- "g",
- "brn",
- "p",
- "pexp",
- "y",
- "pnk",
- "o",
- "location",
- ]
- },
- )
+ df_xpath = read_xml(xml_default_nmsp, parser=parser)
+ df_iter = read_xml(
+ BytesIO(xml_default_nmsp.encode()),
+ parser=parser,
+ iterparse={"row": ["shape", "degrees", "sides"]},
+ )
tm.assert_frame_equal(df_xpath, df_iter)
@@ -520,14 +482,11 @@ def test_wrong_file_path_etree():
@pytest.mark.network
-@tm.network(
- url="https://www.w3schools.com/xml/books.xml",
- check_before_test=True,
-)
@td.skip_if_no("lxml")
-def test_url():
- url = "https://www.w3schools.com/xml/books.xml"
- df_url = read_xml(url, xpath=".//book[count(*)=4]")
+def test_url(httpserver, xml_file):
+ with open(xml_file) as f:
+ httpserver.serve_content(content=f.read())
+ df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]")
df_expected = DataFrame(
{
@@ -536,7 +495,6 @@ def test_url():
"author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
"year": [2005, 2005, 2003],
"price": [30.00, 29.99, 39.95],
- "cover": [None, None, "paperback"],
}
)
@@ -544,11 +502,11 @@ def test_url():
@pytest.mark.network
-@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True)
-def test_wrong_url(parser):
- with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")):
- url = "https://www.w3schools.com/xml/python.xml"
- read_xml(url, xpath=".//book[count(*)=4]", parser=parser)
+@pytest.mark.single_cpu
+def test_wrong_url(parser, httpserver):
+ httpserver.serve_content("NOT FOUND", code=404)
+ with pytest.raises(HTTPError, match=("HTTP Error 404: NOT FOUND")):
+ read_xml(httpserver.url, xpath=".//book[count(*)=4]", parser=parser)
# XPATH
@@ -1429,17 +1387,18 @@ def test_file_io_iterparse(xml_books, parser, mode):
@pytest.mark.network
-@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)
-def test_url_path_error(parser):
- url = "https://www.w3schools.com/xml/books.xml"
- with pytest.raises(
- ParserError, match=("iterparse is designed for large XML files")
- ):
- read_xml(
- url,
- parser=parser,
- iterparse={"row": ["shape", "degrees", "sides", "date"]},
- )
+@pytest.mark.single_cpu
+def test_url_path_error(parser, httpserver, xml_file):
+ with open(xml_file) as f:
+ httpserver.serve_content(content=f.read())
+ with pytest.raises(
+ ParserError, match=("iterparse is designed for large XML files")
+ ):
+ read_xml(
+ httpserver.url,
+ parser=parser,
+ iterparse={"row": ["shape", "degrees", "sides", "date"]},
+ )
def test_compression_error(parser, compression_only):
@@ -1641,14 +1600,245 @@ def test_empty_data(xml_books, parser):
)
-@pytest.mark.network
@td.skip_if_no("lxml")
-@tm.network(
- url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True
-)
def test_online_stylesheet():
- xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml"
- xsl = "https://www.w3schools.com/xml/cdcatalog.xsl"
+ xml = """\
+<?xml version="1.0" encoding="UTF-8"?>
+<catalog>
+  <cd>
+    <title>Empire Burlesque</title>
+    <artist>Bob Dylan</artist>
+    <country>USA</country>
+    <company>Columbia</company>
+    <price>10.90</price>
+    <year>1985</year>
+  </cd>
+  <cd>
+    <title>Hide your heart</title>
+    <artist>Bonnie Tyler</artist>
+    <country>UK</country>
+    <company>CBS Records</company>
+    <price>9.90</price>
+    <year>1988</year>
+  </cd>
+  <cd>
+    <title>Greatest Hits</title>
+    <artist>Dolly Parton</artist>
+    <country>USA</country>
+    <company>RCA</company>
+    <price>9.90</price>
+    <year>1982</year>
+  </cd>
+  <cd>
+    <title>Still got the blues</title>
+    <artist>Gary Moore</artist>
+    <country>UK</country>
+    <company>Virgin records</company>
+    <price>10.20</price>
+    <year>1990</year>
+  </cd>
+  <cd>
+    <title>Eros</title>
+    <artist>Eros Ramazzotti</artist>
+    <country>EU</country>
+    <company>BMG</company>
+    <price>9.90</price>
+    <year>1997</year>
+  </cd>
+  <cd>
+    <title>One night only</title>
+    <artist>Bee Gees</artist>
+    <country>UK</country>
+    <company>Polydor</company>
+    <price>10.90</price>
+    <year>1998</year>
+  </cd>
+  <cd>
+    <title>Sylvias Mother</title>
+    <artist>Dr.Hook</artist>
+    <country>UK</country>
+    <company>CBS</company>
+    <price>8.10</price>
+    <year>1973</year>
+  </cd>
+  <cd>
+    <title>Maggie May</title>
+    <artist>Rod Stewart</artist>
+    <country>UK</country>
+    <company>Pickwick</company>
+    <price>8.50</price>
+    <year>1990</year>
+  </cd>
+  <cd>
+    <title>Romanza</title>
+    <artist>Andrea Bocelli</artist>
+    <country>EU</country>
+    <company>Polydor</company>
+    <price>10.80</price>
+    <year>1996</year>
+  </cd>
+  <cd>
+    <title>When a man loves a woman</title>
+    <artist>Percy Sledge</artist>
+    <country>USA</country>
+    <company>Atlantic</company>
+    <price>8.70</price>
+    <year>1987</year>
+  </cd>
+  <cd>
+    <title>Black angel</title>
+    <artist>Savage Rose</artist>
+    <country>EU</country>
+    <company>Mega</company>
+    <price>10.90</price>
+    <year>1995</year>
+  </cd>
+  <cd>
+    <title>1999 Grammy Nominees</title>
+    <artist>Many</artist>
+    <country>USA</country>
+    <company>Grammy</company>
+    <price>10.20</price>
+    <year>1999</year>
+  </cd>
+  <cd>
+    <title>For the good times</title>
+    <artist>Kenny Rogers</artist>
+    <country>UK</country>
+    <company>Mucik Master</company>
+    <price>8.70</price>
+    <year>1995</year>
+  </cd>
+  <cd>
+    <title>Big Willie style</title>
+    <artist>Will Smith</artist>
+    <country>USA</country>
+    <company>Columbia</company>
+    <price>9.90</price>
+    <year>1997</year>
+  </cd>
+  <cd>
+    <title>Tupelo Honey</title>
+    <artist>Van Morrison</artist>
+    <country>UK</country>
+    <company>Polydor</company>
+    <price>8.20</price>
+    <year>1971</year>
+  </cd>
+  <cd>
+    <title>Soulsville</title>
+    <artist>Jorn Hoel</artist>
+    <country>Norway</country>
+    <company>WEA</company>
+    <price>7.90</price>
+    <year>1996</year>
+  </cd>
+  <cd>
+    <title>The very best of</title>
+    <artist>Cat Stevens</artist>
+    <country>UK</country>
+    <company>Island</company>
+    <price>8.90</price>
+    <year>1990</year>
+  </cd>
+  <cd>
+    <title>Stop</title>
+    <artist>Sam Brown</artist>
+    <country>UK</country>
+    <company>A and M</company>
+    <price>8.90</price>
+    <year>1988</year>
+  </cd>
+  <cd>
+    <title>Bridge of Spies</title>
+    <artist>T`Pau</artist>
+    <country>UK</country>
+    <company>Siren</company>
+    <price>7.90</price>
+    <year>1987</year>
+  </cd>
+  <cd>
+    <title>Private Dancer</title>
+    <artist>Tina Turner</artist>
+    <country>UK</country>
+    <company>Capitol</company>
+    <price>8.90</price>
+    <year>1983</year>
+  </cd>
+  <cd>
+    <title>Midt om natten</title>
+    <artist>Kim Larsen</artist>
+    <country>EU</country>
+    <company>Medley</company>
+    <price>7.80</price>
+    <year>1983</year>
+  </cd>
+  <cd>
+    <title>Pavarotti Gala Concert</title>
+    <artist>Luciano Pavarotti</artist>
+    <country>UK</country>
+    <company>DECCA</company>
+    <price>9.90</price>
+    <year>1991</year>
+  </cd>
+  <cd>
+    <title>The dock of the bay</title>
+    <artist>Otis Redding</artist>
+    <country>USA</country>
+    <company>Stax Records</company>
+    <price>7.90</price>
+    <year>1968</year>
+  </cd>
+  <cd>
+    <title>Picture book</title>
+    <artist>Simply Red</artist>
+    <country>EU</country>
+    <company>Elektra</company>
+    <price>7.20</price>
+    <year>1985</year>
+  </cd>
+  <cd>
+    <title>Red</title>
+    <artist>The Communards</artist>
+    <country>UK</country>
+    <company>London</company>
+    <price>7.80</price>
+    <year>1987</year>
+  </cd>
+  <cd>
+    <title>Unchain my heart</title>
+    <artist>Joe Cocker</artist>
+    <country>USA</country>
+    <company>EMI</company>
+    <price>8.20</price>
+    <year>1987</year>
+  </cd>
+</catalog>
+"""
+ xsl = """\
+<?xml version="1.0" encoding="UTF-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+<xsl:template match="/">
+<html>
+<body>
+  <h2>My CD Collection</h2>
+  <table border="1">
+    <tr bgcolor="#9acd32">
+      <th>Title</th>
+      <th>Artist</th>
+    </tr>
+    <xsl:for-each select="catalog/cd">
+    <tr>
+      <td><xsl:value-of select="title"/></td>
+      <td><xsl:value-of select="artist"/></td>
+    </tr>
+    </xsl:for-each>
+  </table>
+</body>
+</html>
+</xsl:template>
+</xsl:stylesheet>
+"""
df_xsl = read_xml(
xml,
@@ -1740,32 +1930,15 @@ def test_unsuported_compression(parser):
@pytest.mark.network
+@pytest.mark.single_cpu
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
-@pytest.mark.skipif(
- is_ci_environment(),
- reason="2022.1.17: Hanging on the CI min versions build.",
-)
-@tm.network
-def test_s3_parser_consistency():
- # Python Software Foundation (2019 IRS-990 RETURN)
- s3 = "s3://irs-form-990/201923199349319487_public.xml"
+def test_s3_parser_consistency(s3_public_bucket_with_data, s3so):
+ s3 = f"s3://{s3_public_bucket_with_data.name}/books.xml"
- df_lxml = read_xml(
- s3,
- xpath=".//irs:Form990PartVIISectionAGrp",
- namespaces={"irs": "http://www.irs.gov/efile"},
- parser="lxml",
- storage_options={"anon": True},
- )
+ df_lxml = read_xml(s3, parser="lxml", storage_options=s3so)
- df_etree = read_xml(
- s3,
- xpath=".//irs:Form990PartVIISectionAGrp",
- namespaces={"irs": "http://www.irs.gov/efile"},
- parser="etree",
- storage_options={"anon": True},
- )
+ df_etree = read_xml(s3, parser="etree", storage_options=s3so)
tm.assert_frame_equal(df_lxml, df_etree)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 0f5fdbefd13d2..7354e313e24f4 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -139,14 +139,13 @@ def test_oo_optimized_datetime_index_unpickle():
)
-@pytest.mark.network
-@tm.network
def test_statsmodels():
statsmodels = import_module("statsmodels") # noqa: F841
- import statsmodels.api as sm
import statsmodels.formula.api as smf
- df = sm.datasets.get_rdataset("Guerry", "HistData").data
+ df = DataFrame(
+ {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)}
+ )
smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()
@@ -163,11 +162,11 @@ def test_scikit_learn():
clf.predict(digits.data[-1:])
-@pytest.mark.network
-@tm.network
def test_seaborn():
seaborn = import_module("seaborn")
- tips = seaborn.load_dataset("tips")
+ tips = DataFrame(
+ {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)}
+ )
seaborn.stripplot(x="day", y="total_bill", data=tips)
@@ -177,16 +176,8 @@ def test_pandas_gbq():
pandas_gbq = import_module("pandas_gbq") # noqa: F841
-@pytest.mark.network
-@tm.network
-@pytest.mark.xfail(
- raises=ValueError,
- reason="The Quandl API key must be provided either through the api_key "
- "variable or through the environmental variable QUANDL_API_KEY",
-)
def test_pandas_datareader():
- pandas_datareader = import_module("pandas_datareader")
- pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01")
+ pandas_datareader = import_module("pandas_datareader") # noqa: F841
def test_pyarrow(df):
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 38a2ce7f66aa3..b1d8ce1cf2143 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -10,6 +10,7 @@ pytest>=7.3.2
pytest-cov
pytest-xdist>=2.2.0
pytest-asyncio>=0.17.0
+pytest-localserver>=0.7.1
coverage
python-dateutil
numpy
From c02deb9742027b9bed346b70f205b36625ba0c20 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 23 Jun 2023 19:16:09 -0700
Subject: [PATCH 2/7] Fix test, remove network function
---
.../development/contributing_codebase.rst | 19 +-
pandas/_testing/_io.py | 253 ------------------
pandas/tests/io/parser/test_network.py | 9 +-
pandas/tests/io/xml/test_xml.py | 1 +
4 files changed, 13 insertions(+), 269 deletions(-)
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 311120fc527d4..17d33bcb306bc 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -613,22 +613,15 @@ Testing involving network connectivity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and
-lack of ownership of the server that is being connected to. If network connectivity is absolutely required, use the
-``tm.network`` decorator.
+lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock
+the network connection using the ``httpserver`` fixture from the
+`pytest-localserver plugin. <https://github.com/pytest-dev/pytest-localserver>`_
.. code-block:: python
- @tm.network # noqa
- def test_network():
- result = package.call_to_internet()
-
-If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator.
-
-.. code-block:: python
-
- @tm.network("https://www.somespecificsite.com", check_before_test=True)
- def test_network():
- result = pd.read_html("https://www.somespecificsite.com")
+ def test_network(httpserver):
+ httpserver.serve_content(content="content")
+ result = pd.read_html(httpserver.url)
Example
^^^^^^^
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
index d79968a580e40..fa0bc58a132d4 100644
--- a/pandas/_testing/_io.py
+++ b/pandas/_testing/_io.py
@@ -1,10 +1,8 @@
from __future__ import annotations
import bz2
-from functools import wraps
import gzip
import io
-import socket
import tarfile
from typing import (
TYPE_CHECKING,
@@ -20,8 +18,6 @@
from pandas._testing._random import rands
from pandas._testing.contexts import ensure_clean
-from pandas.io.common import urlopen
-
if TYPE_CHECKING:
from pandas._typing import (
FilePath,
@@ -33,255 +29,6 @@
Series,
)
-# skip tests on exceptions with these messages
-_network_error_messages = (
- # 'urlopen error timed out',
- # 'timeout: timed out',
- # 'socket.timeout: timed out',
- "timed out",
- "Server Hangup",
- "HTTP Error 503: Service Unavailable",
- "502: Proxy Error",
- "HTTP Error 502: internal error",
- "HTTP Error 502",
- "HTTP Error 503",
- "HTTP Error 403",
- "HTTP Error 400",
- "Temporary failure in name resolution",
- "Name or service not known",
- "Connection refused",
- "certificate verify",
-)
-
-# or this e.errno/e.reason.errno
-_network_errno_vals = (
- 101, # Network is unreachable
- 111, # Connection refused
- 110, # Connection timed out
- 104, # Connection reset Error
- 54, # Connection reset by peer
- 60, # urllib.error.URLError: [Errno 60] Connection timed out
-)
-
-# Both of the above shouldn't mask real issues such as 404's
-# or refused connections (changed DNS).
-# But some tests (test_data yahoo) contact incredibly flakey
-# servers.
-
-# and conditionally raise on exception types in _get_default_network_errors
-
-
-def _get_default_network_errors():
- # Lazy import for http.client & urllib.error
- # because it imports many things from the stdlib
- import http.client
- import urllib.error
-
- return (
- OSError,
- http.client.HTTPException,
- TimeoutError,
- urllib.error.URLError,
- socket.timeout,
- )
-
-
-def optional_args(decorator):
- """
- allows a decorator to take optional positional and keyword arguments.
- Assumes that taking a single, callable, positional argument means that
- it is decorating a function, i.e. something like this::
-
- @my_decorator
- def function(): pass
-
- Calls decorator with decorator(f, *args, **kwargs)
- """
-
- @wraps(decorator)
- def wrapper(*args, **kwargs):
- def dec(f):
- return decorator(f, *args, **kwargs)
-
- is_decorating = not kwargs and len(args) == 1 and callable(args[0])
- if is_decorating:
- f = args[0]
- args = ()
- return dec(f)
- else:
- return dec
-
- return wrapper
-
-
-# error: Untyped decorator makes function "network" untyped
-@optional_args # type: ignore[misc]
-def network(
- t,
- url: str = "https://www.google.com",
- raise_on_error: bool = False,
- check_before_test: bool = False,
- error_classes=None,
- skip_errnos=_network_errno_vals,
- _skip_on_messages=_network_error_messages,
-):
- """
- Label a test as requiring network connection and, if an error is
- encountered, only raise if it does not find a network connection.
-
- In comparison to ``network``, this assumes an added contract to your test:
- you must assert that, under normal conditions, your test will ONLY fail if
- it does not have network connectivity.
-
- You can call this in 3 ways: as a standard decorator, with keyword
- arguments, or with a positional argument that is the url to check.
-
- Parameters
- ----------
- t : callable
- The test requiring network connectivity.
- url : path
- The url to test via ``pandas.io.common.urlopen`` to check
- for connectivity. Defaults to 'https://www.google.com'.
- raise_on_error : bool
- If True, never catches errors.
- check_before_test : bool
- If True, checks connectivity before running the test case.
- error_classes : tuple or Exception
- error classes to ignore. If not in ``error_classes``, raises the error.
- defaults to OSError. Be careful about changing the error classes here.
- skip_errnos : iterable of int
- Any exception that has .errno or .reason.erno set to one
- of these values will be skipped with an appropriate
- message.
- _skip_on_messages: iterable of string
- any exception e for which one of the strings is
- a substring of str(e) will be skipped with an appropriate
- message. Intended to suppress errors where an errno isn't available.
-
- Notes
- -----
- * ``raise_on_error`` supersedes ``check_before_test``
-
- Returns
- -------
- t : callable
- The decorated test ``t``, with checks for connectivity errors.
-
- Example
- -------
-
- Tests decorated with @network will fail if it's possible to make a network
- connection to another URL (defaults to google.com)::
-
- >>> from pandas import _testing as tm
- >>> @tm.network
- ... def test_network():
- ... with pd.io.common.urlopen("rabbit://bonanza.com"):
- ... pass
- >>> test_network() # doctest: +SKIP
- Traceback
- ...
- URLError:
-
- You can specify alternative URLs::
-
- >>> @tm.network("https://www.yahoo.com")
- ... def test_something_with_yahoo():
- ... raise OSError("Failure Message")
- >>> test_something_with_yahoo() # doctest: +SKIP
- Traceback (most recent call last):
- ...
- OSError: Failure Message
-
- If you set check_before_test, it will check the url first and not run the
- test on failure::
-
- >>> @tm.network("failing://url.blaher", check_before_test=True)
- ... def test_something():
- ... print("I ran!")
- ... raise ValueError("Failure")
- >>> test_something() # doctest: +SKIP
- Traceback (most recent call last):
- ...
-
- Errors not related to networking will always be raised.
- """
- import pytest
-
- if error_classes is None:
- error_classes = _get_default_network_errors()
-
- t.network = True
-
- @wraps(t)
- def wrapper(*args, **kwargs):
- if (
- check_before_test
- and not raise_on_error
- and not can_connect(url, error_classes)
- ):
- pytest.skip(
- f"May not have network connectivity because cannot connect to {url}"
- )
- try:
- return t(*args, **kwargs)
- except Exception as err:
- errno = getattr(err, "errno", None)
- if not errno and hasattr(errno, "reason"):
- # error: "Exception" has no attribute "reason"
- errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined]
-
- if errno in skip_errnos:
- pytest.skip(f"Skipping test due to known errno and error {err}")
-
- e_str = str(err)
-
- if any(m.lower() in e_str.lower() for m in _skip_on_messages):
- pytest.skip(
- f"Skipping test because exception message is known and error {err}"
- )
-
- if not isinstance(err, error_classes) or raise_on_error:
- raise
- pytest.skip(f"Skipping test due to lack of connectivity and error {err}")
-
- return wrapper
-
-
-def can_connect(url, error_classes=None) -> bool:
- """
- Try to connect to the given url. True if succeeds, False if OSError
- raised
-
- Parameters
- ----------
- url : basestring
- The URL to try to connect to
-
- Returns
- -------
- connectable : bool
- Return True if no OSError (unable to connect) or URLError (bad url) was
- raised
- """
- if error_classes is None:
- error_classes = _get_default_network_errors()
-
- try:
- with urlopen(url, timeout=20) as response:
- # Timeout just in case rate-limiting is applied
- if (
- response.info().get("Content-type") == "text/html"
- and response.status != 200
- ):
- return False
- except error_classes:
- return False
- else:
- return True
-
-
# ------------------------------------------------------------------
# File-IO
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 184d9338d4c1f..ba0307cf5111e 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -36,11 +36,14 @@ def test_compressed_urls(
):
# test reading compressed urls with various engines and
# extension inference
- compression_to_extension[compression_only]
- with open(datapath("io", "parser", "data", "salaries.csv")) as f:
+ if compression_only == "tar":
+ pytest.skip("TODO: Add tar salaries.csv to pandas/io/parsers/data")
+
+ extension = compression_to_extension[compression_only]
+ with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f:
httpserver.serve_content(content=f.read())
- url = httpserver.url
+ url = httpserver.url + "/salaries.csv" + extension
if mode != "explicit":
compression_only = mode
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 80c63d0e17de1..40609598916a6 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -482,6 +482,7 @@ def test_wrong_file_path_etree():
@pytest.mark.network
+@pytest.mark.single_cpu
@td.skip_if_no("lxml")
def test_url(httpserver, xml_file):
with open(xml_file) as f:
From c958362fa529c3975c7185e8dd9b67013ed775fa Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 23 Jun 2023 19:39:14 -0700
Subject: [PATCH 3/7] remove network from init
---
pandas/_testing/__init__.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index de3dd58d3b716..fbbdfa4b8a5bf 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -51,7 +51,6 @@
)
from pandas._testing._io import (
close,
- network,
round_trip_localpath,
round_trip_pathlib,
round_trip_pickle,
@@ -1150,7 +1149,6 @@ def shares_memory(left, right) -> bool:
"makeUIntIndex",
"maybe_produces_warning",
"NARROW_NP_DTYPES",
- "network",
"NP_NAT_OBJECTS",
"NULL_OBJECTS",
"OBJECT_DTYPES",
From 693d7170ebfdc957d110f11bce28b1520c45e85d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 23 Jun 2023 21:17:56 -0700
Subject: [PATCH 4/7] Ignore distutils from datareader, s3so
---
pandas/tests/io/test_s3.py | 17 ++++++++++++-----
pyproject.toml | 1 +
2 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
index 5c23000270c99..35250f1dd3081 100644
--- a/pandas/tests/io/test_s3.py
+++ b/pandas/tests/io/test_s3.py
@@ -20,16 +20,20 @@ def test_streaming_s3_objects():
@td.skip_if_no("s3fs")
-def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data):
+@pytest.mark.single_cpu
+def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so):
# GH 34626
- # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
- result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3)
+ result = read_csv(
+ f"s3://{s3_public_bucket_with_data.name}/tips.csv",
+ nrows=3,
+ storage_options=s3so,
+ )
assert len(result) == 3
@td.skip_if_no("s3fs")
@pytest.mark.single_cpu
-def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch):
+def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so):
# Ensure we can read from a public bucket with credentials
# GH 34626
@@ -38,6 +42,9 @@ def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch
monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
df = read_csv(
- f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None
+ f"s3://{s3_public_bucket_with_data.name}/tips.csv",
+ nrows=5,
+ header=None,
+ storage_options=s3so,
)
assert len(df) == 5
diff --git a/pyproject.toml b/pyproject.toml
index 0d1bca886a638..ef257b3143598 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -483,6 +483,7 @@ filterwarnings = [
"ignore:a closed node found in the registry:UserWarning:tables",
"ignore:`np.object` is a deprecated:DeprecationWarning:tables",
"ignore:tostring:DeprecationWarning:tables",
+ "ignore:distutils Version classes are deprecated:DeprecationWarning:pandas_datareader",
"ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr",
"ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet",
"ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec",
From 08549b4c26bc89ab5b2ecfced6f8614914862d76 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 23 Jun 2023 22:15:27 -0700
Subject: [PATCH 5/7] specify encoding
---
pandas/tests/io/parser/common/test_file_buffer_url.py | 2 +-
pandas/tests/io/test_html.py | 4 ++--
pandas/tests/io/xml/test_xml.py | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index c0f5c1a203e94..5ee629947db48 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -34,7 +34,7 @@ def test_url(all_parsers, csv_dir_path, httpserver):
kwargs = {"sep": "\t"}
local_path = os.path.join(csv_dir_path, "salaries.csv")
- with open(local_path) as f:
+ with open(local_path, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
url_result = parser.read_csv(httpserver.url, **kwargs)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index dd0ce417e1f15..d17e4b08b5a4d 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -194,7 +194,7 @@ def test_dtype_backend(self, string_storage, dtype_backend):
@pytest.mark.network
@pytest.mark.single_cpu
def test_banklist_url(self, httpserver, banklist_data):
- with open(banklist_data) as f:
+ with open(banklist_data, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
df1 = self.read_html(
# lxml cannot find attrs leave out for now
@@ -212,7 +212,7 @@ def test_banklist_url(self, httpserver, banklist_data):
@pytest.mark.network
@pytest.mark.single_cpu
def test_spam_url(self, httpserver, spam_data):
- with open(spam_data) as f:
+ with open(spam_data, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
df1 = self.read_html(httpserver.url, match=".*Water.*")
df2 = self.read_html(httpserver.url, match="Unit")
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 40609598916a6..49fb43bf40006 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -1390,7 +1390,7 @@ def test_file_io_iterparse(xml_books, parser, mode):
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_path_error(parser, httpserver, xml_file):
- with open(xml_file) as f:
+ with open(xml_file, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
with pytest.raises(
ParserError, match=("iterparse is designed for large XML files")
From 6b09451a583a78bcbdbf0b8c98da045869757d7b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Sat, 24 Jun 2023 10:46:02 -0700
Subject: [PATCH 6/7] Specify encoding
---
pandas/tests/io/xml/test_xml.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 49fb43bf40006..a3a1646bc4748 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -485,7 +485,7 @@ def test_wrong_file_path_etree():
@pytest.mark.single_cpu
@td.skip_if_no("lxml")
def test_url(httpserver, xml_file):
- with open(xml_file) as f:
+ with open(xml_file, encoding="utf-8") as f:
httpserver.serve_content(content=f.read())
df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]")
From 82acdc1ded645aa674fe2f858ca479e9917700e3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 26 Jun 2023 15:32:24 -0700
Subject: [PATCH 7/7] Clarify contributing doc
---
doc/source/development/contributing_codebase.rst | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
index 17d33bcb306bc..00f9fd74e01ca 100644
--- a/doc/source/development/contributing_codebase.rst
+++ b/doc/source/development/contributing_codebase.rst
@@ -612,13 +612,14 @@ deleted when the context block is exited.
Testing involving network connectivity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and
-lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock
-the network connection using the ``httpserver`` fixture from the
-`pytest-localserver plugin. `_
+A unit test should not access a public data set over the internet due to flakiness of network connections and
+lack of ownership of the server that is being connected to. To mock this interaction, use the ``httpserver`` fixture from the
+`pytest-localserver plugin <https://pypi.org/project/pytest-localserver/>`_ with synthetic data.
.. code-block:: python
+ @pytest.mark.network
+ @pytest.mark.single_cpu
def test_network(httpserver):
httpserver.serve_content(content="content")
result = pd.read_html(httpserver.url)