From 3bb72c7dc0af881fbefec789b00ba3274deb26ff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 18:31:53 -0700 Subject: [PATCH 1/7] Use pytest-localserver instead of making network connections --- ci/deps/actions-310.yaml | 1 + ci/deps/actions-311-downstream_compat.yaml | 1 + ci/deps/actions-311.yaml | 1 + ci/deps/actions-39-minimum_versions.yaml | 1 + ci/deps/actions-39.yaml | 1 + ci/deps/circle-310-arm64.yaml | 1 + environment.yml | 1 + pandas/tests/io/conftest.py | 15 +- pandas/tests/io/excel/test_readers.py | 18 +- pandas/tests/io/json/test_pandas.py | 12 +- .../io/parser/common/test_file_buffer_url.py | 21 +- pandas/tests/io/parser/test_network.py | 44 +-- pandas/tests/io/parser/test_read_fwf.py | 56 +-- pandas/tests/io/test_feather.py | 18 +- pandas/tests/io/test_html.py | 137 ++++--- pandas/tests/io/test_parquet.py | 18 +- pandas/tests/io/test_s3.py | 15 +- pandas/tests/io/xml/test_xml.py | 371 +++++++++++++----- pandas/tests/test_downstream.py | 23 +- requirements-dev.txt | 1 + 20 files changed, 448 insertions(+), 308 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0923594f2c840..ffa7732c604a0 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 51c7a97ad6500..596f3476c9c4e 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -16,6 +16,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 66b8650116854..9d60d734db5b3 100644 --- a/ci/deps/actions-311.yaml +++ 
b/ci/deps/actions-311.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index e1b4fdfb1d897..91961e4af2d1c 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 8ff47dbb9cc95..6ea0d41b947dc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index ca9860fc20742..df4e8e285bd02 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -15,6 +15,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/environment.yml b/environment.yml index 6178fe896760f..8fd97e6fcc0e1 100644 --- a/environment.yml +++ b/environment.yml @@ -17,6 +17,7 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - coverage # required dependencies diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index c9890032f408a..170e2f61e7d4a 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -45,6 +45,11 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") +@pytest.fixture +def xml_file(datapath): + return datapath("io", "data", "xml", "books.xml") + + @pytest.fixture def s3so(worker_id): if is_ci_environment(): @@ -141,7 +146,9 @@ def 
s3_public_bucket(s3_resource): @pytest.fixture -def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_file): +def s3_public_bucket_with_data( + s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file +): """ The following datasets are loaded. @@ -158,6 +165,7 @@ def s3_public_bucket_with_data(s3_public_bucket, tips_file, jsonl_file, feather_ ("tips.csv.bz2", tips_file + ".bz2"), ("items.jsonl", jsonl_file), ("simple_dataset.feather", feather_file), + ("books.xml", xml_file), ] for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: @@ -175,7 +183,9 @@ def s3_private_bucket(s3_resource): @pytest.fixture -def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feather_file): +def s3_private_bucket_with_data( + s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file +): """ The following datasets are loaded. @@ -192,6 +202,7 @@ def s3_private_bucket_with_data(s3_private_bucket, tips_file, jsonl_file, feathe ("tips.csv.bz2", tips_file + ".bz2"), ("items.jsonl", jsonl_file), ("simple_dataset.feather", feather_file), + ("books.xml", xml_file), ] for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 88f55145b599a..f507314928784 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -876,19 +876,11 @@ def test_corrupt_bytes_raises(self, engine): pd.read_excel(bad_stream) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/excel/test1.xlsx" - ), - check_before_test=True, - ) - def test_read_from_http_url(self, read_ext): - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/excel/test1" + read_ext - ) - url_table = pd.read_excel(url) + @pytest.mark.single_cpu + def test_read_from_http_url(self, httpserver, read_ext): + with 
open("test1" + read_ext, "rb") as f: + httpserver.serve_content(content=f.read()) + url_table = pd.read_excel(httpserver.url) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 90c48012ccac9..b6b21f9962876 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -998,10 +998,7 @@ def test_round_trip_exception(self, datapath): tm.assert_frame_equal(res, df) @pytest.mark.network - @tm.network( - url="https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5", - check_before_test=True, - ) + @pytest.mark.single_cpu @pytest.mark.parametrize( "field,dtype", [ @@ -1010,9 +1007,10 @@ def test_round_trip_exception(self, datapath): ["updated_at", pd.DatetimeTZDtype(tz="UTC")], ], ) - def test_url(self, field, dtype): - url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" - result = read_json(url, convert_dates=True) + def test_url(self, field, dtype, httpserver): + data = '{"created_at": ["2023-06-23T18:21:36Z"], "closed_at": ["2023-06-23T18:21:36"], "updated_at": ["2023-06-23T18:21:36Z"]}\n' # noqa: E501 + httpserver.serve_content(content=data) + result = read_json(httpserver.url, convert_dates=True) assert result[field].dtype == dtype def test_timedelta(self): diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index ba196a532adf6..c0f5c1a203e94 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -28,24 +28,17 @@ @pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) -def test_url(all_parsers, csv_dir_path): +@pytest.mark.single_cpu +def test_url(all_parsers, csv_dir_path, httpserver): parser = 
all_parsers kwargs = {"sep": "\t"} - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) - url_result = parser.read_csv(url, **kwargs) - local_path = os.path.join(csv_dir_path, "salaries.csv") + with open(local_path) as f: + httpserver.serve_content(content=f.read()) + + url_result = parser.read_csv(httpserver.url, **kwargs) + local_result = parser.read_csv(local_path, **kwargs) tm.assert_frame_equal(url_result, local_result) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index d8c58649984fa..184d9338d4c1f 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -22,27 +22,25 @@ @pytest.mark.network -@tm.network( - url=( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ), - check_before_test=True, -) +@pytest.mark.single_cpu @pytest.mark.parametrize("mode", ["explicit", "infer"]) @pytest.mark.parametrize("engine", ["python", "c"]) def test_compressed_urls( - salaries_table, mode, engine, compression_only, compression_to_extension + httpserver, + datapath, + salaries_table, + mode, + engine, + compression_only, + compression_to_extension, ): # test reading compressed urls with various engines and # extension inference - extension = compression_to_extension[compression_only] - base_url = ( - "https://github.com/pandas-dev/pandas/raw/main/" - "pandas/tests/io/parser/data/salaries.csv" - ) + compression_to_extension[compression_only] + with open(datapath("io", "parser", "data", "salaries.csv")) as f: + httpserver.serve_content(content=f.read()) - url = base_url + extension + url = httpserver.url if mode != "explicit": compression_only = mode @@ -52,24 +50,16 @@ def test_compressed_urls( @pytest.mark.network -@tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ), - 
check_before_test=True, -) -def test_url_encoding_csv(): +@pytest.mark.single_cpu +def test_url_encoding_csv(httpserver, datapath): """ read_csv should honor the requested encoding for URLs. GH 10424 """ - path = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/parser/data/unicode_series.csv" - ) - df = read_csv(path, encoding="latin-1", header=None) + with open(datapath("io", "parser", "data", "unicode_series.csv"), "rb") as f: + httpserver.serve_content(content=f.read()) + df = read_csv(httpserver.url, encoding="latin-1", header=None) assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index c682963c462cc..7f622295472e4 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -1013,47 +1013,19 @@ def test_invalid_dtype_backend(): @pytest.mark.network -@tm.network( - url="ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt", - check_before_test=True, -) -def test_url_urlopen(): - expected = pd.Index( - [ - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ], - dtype="object", - ) - url = "ftp://ftp.ncdc.noaa.gov/pub/data/igra/igra2-station-list.txt" - with urlopen(url) as f: - result = read_fwf( - f, - widths=(2, 1, 3, 5, 9, 10, 7, 4, 30, 5, 5, 7), - names=( - "CC", - "Network", - "Code", - "StationId", - "Latitude", - "Longitude", - "Elev", - "dummy", - "StationName", - "From", - "To", - "Nrec", - ), - ).columns +@pytest.mark.single_cpu +def test_url_urlopen(httpserver): + data = """\ +A B C D +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 173.237159 12468.3 +""" + httpserver.serve_content(content=data) + expected = pd.Index(list("ABCD")) + with 
urlopen(httpserver.url) as f: + result = read_fwf(f).columns tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 7b139dc45624e..9de097fe8c0e6 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -142,21 +142,13 @@ def test_passthrough_keywords(self): self.check_round_trip(df, write_kwargs={"version": 1}) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/feather/feather-0_3_1.feather" - ), - check_before_test=True, - ) - def test_http_path(self, feather_file): + @pytest.mark.single_cpu + def test_http_path(self, feather_file, httpserver): # GH 29055 - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/feather/feather-0_3_1.feather" - ) expected = read_feather(feather_file) - res = read_feather(url) + with open(feather_file, "rb") as f: + httpserver.serve_content(content=f.read()) + res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) def test_read_feather_dtype_backend(self, string_storage, dtype_backend): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 5c6c33de5ac5f..dd0ce417e1f15 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -35,7 +35,6 @@ ) from pandas.io.common import file_path_to_url -import pandas.io.html @pytest.fixture( @@ -193,43 +192,30 @@ def test_dtype_backend(self, string_storage, dtype_backend): tm.assert_frame_equal(result, expected) @pytest.mark.network - @tm.network( - url=( - "https://www.fdic.gov/resources/resolutions/" - "bank-failures/failed-bank-list/index.html" - ), - check_before_test=True, - ) - def test_banklist_url(self): - url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/index.html" # noqa: E501 - df1 = self.read_html( + @pytest.mark.single_cpu + def test_banklist_url(self, httpserver, 
banklist_data): + with open(banklist_data) as f: + httpserver.serve_content(content=f.read()) + df1 = self.read_html( + # lxml cannot find attrs leave out for now + httpserver.url, + match="First Federal Bank of Florida", # attrs={"class": "dataTable"} + ) # lxml cannot find attrs leave out for now - url, - match="First Federal Bank of Florida", # attrs={"class": "dataTable"} - ) - # lxml cannot find attrs leave out for now - df2 = self.read_html( - url, - match="Metcalf Bank", - ) # attrs={"class": "dataTable"}) + df2 = self.read_html( + httpserver.url, + match="Metcalf Bank", + ) # attrs={"class": "dataTable"}) assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/html/spam.html" - ), - check_before_test=True, - ) - def test_spam_url(self): - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/main/" - "pandas/tests/io/data/html/spam.html" - ) - df1 = self.read_html(url, match=".*Water.*") - df2 = self.read_html(url, match="Unit") + @pytest.mark.single_cpu + def test_spam_url(self, httpserver, spam_data): + with open(spam_data) as f: + httpserver.serve_content(content=f.read()) + df1 = self.read_html(httpserver.url, match=".*Water.*") + df2 = self.read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @@ -366,21 +352,19 @@ def test_file_like(self, spam_data): assert_framelist_equal(df1, df2) @pytest.mark.network - @tm.network - def test_bad_url_protocol(self): + @pytest.mark.single_cpu + def test_bad_url_protocol(self, httpserver): + httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): self.read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network - @tm.network - def test_invalid_url(self): - msg = ( - "Name or service not known|Temporary failure in name resolution|" - "No tables found" - ) - 
with pytest.raises((URLError, ValueError), match=msg): - self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") + @pytest.mark.single_cpu + def test_invalid_url(self, httpserver): + httpserver.serve_content("Name or service not known", code=404) + with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): + self.read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow def test_file_url(self, banklist_data): @@ -454,20 +438,69 @@ def test_negative_skiprows(self, spam_data): with pytest.raises(ValueError, match=msg): self.read_html(spam_data, match="Water", skiprows=-1) + @pytest.fixture + def python_docs(self): + return """ + + +
+ + + + + + + + + + + + +
+ +

Indices and tables:

+ + +
+ + + + + + +
+ """ # noqa: E501 + @pytest.mark.network - @tm.network(url="https://docs.python.org/2/", check_before_test=True) - def test_multiple_matches(self): - url = "https://docs.python.org/2/" - dfs = self.read_html(url, match="Python") + @pytest.mark.single_cpu + def test_multiple_matches(self, python_docs, httpserver): + httpserver.serve_content(content=python_docs) + dfs = self.read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network - @tm.network(url="https://docs.python.org/2/", check_before_test=True) - def test_python_docs_table(self): - url = "https://docs.python.org/2/" - dfs = self.read_html(url, match="Python") + @pytest.mark.single_cpu + def test_python_docs_table(self, python_docs, httpserver): + httpserver.serve_content(content=python_docs) + dfs = self.read_html(httpserver.url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] - assert sorted(zz) == sorted(["Repo", "What"]) + assert sorted(zz) == ["Pyth", "What"] def test_empty_tables(self): """ diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f2ff526a58f99..35bf75d3928f8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -376,21 +376,13 @@ def check_external_error_on_write(self, df, engine, exc): to_parquet(df, path, engine, compression=None) @pytest.mark.network - @tm.network( - url=( - "https://raw.githubusercontent.com/pandas-dev/pandas/" - "main/pandas/tests/io/data/parquet/simple.parquet" - ), - check_before_test=True, - ) - def test_parquet_read_from_url(self, df_compat, engine): + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): if engine != "auto": pytest.importorskip(engine) - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/" - "main/pandas/tests/io/data/parquet/simple.parquet" - ) - df = read_parquet(url) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + 
httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url) tm.assert_frame_equal(df, df_compat) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5171ec04b0bcf..5c23000270c99 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -5,7 +5,6 @@ import pandas.util._test_decorators as td from pandas import read_csv -import pandas._testing as tm def test_streaming_s3_objects(): @@ -21,28 +20,24 @@ def test_streaming_s3_objects(): @td.skip_if_no("s3fs") -@pytest.mark.network -@tm.network -def test_read_without_creds_from_pub_bucket(): +def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data): # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt - result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) + result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3) assert len(result) == 3 @td.skip_if_no("s3fs") -@pytest.mark.network -@tm.network -def test_read_with_creds_from_pub_bucket(monkeypatch): +@pytest.mark.single_cpu +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch): # Ensure we can read from a public bucket with credentials # GH 34626 - # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None + f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None ) assert len(df) == 5 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b0e806caecc80..80c63d0e17de1 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -14,7 +14,6 @@ import numpy as np import pytest -from pandas.compat 
import is_ci_environment from pandas.compat._optional import import_optional_dependency from pandas.errors import ( EmptyDataError, @@ -297,53 +296,16 @@ def test_parser_consistency_file(xml_books): @pytest.mark.network -@pytest.mark.slow -@tm.network( - url=( - "https://data.cityofchicago.org/api/views/" - "8pix-ypme/rows.xml?accessType=DOWNLOAD" - ), - check_before_test=True, -) -def test_parser_consistency_url(parser): - url = ( - "https://data.cityofchicago.org/api/views/" - "8pix-ypme/rows.xml?accessType=DOWNLOAD" - ) - - with tm.ensure_clean(filename="cta.xml") as path: - (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False)) +@pytest.mark.single_cpu +def test_parser_consistency_url(parser, httpserver): + httpserver.serve_content(content=xml_default_nmsp) - df_xpath = read_xml(path, parser=parser) - df_iter = read_xml( - path, - parser=parser, - iterparse={ - "row": [ - "_id", - "_uuid", - "_position", - "_address", - "stop_id", - "direction_id", - "stop_name", - "station_name", - "station_descriptive_name", - "map_id", - "ada", - "red", - "blue", - "g", - "brn", - "p", - "pexp", - "y", - "pnk", - "o", - "location", - ] - }, - ) + df_xpath = read_xml(xml_default_nmsp, parser=parser) + df_iter = read_xml( + BytesIO(xml_default_nmsp.encode()), + parser=parser, + iterparse={"row": ["shape", "degrees", "sides"]}, + ) tm.assert_frame_equal(df_xpath, df_iter) @@ -520,14 +482,11 @@ def test_wrong_file_path_etree(): @pytest.mark.network -@tm.network( - url="https://www.w3schools.com/xml/books.xml", - check_before_test=True, -) @td.skip_if_no("lxml") -def test_url(): - url = "https://www.w3schools.com/xml/books.xml" - df_url = read_xml(url, xpath=".//book[count(*)=4]") +def test_url(httpserver, xml_file): + with open(xml_file) as f: + httpserver.serve_content(content=f.read()) + df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]") df_expected = DataFrame( { @@ -536,7 +495,6 @@ def test_url(): "author": ["Giada De Laurentiis", "J K. 
Rowling", "Erik T. Ray"], "year": [2005, 2005, 2003], "price": [30.00, 29.99, 39.95], - "cover": [None, None, "paperback"], } ) @@ -544,11 +502,11 @@ def test_url(): @pytest.mark.network -@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True) -def test_wrong_url(parser): - with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")): - url = "https://www.w3schools.com/xml/python.xml" - read_xml(url, xpath=".//book[count(*)=4]", parser=parser) +@pytest.mark.single_cpu +def test_wrong_url(parser, httpserver): + httpserver.serve_content("NOT FOUND", code=404) + with pytest.raises(HTTPError, match=("HTTP Error 404: NOT FOUND")): + read_xml(httpserver.url, xpath=".//book[count(*)=4]", parser=parser) # XPATH @@ -1429,17 +1387,18 @@ def test_file_io_iterparse(xml_books, parser, mode): @pytest.mark.network -@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True) -def test_url_path_error(parser): - url = "https://www.w3schools.com/xml/books.xml" - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - url, - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) +@pytest.mark.single_cpu +def test_url_path_error(parser, httpserver, xml_file): + with open(xml_file) as f: + httpserver.serve_content(content=f.read()) + with pytest.raises( + ParserError, match=("iterparse is designed for large XML files") + ): + read_xml( + httpserver.url, + parser=parser, + iterparse={"row": ["shape", "degrees", "sides", "date"]}, + ) def test_compression_error(parser, compression_only): @@ -1641,14 +1600,245 @@ def test_empty_data(xml_books, parser): ) -@pytest.mark.network @td.skip_if_no("lxml") -@tm.network( - url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True -) def test_online_stylesheet(): - xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml" - xsl = "https://www.w3schools.com/xml/cdcatalog.xsl" + xml = """\ + + + 
+ Empire Burlesque + Bob Dylan + USA + Columbia + 10.90 + 1985 + + + Hide your heart + Bonnie Tyler + UK + CBS Records + 9.90 + 1988 + + + Greatest Hits + Dolly Parton + USA + RCA + 9.90 + 1982 + + + Still got the blues + Gary Moore + UK + Virgin records + 10.20 + 1990 + + + Eros + Eros Ramazzotti + EU + BMG + 9.90 + 1997 + + + One night only + Bee Gees + UK + Polydor + 10.90 + 1998 + + + Sylvias Mother + Dr.Hook + UK + CBS + 8.10 + 1973 + + + Maggie May + Rod Stewart + UK + Pickwick + 8.50 + 1990 + + + Romanza + Andrea Bocelli + EU + Polydor + 10.80 + 1996 + + + When a man loves a woman + Percy Sledge + USA + Atlantic + 8.70 + 1987 + + + Black angel + Savage Rose + EU + Mega + 10.90 + 1995 + + + 1999 Grammy Nominees + Many + USA + Grammy + 10.20 + 1999 + + + For the good times + Kenny Rogers + UK + Mucik Master + 8.70 + 1995 + + + Big Willie style + Will Smith + USA + Columbia + 9.90 + 1997 + + + Tupelo Honey + Van Morrison + UK + Polydor + 8.20 + 1971 + + + Soulsville + Jorn Hoel + Norway + WEA + 7.90 + 1996 + + + The very best of + Cat Stevens + UK + Island + 8.90 + 1990 + + + Stop + Sam Brown + UK + A and M + 8.90 + 1988 + + + Bridge of Spies + T`Pau + UK + Siren + 7.90 + 1987 + + + Private Dancer + Tina Turner + UK + Capitol + 8.90 + 1983 + + + Midt om natten + Kim Larsen + EU + Medley + 7.80 + 1983 + + + Pavarotti Gala Concert + Luciano Pavarotti + UK + DECCA + 9.90 + 1991 + + + The dock of the bay + Otis Redding + USA + Stax Records + 7.90 + 1968 + + + Picture book + Simply Red + EU + Elektra + 7.20 + 1985 + + + Red + The Communards + UK + London + 7.80 + 1987 + + + Unchain my heart + Joe Cocker + USA + EMI + 8.20 + 1987 + + +""" + xsl = """\ + + + + + +

My CD Collection

+ + + + + + + + + + + +
TitleArtist
+ + +
+
+""" df_xsl = read_xml( xml, @@ -1740,32 +1930,15 @@ def test_unsuported_compression(parser): @pytest.mark.network +@pytest.mark.single_cpu @td.skip_if_no("s3fs") @td.skip_if_no("lxml") -@pytest.mark.skipif( - is_ci_environment(), - reason="2022.1.17: Hanging on the CI min versions build.", -) -@tm.network -def test_s3_parser_consistency(): - # Python Software Foundation (2019 IRS-990 RETURN) - s3 = "s3://irs-form-990/201923199349319487_public.xml" +def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): + s3 = f"s3://{s3_public_bucket_with_data.name}/books.xml" - df_lxml = read_xml( - s3, - xpath=".//irs:Form990PartVIISectionAGrp", - namespaces={"irs": "http://www.irs.gov/efile"}, - parser="lxml", - storage_options={"anon": True}, - ) + df_lxml = read_xml(s3, parser="lxml", storage_options=s3so) - df_etree = read_xml( - s3, - xpath=".//irs:Form990PartVIISectionAGrp", - namespaces={"irs": "http://www.irs.gov/efile"}, - parser="etree", - storage_options={"anon": True}, - ) + df_etree = read_xml(s3, parser="etree", storage_options=s3so) tm.assert_frame_equal(df_lxml, df_etree) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 0f5fdbefd13d2..7354e313e24f4 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -139,14 +139,13 @@ def test_oo_optimized_datetime_index_unpickle(): ) -@pytest.mark.network -@tm.network def test_statsmodels(): statsmodels = import_module("statsmodels") # noqa: F841 - import statsmodels.api as sm import statsmodels.formula.api as smf - df = sm.datasets.get_rdataset("Guerry", "HistData").data + df = DataFrame( + {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)} + ) smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit() @@ -163,11 +162,11 @@ def test_scikit_learn(): clf.predict(digits.data[-1:]) -@pytest.mark.network -@tm.network def test_seaborn(): seaborn = import_module("seaborn") - tips = seaborn.load_dataset("tips") + tips = DataFrame( 
+ {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)} + ) seaborn.stripplot(x="day", y="total_bill", data=tips) @@ -177,16 +176,8 @@ def test_pandas_gbq(): pandas_gbq = import_module("pandas_gbq") # noqa: F841 -@pytest.mark.network -@tm.network -@pytest.mark.xfail( - raises=ValueError, - reason="The Quandl API key must be provided either through the api_key " - "variable or through the environmental variable QUANDL_API_KEY", -) def test_pandas_datareader(): - pandas_datareader = import_module("pandas_datareader") - pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") + pandas_datareader = import_module("pandas_datareader") # noqa: F841 def test_pyarrow(df): diff --git a/requirements-dev.txt b/requirements-dev.txt index 38a2ce7f66aa3..b1d8ce1cf2143 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,6 +10,7 @@ pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 pytest-asyncio>=0.17.0 +pytest-localserver>=0.7.1 coverage python-dateutil numpy From c02deb9742027b9bed346b70f205b36625ba0c20 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:16:09 -0700 Subject: [PATCH 2/7] Fix test, remove network function --- .../development/contributing_codebase.rst | 19 +- pandas/_testing/_io.py | 253 ------------------ pandas/tests/io/parser/test_network.py | 9 +- pandas/tests/io/xml/test_xml.py | 1 + 4 files changed, 13 insertions(+), 269 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 311120fc527d4..17d33bcb306bc 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -613,22 +613,15 @@ Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and -lack of ownership of the server that is 
being connected to. If network connectivity is absolutely required, use the -``tm.network`` decorator. +lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock +the network connection using the ``httpserver`` fixture from the +`pytest-localserver plugin. `_ .. code-block:: python - @tm.network # noqa - def test_network(): - result = package.call_to_internet() - -If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator. - -.. code-block:: python - - @tm.network("https://www.somespecificsite.com", check_before_test=True) - def test_network(): - result = pd.read_html("https://www.somespecificsite.com") + def test_network(httpserver): + httpserver.serve_content(content="content") + result = pd.read_html(httpserver.url) Example ^^^^^^^ diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py index d79968a580e40..fa0bc58a132d4 100644 --- a/pandas/_testing/_io.py +++ b/pandas/_testing/_io.py @@ -1,10 +1,8 @@ from __future__ import annotations import bz2 -from functools import wraps import gzip import io -import socket import tarfile from typing import ( TYPE_CHECKING, @@ -20,8 +18,6 @@ from pandas._testing._random import rands from pandas._testing.contexts import ensure_clean -from pandas.io.common import urlopen - if TYPE_CHECKING: from pandas._typing import ( FilePath, @@ -33,255 +29,6 @@ Series, ) -# skip tests on exceptions with these messages -_network_error_messages = ( - # 'urlopen error timed out', - # 'timeout: timed out', - # 'socket.timeout: timed out', - "timed out", - "Server Hangup", - "HTTP Error 503: Service Unavailable", - "502: Proxy Error", - "HTTP Error 502: internal error", - "HTTP Error 502", - "HTTP Error 503", - "HTTP Error 403", - "HTTP Error 400", - "Temporary failure in name resolution", - "Name or service not known", - "Connection refused", - "certificate verify", -) - -# or this e.errno/e.reason.errno -_network_errno_vals = ( 
- 101, # Network is unreachable - 111, # Connection refused - 110, # Connection timed out - 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out -) - -# Both of the above shouldn't mask real issues such as 404's -# or refused connections (changed DNS). -# But some tests (test_data yahoo) contact incredibly flakey -# servers. - -# and conditionally raise on exception types in _get_default_network_errors - - -def _get_default_network_errors(): - # Lazy import for http.client & urllib.error - # because it imports many things from the stdlib - import http.client - import urllib.error - - return ( - OSError, - http.client.HTTPException, - TimeoutError, - urllib.error.URLError, - socket.timeout, - ) - - -def optional_args(decorator): - """ - allows a decorator to take optional positional and keyword arguments. - Assumes that taking a single, callable, positional argument means that - it is decorating a function, i.e. something like this:: - - @my_decorator - def function(): pass - - Calls decorator with decorator(f, *args, **kwargs) - """ - - @wraps(decorator) - def wrapper(*args, **kwargs): - def dec(f): - return decorator(f, *args, **kwargs) - - is_decorating = not kwargs and len(args) == 1 and callable(args[0]) - if is_decorating: - f = args[0] - args = () - return dec(f) - else: - return dec - - return wrapper - - -# error: Untyped decorator makes function "network" untyped -@optional_args # type: ignore[misc] -def network( - t, - url: str = "https://www.google.com", - raise_on_error: bool = False, - check_before_test: bool = False, - error_classes=None, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, -): - """ - Label a test as requiring network connection and, if an error is - encountered, only raise if it does not find a network connection. 
- - In comparison to ``network``, this assumes an added contract to your test: - you must assert that, under normal conditions, your test will ONLY fail if - it does not have network connectivity. - - You can call this in 3 ways: as a standard decorator, with keyword - arguments, or with a positional argument that is the url to check. - - Parameters - ---------- - t : callable - The test requiring network connectivity. - url : path - The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'https://www.google.com'. - raise_on_error : bool - If True, never catches errors. - check_before_test : bool - If True, checks connectivity before running the test case. - error_classes : tuple or Exception - error classes to ignore. If not in ``error_classes``, raises the error. - defaults to OSError. Be careful about changing the error classes here. - skip_errnos : iterable of int - Any exception that has .errno or .reason.erno set to one - of these values will be skipped with an appropriate - message. - _skip_on_messages: iterable of string - any exception e for which one of the strings is - a substring of str(e) will be skipped with an appropriate - message. Intended to suppress errors where an errno isn't available. - - Notes - ----- - * ``raise_on_error`` supersedes ``check_before_test`` - - Returns - ------- - t : callable - The decorated test ``t``, with checks for connectivity errors. - - Example - ------- - - Tests decorated with @network will fail if it's possible to make a network - connection to another URL (defaults to google.com):: - - >>> from pandas import _testing as tm - >>> @tm.network - ... def test_network(): - ... with pd.io.common.urlopen("rabbit://bonanza.com"): - ... pass - >>> test_network() # doctest: +SKIP - Traceback - ... - URLError: - - You can specify alternative URLs:: - - >>> @tm.network("https://www.yahoo.com") - ... def test_something_with_yahoo(): - ... 
raise OSError("Failure Message") - >>> test_something_with_yahoo() # doctest: +SKIP - Traceback (most recent call last): - ... - OSError: Failure Message - - If you set check_before_test, it will check the url first and not run the - test on failure:: - - >>> @tm.network("failing://url.blaher", check_before_test=True) - ... def test_something(): - ... print("I ran!") - ... raise ValueError("Failure") - >>> test_something() # doctest: +SKIP - Traceback (most recent call last): - ... - - Errors not related to networking will always be raised. - """ - import pytest - - if error_classes is None: - error_classes = _get_default_network_errors() - - t.network = True - - @wraps(t) - def wrapper(*args, **kwargs): - if ( - check_before_test - and not raise_on_error - and not can_connect(url, error_classes) - ): - pytest.skip( - f"May not have network connectivity because cannot connect to {url}" - ) - try: - return t(*args, **kwargs) - except Exception as err: - errno = getattr(err, "errno", None) - if not errno and hasattr(errno, "reason"): - # error: "Exception" has no attribute "reason" - errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined] - - if errno in skip_errnos: - pytest.skip(f"Skipping test due to known errno and error {err}") - - e_str = str(err) - - if any(m.lower() in e_str.lower() for m in _skip_on_messages): - pytest.skip( - f"Skipping test because exception message is known and error {err}" - ) - - if not isinstance(err, error_classes) or raise_on_error: - raise - pytest.skip(f"Skipping test due to lack of connectivity and error {err}") - - return wrapper - - -def can_connect(url, error_classes=None) -> bool: - """ - Try to connect to the given url. 
True if succeeds, False if OSError - raised - - Parameters - ---------- - url : basestring - The URL to try to connect to - - Returns - ------- - connectable : bool - Return True if no OSError (unable to connect) or URLError (bad url) was - raised - """ - if error_classes is None: - error_classes = _get_default_network_errors() - - try: - with urlopen(url, timeout=20) as response: - # Timeout just in case rate-limiting is applied - if ( - response.info().get("Content-type") == "text/html" - and response.status != 200 - ): - return False - except error_classes: - return False - else: - return True - - # ------------------------------------------------------------------ # File-IO diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 184d9338d4c1f..ba0307cf5111e 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -36,11 +36,14 @@ def test_compressed_urls( ): # test reading compressed urls with various engines and # extension inference - compression_to_extension[compression_only] - with open(datapath("io", "parser", "data", "salaries.csv")) as f: + if compression_only == "tar": + pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data") + + extension = compression_to_extension[compression_only] + with open(datapath("io", "parser", "data", "salaries.csv" + extension), "rb") as f: httpserver.serve_content(content=f.read()) - url = httpserver.url + url = httpserver.url + "/salaries.csv" + extension if mode != "explicit": compression_only = mode diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 80c63d0e17de1..40609598916a6 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -482,6 +482,7 @@ def test_wrong_file_path_etree(): @pytest.mark.network +@pytest.mark.single_cpu @td.skip_if_no("lxml") def test_url(httpserver, xml_file): with open(xml_file) as f: From c958362fa529c3975c7185e8dd9b67013ed775fa Mon 
Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 19:39:14 -0700 Subject: [PATCH 3/7] remove network from init --- pandas/_testing/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index de3dd58d3b716..fbbdfa4b8a5bf 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -51,7 +51,6 @@ ) from pandas._testing._io import ( close, - network, round_trip_localpath, round_trip_pathlib, round_trip_pickle, @@ -1150,7 +1149,6 @@ def shares_memory(left, right) -> bool: "makeUIntIndex", "maybe_produces_warning", "NARROW_NP_DTYPES", - "network", "NP_NAT_OBJECTS", "NULL_OBJECTS", "OBJECT_DTYPES", From 693d7170ebfdc957d110f11bce28b1520c45e85d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 21:17:56 -0700 Subject: [PATCH 4/7] Ignore distutils from datareader, s3so --- pandas/tests/io/test_s3.py | 17 ++++++++++++----- pyproject.toml | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5c23000270c99..35250f1dd3081 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -20,16 +20,20 @@ def test_streaming_s3_objects(): @td.skip_if_no("s3fs") -def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data): +@pytest.mark.single_cpu +def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # GH 34626 - # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt - result = read_csv(f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=3) + result = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=3, + storage_options=s3so, + ) assert len(result) == 3 @td.skip_if_no("s3fs") @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch): +def
test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 @@ -38,6 +42,9 @@ def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, header=None + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=5, + header=None, + storage_options=s3so, ) assert len(df) == 5 diff --git a/pyproject.toml b/pyproject.toml index 0d1bca886a638..ef257b3143598 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -483,6 +483,7 @@ filterwarnings = [ "ignore:a closed node found in the registry:UserWarning:tables", "ignore:`np.object` is a deprecated:DeprecationWarning:tables", "ignore:tostring:DeprecationWarning:tables", + "ignore:distutils Version classes are deprecated:DeprecationWarning:pandas_datareader", "ignore:distutils Version classes are deprecated:DeprecationWarning:numexpr", "ignore:distutils Version classes are deprecated:DeprecationWarning:fastparquet", "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", From 08549b4c26bc89ab5b2ecfced6f8614914862d76 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 23 Jun 2023 22:15:27 -0700 Subject: [PATCH 5/7] specify encoding --- pandas/tests/io/parser/common/test_file_buffer_url.py | 2 +- pandas/tests/io/test_html.py | 4 ++-- pandas/tests/io/xml/test_xml.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c0f5c1a203e94..5ee629947db48 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -34,7 +34,7 @@ def test_url(all_parsers, 
csv_dir_path, httpserver): kwargs = {"sep": "\t"} local_path = os.path.join(csv_dir_path, "salaries.csv") - with open(local_path) as f: + with open(local_path, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) url_result = parser.read_csv(httpserver.url, **kwargs) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index dd0ce417e1f15..d17e4b08b5a4d 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -194,7 +194,7 @@ def test_dtype_backend(self, string_storage, dtype_backend): @pytest.mark.network @pytest.mark.single_cpu def test_banklist_url(self, httpserver, banklist_data): - with open(banklist_data) as f: + with open(banklist_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = self.read_html( # lxml cannot find attrs leave out for now @@ -212,7 +212,7 @@ def test_banklist_url(self, httpserver, banklist_data): @pytest.mark.network @pytest.mark.single_cpu def test_spam_url(self, httpserver, spam_data): - with open(spam_data) as f: + with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df1 = self.read_html(httpserver.url, match=".*Water.*") df2 = self.read_html(httpserver.url, match="Unit") diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 40609598916a6..49fb43bf40006 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1390,7 +1390,7 @@ def test_file_io_iterparse(xml_books, parser, mode): @pytest.mark.network @pytest.mark.single_cpu def test_url_path_error(parser, httpserver, xml_file): - with open(xml_file) as f: + with open(xml_file, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) with pytest.raises( ParserError, match=("iterparse is designed for large XML files") From 6b09451a583a78bcbdbf0b8c98da045869757d7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:46:02 -0700 Subject: 
[PATCH 6/7] Specify encoding --- pandas/tests/io/xml/test_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 49fb43bf40006..a3a1646bc4748 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -485,7 +485,7 @@ def test_wrong_file_path_etree(): @pytest.mark.single_cpu @td.skip_if_no("lxml") def test_url(httpserver, xml_file): - with open(xml_file) as f: + with open(xml_file, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]") From 82acdc1ded645aa674fe2f858ca479e9917700e3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 26 Jun 2023 15:32:24 -0700 Subject: [PATCH 7/7] Clarify contributing doc --- doc/source/development/contributing_codebase.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 17d33bcb306bc..00f9fd74e01ca 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -612,13 +612,14 @@ deleted when the context block is exited. Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and -lack of ownership of the server that is being connected to. If network connectivity is absolutely required, mock -the network connection using the ``httpserver`` fixture from the -`pytest-localserver plugin. <https://github.com/pytest-dev/pytest-localserver>`_ +A unit test should not access a public data set over the internet due to flakiness of network connections and +lack of ownership of the server that is being connected to. To mock this interaction, use the ``httpserver`` fixture from the
+`pytest-localserver plugin. <https://github.com/pytest-dev/pytest-localserver>`_ with synthetic data. ..
code-block:: python + @pytest.mark.network + @pytest.mark.single_cpu def test_network(httpserver): httpserver.serve_content(content="content") result = pd.read_html(httpserver.url)