diff --git a/pandas/conftest.py b/pandas/conftest.py index 1dcfc88eb1bfd..b2f1377a9fb32 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -30,6 +30,7 @@ from decimal import Decimal import operator import os +from pathlib import Path from typing import ( Callable, Hashable, @@ -1167,6 +1168,16 @@ def strict_data_files(pytestconfig): return pytestconfig.getoption("--strict-data-files") +@pytest.fixture +def tests_path() -> Path: + return Path(__file__).parent / "tests" + + +@pytest.fixture +def tests_io_data_path(tests_path) -> Path: + return tests_path / "io" / "data" + + @pytest.fixture def datapath(strict_data_files: str) -> Callable[..., str]: """ diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 65cc369416352..2aec361d46b99 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,6 +5,7 @@ from __future__ import annotations import io +from os import PathLike from typing import ( TYPE_CHECKING, Any, @@ -326,10 +327,13 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: ) if (not hasattr(self.path_or_buffer, "read")) and ( - not isinstance(self.path_or_buffer, str) + not isinstance(self.path_or_buffer, (str, PathLike)) or is_url(self.path_or_buffer) or is_fsspec_url(self.path_or_buffer) - or self.path_or_buffer.startswith((" """ - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - with pytest.raises( XMLSyntaxError, match=("Extra content at the end of the document") ): - read_xml(kml, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=xsl) @td.skip_if_no("lxml") -def test_incorrect_xsl_eval(datapath): +def test_incorrect_xsl_eval(kml_cta_rail_lines): from lxml.etree import XSLTParseError xsl = """\ @@ -1313,14 +1263,12 @@ def test_incorrect_xsl_eval(datapath): """ - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - with pytest.raises(XSLTParseError, match=("failed to compile")): - read_xml(kml, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=xsl) @td.skip_if_no("lxml") -def test_incorrect_xsl_apply(datapath): +def test_incorrect_xsl_apply(kml_cta_rail_lines): from lxml.etree import XSLTApplyError xsl = """\ @@ -1335,55 +1283,46 @@ def test_incorrect_xsl_apply(datapath): """ - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")): - read_xml(kml, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=xsl) @td.skip_if_no("lxml") -def test_wrong_stylesheet(): +def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path): from lxml.etree import XMLSyntaxError - kml = os.path.join("data", "xml", "cta_rail_lines.kml") - xsl = os.path.join("data", "xml", "flatten.xsl") + xsl = xml_data_path / "flatten.xsl" with pytest.raises( XMLSyntaxError, match=("Start tag expected, '<' not found"), ): - read_xml(kml, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=xsl) @td.skip_if_no("lxml") -def test_stylesheet_file_close(datapath, mode): - kml = datapath("io", "data", "xml", "cta_rail_lines.kml") - xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - +def test_stylesheet_file_close(kml_cta_rail_lines, xsl_flatten_doc, mode): # note: By default the bodies of untyped functions are not checked, # consider using --check-untyped-defs xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] - with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: + with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: if mode == "rb": xsl_obj = BytesIO(f.read()) else: xsl_obj = StringIO(f.read()) - read_xml(kml, stylesheet=xsl_obj) + read_xml(kml_cta_rail_lines, stylesheet=xsl_obj) assert not f.closed @td.skip_if_no("lxml") -def test_stylesheet_with_etree(): - kml = os.path.join("data", "xml", "cta_rail_lines.kml") - xsl = os.path.join("data", "xml", "flatten_doc.xsl") - +def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): with pytest.raises( ValueError, match=("To use stylesheet, you need lxml installed") ): - read_xml(kml, parser="etree", stylesheet=xsl) + read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc) @td.skip_if_no("lxml") @@ -1413,10 +1352,8 @@ def test_string_error(parser): ) -def test_file_like_iterparse(datapath, parser, mode): - filename = datapath("io", "data", "xml", "books.xml") - - with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f: +def test_file_like_iterparse(xml_books, parser, mode): + with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f: if mode == "r" and parser == "lxml": with pytest.raises( TypeError, match=("reading file objects must return bytes objects") @@ -1449,12 +1386,10 @@ def test_file_like_iterparse(datapath, parser, mode): tm.assert_frame_equal(df_filelike, df_expected) -def test_file_io_iterparse(datapath, parser, mode): - filename = datapath("io", "data", "xml", "books.xml") - +def test_file_io_iterparse(xml_books, parser, mode): funcIO = StringIO if mode == "r" else BytesIO with open( - filename, + xml_books, mode, encoding="utf-8" if mode == "r" else None, ) as f: @@ -1522,22 +1457,20 @@ def test_compression_error(parser, compression_only): ) -def test_wrong_dict_type(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") +def test_wrong_dict_type(xml_books, parser): with pytest.raises(TypeError, match="list is not a valid type for iterparse"): read_xml( - filename, + xml_books, parser=parser, iterparse=["category", "title", "year", "author", "price"], ) -def test_wrong_dict_value(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") +def test_wrong_dict_value(xml_books, parser): with pytest.raises( TypeError, match=" is not a valid type for value in iterparse" ): - read_xml(filename, parser=parser, iterparse={"book": "category"}) + read_xml(xml_books, parser=parser, iterparse={"book": "category"}) def test_bad_xml(parser): @@ -1688,23 +1621,21 @@ def test_processing_instruction(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_no_result(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") +def test_no_result(xml_books, parser): with pytest.raises( ParserError, match="No result from selected items in iterparse." ): read_xml( - filename, + xml_books, parser=parser, iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]}, ) -def test_empty_data(datapath, parser): - filename = datapath("io", "data", "xml", "books.xml") +def test_empty_data(xml_books, parser): with pytest.raises(EmptyDataError, match="No columns to parse from file"): read_xml( - filename, + xml_books, parser=parser, iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]}, )