Skip to content

ENH: Add I/O support of XML with pandas.read_xml and DataFrame.to_xml… #39516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 60 commits into from
Feb 27, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
b67d876
ENH: Add i/o support of XML with pandas.read_xml and DataFrame.to_xml…
ParfaitG Feb 1, 2021
98e3bcd
Merge branch 'master' into read_xml
ParfaitG Feb 1, 2021
cd79a06
Refactor code for base classes, add tests, adjust whatsnew entry
ParfaitG Feb 3, 2021
6c06dc2
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 3, 2021
fadcb67
Fixed import_optional_dependency() args
ParfaitG Feb 3, 2021
ac5fd3a
Fix fixture and param name collision and check two errors in tests
ParfaitG Feb 3, 2021
25ba341
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 3, 2021
143402a
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 3, 2021
938b0a0
Adjusted tests to handle etree version issues
ParfaitG Feb 3, 2021
a92c21e
Add appropriate etree skips in tests
ParfaitG Feb 3, 2021
51f10f2
Remove check for warnings in tests
ParfaitG Feb 3, 2021
3520d58
Adjust code to conform to mypy and docstring validation
ParfaitG Feb 4, 2021
4832562
Add read_xml to TestPDApi test and fix for etree test
ParfaitG Feb 4, 2021
2914c32
Add read_xml to TestPDApi test and fix for etree test
ParfaitG Feb 4, 2021
72d0e93
Replace lxml ImportWarning for ImportError with added tests
ParfaitG Feb 4, 2021
6453f6e
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 4, 2021
8af695e
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 5, 2021
b80b8ce
Adjust fixture for lxml skip and add error validation in tests
ParfaitG Feb 5, 2021
a6cfc90
Add conditional skips for envs without lxml
ParfaitG Feb 5, 2021
6c4e0b4
Clean up whatsnew rst of rebase issue
ParfaitG Feb 5, 2021
a57fd35
Fix unescaped emphasis and wording in read_xml docstring
ParfaitG Feb 5, 2021
16cbcd3
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 8, 2021
23439b4
Add XML section in io.rst and lxml dependency for read_xml in install…
ParfaitG Feb 8, 2021
2effae0
Add section title in whatsnew and tree builder for lxml dependency in…
ParfaitG Feb 10, 2021
878eebe
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 10, 2021
35fa6a6
Clean up merge issue in whatsnew, remove escape in io.rst, adjust exc…
ParfaitG Feb 11, 2021
80d44f9
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 11, 2021
f861d53
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 11, 2021
947840a
Remove redundant try/except and fix default namespace condition
ParfaitG Feb 16, 2021
f8dc56c
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 16, 2021
cb34dde
Replace path or buffer handling with get_handle and add compression a…
ParfaitG Feb 20, 2021
3133486
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 20, 2021
a7716b8
Fix issues in tests from other Python envs
ParfaitG Feb 21, 2021
701d225
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 21, 2021
5b93c16
Fix precommit issue with import line
ParfaitG Feb 21, 2021
9a0dfb4
Adjust code and tests per twoertwein comments
ParfaitG Feb 21, 2021
9556035
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 21, 2021
82ac370
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 22, 2021
c478cb0
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 22, 2021
e23200d
Remove redundancy and object names in XML parse and rename tests for …
ParfaitG Feb 23, 2021
b0b3759
Resolve merge conflict with upstream/master
ParfaitG Feb 23, 2021
b48e257
Add XML table in install.rst
ParfaitG Feb 23, 2021
453ac40
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 23, 2021
9b21636
Streamline filepath_or_buffer handling and add TypeError tests
ParfaitG Feb 23, 2021
bea318c
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 23, 2021
49343b1
Fix lxml test on few Python envs
ParfaitG Feb 23, 2021
ce986bc
Adjust io handling in context manager
ParfaitG Feb 24, 2021
347d58b
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 24, 2021
e2f80db
Add and fix tests for special filepath_or_buffer values
ParfaitG Feb 24, 2021
c7e1e11
Fix tests for better example and wrong parser
ParfaitG Feb 24, 2021
9790e7c
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 24, 2021
df9ecf4
Adjust to handle empty string stylesheet with tests
ParfaitG Feb 24, 2021
46719b7
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 24, 2021
5d75d51
Move methods out of class, adjust xpath check, and data frame formatting
ParfaitG Feb 25, 2021
66c01d2
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 25, 2021
5c0af6e
Update tests to conform to mypy
ParfaitG Feb 25, 2021
2eae8ad
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 25, 2021
603644e
Import methods to avoid duplication and add typing to parse_doc
ParfaitG Feb 27, 2021
3ec7297
Merge remote-tracking branch 'upstream/master' into read_xml
ParfaitG Feb 27, 2021
6194f83
Refactor code and revert changes to avoid optional module type hints
ParfaitG Feb 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions pandas/tests/io/formats/test_to_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,12 +1074,6 @@ def test_stylesheet_wrong_path(datapath):
geom_df.to_xml(stylesheet=xsl)


@td.skip_if_no("lxml")
def test_stylesheet_not_path_buffer():
with pytest.raises(AttributeError, match=("__enter__")):
geom_df.to_xml(stylesheet=DataFrame)


@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_stylesheet(val):
Expand Down
65 changes: 44 additions & 21 deletions pandas/tests/io/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
etree
[X] - ImportError: "lxml not found, please install or use the etree parser."
[X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
[X] - TypeError: "expected str, bytes or os.PathLike object, not type"
[X] - ValueError: "Either element or attributes can be parsed not both."
[X] - ValueError: "xpath does not return any nodes..."
[X] - SyntaxError: "You have used an incorrect or unsupported XPath"
Expand Down Expand Up @@ -237,6 +236,28 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
tm.assert_frame_equal(df_str, df_expected)


@td.skip_if_no("lxml")
def test_closed_file_lxml(datapath):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can probably parametrize these two tests.

In general, I'm not sure whether it is necessary to enforce generic error messages for "obviously" wrong inputs (None/closed file handles). @jreback

One test to add (or extending an existing test) is to make sure that a user-provided file handle is not closed by read/to_xml.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Understood. I can remove those wrong input tests. I tried simulating how users may behave (having answered many StackOverflow pandas questions from newbies!). Will parametrize and add file handle close tests.

xml = datapath("io", "data", "xml", "baby_names.xml")

with open(xml, "rb") as f:
f.read()

with pytest.raises(ValueError, match="I/O operation on closed file"):
read_xml(f, parser="lxml")


def test_closed_file_etree(datapath):
xml = datapath("io", "data", "xml", "baby_names.xml")

with open(xml, "rb") as f:
f.read()

with pytest.raises(ValueError, match="read of closed file"):
read_xml(f, parser="etree")


@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_lxml(val):
from lxml.etree import XMLSyntaxError
Expand Down Expand Up @@ -279,29 +300,18 @@ def test_wrong_file_path_etree():


@td.skip_if_no("lxml")
def test_none_path_buffer_lxml():
with pytest.raises(AttributeError, match=("__enter__")):
read_xml(None, parser="lxml")
def test_none_file_path_lxml():
xml_var = None
with pytest.raises(AttributeError, match="__enter__"):
read_xml(xml_var, parser="lxml")


def test_none_path_buffer_etree():
def test_none_file_path_etree():
xml_var = None
with pytest.raises(
TypeError, match=("expected str, bytes or os.PathLike object, not NoneType")
TypeError, match="expected str, bytes or os.PathLike object, not NoneType"
):
read_xml(None, parser="etree")


@td.skip_if_no("lxml")
def test_not_path_buffer_lxml():
with pytest.raises(AttributeError, match=("__enter__")):
read_xml(DataFrame, parser="lxml")


def test_not_path_buffer_etree():
with pytest.raises(
TypeError, match=("expected str, bytes or os.PathLike object, not type")
):
read_xml(DataFrame, parser="etree")
read_xml(xml_var, parser="etree")


@tm.network
Expand Down Expand Up @@ -921,9 +931,14 @@ def test_wrong_stylesheet():

@td.skip_if_no("lxml")
def test_stylesheet_not_path_buffer():
from lxml.etree import XMLSyntaxError

kml = os.path.join("data", "xml", "cta_rail_lines.kml")

with pytest.raises(AttributeError, match=("__enter__")):
with pytest.raises(
(AttributeError, XMLSyntaxError),
match=("__enter__|Start tag expected, '<' not found"),
):
read_xml(kml, stylesheet={"a": 1})


Expand All @@ -937,6 +952,14 @@ def test_stylesheet_with_etree(datapath):
read_xml(kml, parser="etree", stylesheet=xsl)


@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_stylesheet(val):
kml = os.path.join("data", "xml", "cta_rail_lines.kml")

read_xml(kml, parser="etree", stylesheet=val)


@tm.network
@td.skip_if_no("lxml")
def test_online_stylesheet():
Expand Down