Skip to content

Commit d1d9b7f

Browse files
authored
CLN/DOC: Adjust xpath validation and error messaging in read_xml with IO tools doc note and example (#48386)
* CLN/DOC: Adjust xpath validation and error messaging in read_xml with IO tools doc note and example * Fix xpath validation for child elements with added test * Adjust IO tools docs for backticks on XML names
1 parent da6cae7 commit d1d9b7f

File tree

3 files changed

+110
-17
lines changed

3 files changed

+110
-17
lines changed

doc/source/user_guide/io.rst

+36
Original file line numberDiff line numberDiff line change
@@ -3174,6 +3174,42 @@ But assigning *any* temporary name to correct URI allows parsing by nodes.
31743174
However, if XPath does not reference node names such as default, ``/*``, then
31753175
``namespaces`` is not required.
31763176

3177+
.. note::
3178+
3179+
Since ``xpath`` identifies the parent of content to be parsed, only immediate
3180+
descendants which include child nodes or current attributes are parsed.
3181+
Therefore, ``read_xml`` will not parse the text of grandchildren or other
3182+
descendants and will not parse attributes of any descendant. To retrieve
3183+
lower level content, adjust xpath to lower level. For example,
3184+
3185+
.. ipython:: python
3186+
:okwarning:
3187+
3188+
xml = """
3189+
<data>
3190+
<row>
3191+
<shape sides="4">square</shape>
3192+
<degrees>360</degrees>
3193+
</row>
3194+
<row>
3195+
<shape sides="0">circle</shape>
3196+
<degrees>360</degrees>
3197+
</row>
3198+
<row>
3199+
<shape sides="3">triangle</shape>
3200+
<degrees>180</degrees>
3201+
</row>
3202+
</data>"""
3203+
3204+
df = pd.read_xml(xml, xpath="./row")
3205+
df
3206+
3207+
shows the attribute ``sides`` on ``shape`` element was not parsed, which is
3208+
expected since this attribute resides on the child of ``row`` element
3209+
and not ``row`` element itself. In other words, ``sides`` attribute is a
3210+
grandchild level descendant of ``row`` element. However, the ``xpath``
3211+
targets ``row`` element which covers only its children and attributes.
3212+
31773213
With `lxml`_ as parser, you can flatten nested XML documents with an XSLT
31783214
script which also can be string/file/URL types. As background, `XSLT`_ is
31793215
a special-purpose language written in a special XML file that can transform

pandas/io/xml.py

+35-17
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,7 @@ def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
387387

388388
return dicts
389389

390-
def _validate_path(self) -> None:
390+
def _validate_path(self) -> list[Any]:
391391
"""
392392
Validate xpath.
393393
@@ -446,8 +446,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
446446

447447
if self.iterparse is None:
448448
self.xml_doc = self._parse_doc(self.path_or_buffer)
449-
self._validate_path()
450-
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
449+
elems = self._validate_path()
451450

452451
self._validate_names()
453452

@@ -459,7 +458,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
459458

460459
return xml_dicts
461460

462-
def _validate_path(self) -> None:
461+
def _validate_path(self) -> list[Any]:
463462
"""
464463
Notes
465464
-----
@@ -468,18 +467,28 @@ def _validate_path(self) -> None:
468467
"""
469468

470469
msg = (
471-
"xpath does not return any nodes. "
470+
"xpath does not return any nodes or attributes. "
471+
"Be sure to specify in `xpath` the parent nodes of "
472+
"children and attributes to parse. "
472473
"If document uses namespaces denoted with "
473474
"xmlns, be sure to define namespaces and "
474475
"use them in xpath."
475476
)
476477
try:
477-
elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
478+
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
479+
children = [ch for el in elems for ch in el.findall("*")]
480+
attrs = {k: v for el in elems for k, v in el.attrib.items()}
481+
478482
if elems is None:
479483
raise ValueError(msg)
480484

481-
if elems is not None and elems.find("*") is None and elems.attrib is None:
482-
raise ValueError(msg)
485+
if elems is not None:
486+
if self.elems_only and children == []:
487+
raise ValueError(msg)
488+
elif self.attrs_only and attrs == {}:
489+
raise ValueError(msg)
490+
elif children == [] and attrs == {}:
491+
raise ValueError(msg)
483492

484493
except (KeyError, SyntaxError):
485494
raise SyntaxError(
@@ -488,6 +497,8 @@ def _validate_path(self) -> None:
488497
"undeclared namespace prefix."
489498
)
490499

500+
return elems
501+
491502
def _validate_names(self) -> None:
492503
children: list[Any]
493504

@@ -554,8 +565,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
554565
self.xsl_doc = self._parse_doc(self.stylesheet)
555566
self.xml_doc = self._transform_doc()
556567

557-
self._validate_path()
558-
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
568+
elems = self._validate_path()
559569

560570
self._validate_names()
561571

@@ -567,25 +577,33 @@ def parse_data(self) -> list[dict[str, str | None]]:
567577

568578
return xml_dicts
569579

570-
def _validate_path(self) -> None:
580+
def _validate_path(self) -> list[Any]:
571581

572582
msg = (
573-
"xpath does not return any nodes. "
574-
"Be sure row level nodes are in xpath. "
583+
"xpath does not return any nodes or attributes. "
584+
"Be sure to specify in `xpath` the parent nodes of "
585+
"children and attributes to parse. "
575586
"If document uses namespaces denoted with "
576587
"xmlns, be sure to define namespaces and "
577588
"use them in xpath."
578589
)
579590

580591
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
581-
children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
582-
attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)
592+
children = [ch for el in elems for ch in el.xpath("*")]
593+
attrs = {k: v for el in elems for k, v in el.attrib.items()}
583594

584595
if elems == []:
585596
raise ValueError(msg)
586597

587-
if elems != [] and attrs == [] and children == []:
588-
raise ValueError(msg)
598+
if elems != []:
599+
if self.elems_only and children == []:
600+
raise ValueError(msg)
601+
elif self.attrs_only and attrs == {}:
602+
raise ValueError(msg)
603+
elif children == [] and attrs == {}:
604+
raise ValueError(msg)
605+
606+
return elems
589607

590608
def _validate_names(self) -> None:
591609
children: list[Any]

pandas/tests/io/xml/test_xml.py

+39
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,45 @@ def test_elem_and_attrs_only(datapath, parser):
760760
read_xml(filename, elems_only=True, attrs_only=True, parser=parser)
761761

762762

763+
def test_empty_attrs_only(parser):
764+
xml = """
765+
<data>
766+
<row>
767+
<shape sides="4">square</shape>
768+
<degrees>360</degrees>
769+
</row>
770+
<row>
771+
<shape sides="0">circle</shape>
772+
<degrees>360</degrees>
773+
</row>
774+
<row>
775+
<shape sides="3">triangle</shape>
776+
<degrees>180</degrees>
777+
</row>
778+
</data>"""
779+
780+
with pytest.raises(
781+
ValueError,
782+
match=("xpath does not return any nodes or attributes"),
783+
):
784+
read_xml(xml, xpath="./row", attrs_only=True, parser=parser)
785+
786+
787+
def test_empty_elems_only(parser):
788+
xml = """
789+
<data>
790+
<row sides="4" shape="square" degrees="360"/>
791+
<row sides="0" shape="circle" degrees="360"/>
792+
<row sides="3" shape="triangle" degrees="180"/>
793+
</data>"""
794+
795+
with pytest.raises(
796+
ValueError,
797+
match=("xpath does not return any nodes or attributes"),
798+
):
799+
read_xml(xml, xpath="./row", elems_only=True, parser=parser)
800+
801+
763802
@td.skip_if_no("lxml")
764803
def test_attribute_centric_xml():
765804
xml = """\

0 commit comments

Comments
 (0)