Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit ebc96ae

Browse files
authored Jul 1, 2022
REF: Reduce duplicative methods between XML parser classes (#47553)
* REF: Reduce duplicative methods between XML parser classes * Add typing to base class methods
1 parent 231b9fa commit ebc96ae

File tree

1 file changed

+119
-260
lines changed

1 file changed

+119
-260
lines changed
 

‎pandas/io/xml.py

Lines changed: 119 additions & 260 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import io
88
from typing import (
99
Any,
10+
Callable,
1011
Sequence,
1112
)
1213

@@ -177,7 +178,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
177178

178179
raise AbstractMethodError(self)
179180

180-
def _parse_nodes(self) -> list[dict[str, str | None]]:
181+
def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
181182
"""
182183
Parse xml nodes.
183184
@@ -197,102 +198,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
197198
will have optional keys filled with None values.
198199
"""
199200

200-
raise AbstractMethodError(self)
201-
202-
def _iterparse_nodes(self) -> list[dict[str, str | None]]:
203-
"""
204-
Iterparse xml nodes.
205-
206-
This method will read in local disk, decompressed XML files for elements
207-
and underlying descendants using iterparse, a method to iterate through
208-
an XML tree without holding entire XML tree in memory.
209-
210-
Raises
211-
------
212-
TypeError
213-
* If `iterparse` is not a dict or its dict value is not list-like.
214-
ParserError
215-
* If `path_or_buffer` is not a physical, decompressed file on disk.
216-
* If no data is returned from selected items in `iterparse`.
217-
218-
Notes
219-
-----
220-
Namespace URIs will be removed from return node values. Also,
221-
elements with missing children or attributes in submitted list
222-
will have optional keys filled with None values.
223-
"""
224-
225-
raise AbstractMethodError(self)
226-
227-
def _validate_path(self) -> None:
228-
"""
229-
Validate xpath.
230-
231-
This method checks for syntax, evaluation, or empty nodes return.
232-
233-
Raises
234-
------
235-
SyntaxError
236-
* If xpah is not supported or issues with namespaces.
237-
238-
ValueError
239-
* If xpah does not return any nodes.
240-
"""
241-
242-
raise AbstractMethodError(self)
243-
244-
def _validate_names(self) -> None:
245-
"""
246-
Validate names.
247-
248-
This method will check if names is a list-like and aligns
249-
with length of parse nodes.
250-
251-
Raises
252-
------
253-
ValueError
254-
* If value is not a list and less then length of nodes.
255-
"""
256-
raise AbstractMethodError(self)
257-
258-
def _parse_doc(self, raw_doc) -> bytes:
259-
"""
260-
Build tree from path_or_buffer.
261-
262-
This method will parse XML object into tree
263-
either from string/bytes or file location.
264-
"""
265-
raise AbstractMethodError(self)
266-
267-
268-
class _EtreeFrameParser(_XMLFrameParser):
269-
"""
270-
Internal class to parse XML into DataFrames with the Python
271-
standard library XML module: `xml.etree.ElementTree`.
272-
"""
273-
274-
def parse_data(self) -> list[dict[str, str | None]]:
275-
from xml.etree.ElementTree import XML
276-
277-
if self.stylesheet is not None:
278-
raise ValueError(
279-
"To use stylesheet, you need lxml installed and selected as parser."
280-
)
281-
282-
if self.iterparse is None:
283-
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
284-
self._validate_path()
285-
286-
self._validate_names()
287-
288-
xml_dicts: list[dict[str, str | None]] = (
289-
self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
290-
)
291-
292-
return xml_dicts
293-
294-
def _parse_nodes(self) -> list[dict[str, str | None]]:
295-
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
296201
dicts: list[dict[str, str | None]]
297202

298203
if self.elems_only and self.attrs_only:
@@ -375,8 +280,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
375280

376281
return dicts
377282

378-
def _iterparse_nodes(self) -> list[dict[str, str | None]]:
379-
from xml.etree.ElementTree import iterparse
283+
def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
284+
"""
285+
Iterparse xml nodes.
286+
287+
This method will read in local disk, decompressed XML files for elements
288+
and underlying descendants using iterparse, a method to iterate through
289+
an XML tree without holding entire XML tree in memory.
290+
291+
Raises
292+
------
293+
TypeError
294+
* If `iterparse` is not a dict or its dict value is not list-like.
295+
ParserError
296+
* If `path_or_buffer` is not a physical, decompressed file on disk.
297+
* If no data is returned from selected items in `iterparse`.
298+
299+
Notes
300+
-----
301+
Namespace URIs will be removed from return node values. Also,
302+
elements with missing children or attributes in submitted list
303+
will have optional keys filled with None values.
304+
"""
380305

381306
dicts: list[dict[str, str | None]] = []
382307
row: dict[str, str | None] | None = None
@@ -433,7 +358,13 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
433358
if curr_elem == row_node and row is not None:
434359
dicts.append(row)
435360
row = None
361+
436362
elem.clear()
363+
if hasattr(elem, "getprevious"):
364+
while (
365+
elem.getprevious() is not None and elem.getparent() is not None
366+
):
367+
del elem.getparent()[0]
437368

438369
if dicts == []:
439370
raise ParserError("No result from selected items in iterparse.")
@@ -446,6 +377,81 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
446377

447378
return dicts
448379

380+
def _validate_path(self) -> None:
381+
"""
382+
Validate xpath.
383+
384+
This method checks for syntax, evaluation, or empty nodes return.
385+
386+
Raises
387+
------
388+
SyntaxError
389+
* If xpath is not supported or issues with namespaces.
390+
391+
ValueError
392+
* If xpath does not return any nodes.
393+
"""
394+
395+
raise AbstractMethodError(self)
396+
397+
def _validate_names(self) -> None:
398+
"""
399+
Validate names.
400+
401+
This method will check if names is a list-like and aligns
402+
with length of parse nodes.
403+
404+
Raises
405+
------
406+
ValueError
407+
* If value is not a list and less than length of nodes.
408+
"""
409+
raise AbstractMethodError(self)
410+
411+
def _parse_doc(
412+
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
413+
) -> bytes:
414+
"""
415+
Build tree from path_or_buffer.
416+
417+
This method will parse XML object into tree
418+
either from string/bytes or file location.
419+
"""
420+
raise AbstractMethodError(self)
421+
422+
423+
class _EtreeFrameParser(_XMLFrameParser):
424+
"""
425+
Internal class to parse XML into DataFrames with the Python
426+
standard library XML module: `xml.etree.ElementTree`.
427+
"""
428+
429+
def parse_data(self) -> list[dict[str, str | None]]:
430+
from xml.etree.ElementTree import (
431+
XML,
432+
iterparse,
433+
)
434+
435+
if self.stylesheet is not None:
436+
raise ValueError(
437+
"To use stylesheet, you need lxml installed and selected as parser."
438+
)
439+
440+
if self.iterparse is None:
441+
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
442+
self._validate_path()
443+
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
444+
445+
self._validate_names()
446+
447+
xml_dicts: list[dict[str, str | None]] = (
448+
self._parse_nodes(elems)
449+
if self.iterparse is None
450+
else self._iterparse_nodes(iterparse)
451+
)
452+
453+
return xml_dicts
454+
449455
def _validate_path(self) -> None:
450456
"""
451457
Notes
@@ -495,7 +501,9 @@ def _validate_names(self) -> None:
495501
f"{type(self.names).__name__} is not a valid type for names"
496502
)
497503

498-
def _parse_doc(self, raw_doc) -> bytes:
504+
def _parse_doc(
505+
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
506+
) -> bytes:
499507
from xml.etree.ElementTree import (
500508
XMLParser,
501509
parse,
@@ -531,7 +539,10 @@ def parse_data(self) -> list[dict[str, str | None]]:
531539
validate xpath, names, optionally parse and run XSLT,
532540
and parse original or transformed XML and return specific nodes.
533541
"""
534-
from lxml.etree import XML
542+
from lxml.etree import (
543+
XML,
544+
iterparse,
545+
)
535546

536547
if self.iterparse is None:
537548
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
@@ -541,172 +552,18 @@ def parse_data(self) -> list[dict[str, str | None]]:
541552
self.xml_doc = XML(self._transform_doc())
542553

543554
self._validate_path()
555+
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
544556

545557
self._validate_names()
546558

547559
xml_dicts: list[dict[str, str | None]] = (
548-
self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
560+
self._parse_nodes(elems)
561+
if self.iterparse is None
562+
else self._iterparse_nodes(iterparse)
549563
)
550564

551565
return xml_dicts
552566

553-
def _parse_nodes(self) -> list[dict[str, str | None]]:
554-
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
555-
dicts: list[dict[str, str | None]]
556-
557-
if self.elems_only and self.attrs_only:
558-
raise ValueError("Either element or attributes can be parsed not both.")
559-
560-
elif self.elems_only:
561-
if self.names:
562-
dicts = [
563-
{
564-
**(
565-
{el.tag: el.text.strip()}
566-
if el.text and not el.text.isspace()
567-
else {}
568-
),
569-
**{
570-
nm: ch.text.strip() if ch.text else None
571-
for nm, ch in zip(self.names, el.xpath("*"))
572-
},
573-
}
574-
for el in elems
575-
]
576-
else:
577-
dicts = [
578-
{
579-
ch.tag: ch.text.strip() if ch.text else None
580-
for ch in el.xpath("*")
581-
}
582-
for el in elems
583-
]
584-
585-
elif self.attrs_only:
586-
dicts = [el.attrib for el in elems]
587-
588-
else:
589-
if self.names:
590-
dicts = [
591-
{
592-
**el.attrib,
593-
**(
594-
{el.tag: el.text.strip()}
595-
if el.text and not el.text.isspace()
596-
else {}
597-
),
598-
**{
599-
nm: ch.text.strip() if ch.text else None
600-
for nm, ch in zip(self.names, el.xpath("*"))
601-
},
602-
}
603-
for el in elems
604-
]
605-
else:
606-
dicts = [
607-
{
608-
**el.attrib,
609-
**(
610-
{el.tag: el.text.strip()}
611-
if el.text and not el.text.isspace()
612-
else {}
613-
),
614-
**{
615-
ch.tag: ch.text.strip() if ch.text else None
616-
for ch in el.xpath("*")
617-
},
618-
}
619-
for el in elems
620-
]
621-
622-
if self.namespaces or "}" in list(dicts[0].keys())[0]:
623-
dicts = [
624-
{k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
625-
for d in dicts
626-
]
627-
628-
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
629-
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
630-
631-
if self.names:
632-
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
633-
634-
return dicts
635-
636-
def _iterparse_nodes(self) -> list[dict[str, str | None]]:
637-
from lxml.etree import iterparse
638-
639-
dicts: list[dict[str, str | None]] = []
640-
row: dict[str, str | None] | None = None
641-
642-
if not isinstance(self.iterparse, dict):
643-
raise TypeError(
644-
f"{type(self.iterparse).__name__} is not a valid type for iterparse"
645-
)
646-
647-
row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
648-
if not is_list_like(self.iterparse[row_node]):
649-
raise TypeError(
650-
f"{type(self.iterparse[row_node])} is not a valid type "
651-
"for value in iterparse"
652-
)
653-
654-
if (
655-
not isinstance(self.path_or_buffer, str)
656-
or is_url(self.path_or_buffer)
657-
or is_fsspec_url(self.path_or_buffer)
658-
or self.path_or_buffer.startswith(("<?xml", "<"))
659-
or infer_compression(self.path_or_buffer, "infer") is not None
660-
):
661-
raise ParserError(
662-
"iterparse is designed for large XML files that are fully extracted on "
663-
"local disk and not as compressed files or online sources."
664-
)
665-
666-
for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
667-
curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
668-
669-
if event == "start":
670-
if curr_elem == row_node:
671-
row = {}
672-
673-
if row is not None:
674-
if self.names:
675-
for col, nm in zip(self.iterparse[row_node], self.names):
676-
if curr_elem == col:
677-
elem_val = elem.text.strip() if elem.text else None
678-
if elem_val not in row.values() and nm not in row:
679-
row[nm] = elem_val
680-
if col in elem.attrib:
681-
if elem.attrib[col] not in row.values() and nm not in row:
682-
row[nm] = elem.attrib[col]
683-
else:
684-
for col in self.iterparse[row_node]:
685-
if curr_elem == col:
686-
row[col] = elem.text.strip() if elem.text else None
687-
if col in elem.attrib:
688-
row[col] = elem.attrib[col]
689-
690-
if event == "end":
691-
if curr_elem == row_node and row is not None:
692-
dicts.append(row)
693-
row = None
694-
695-
elem.clear()
696-
while elem.getprevious() is not None and elem.getparent() is not None:
697-
del elem.getparent()[0]
698-
699-
if dicts == []:
700-
raise ParserError("No result from selected items in iterparse.")
701-
702-
keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
703-
dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
704-
705-
if self.names:
706-
dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
707-
708-
return dicts
709-
710567
def _validate_path(self) -> None:
711568

712569
msg = (
@@ -748,7 +605,9 @@ def _validate_names(self) -> None:
748605
f"{type(self.names).__name__} is not a valid type for names"
749606
)
750607

751-
def _parse_doc(self, raw_doc) -> bytes:
608+
def _parse_doc(
609+
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
610+
) -> bytes:
752611
from lxml.etree import (
753612
XMLParser,
754613
fromstring,

0 commit comments

Comments
 (0)
Please sign in to comment.