diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 83cc01752682f..95fac0c739895 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -7,6 +7,7 @@ import io from typing import ( Any, + Callable, Sequence, ) @@ -177,7 +178,7 @@ def parse_data(self) -> list[dict[str, str | None]]: raise AbstractMethodError(self) - def _parse_nodes(self) -> list[dict[str, str | None]]: + def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]: """ Parse xml nodes. @@ -197,102 +198,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: will have optional keys filled with None values. """ - raise AbstractMethodError(self) - - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - """ - Iterparse xml nodes. - - This method will read in local disk, decompressed XML files for elements - and underlying descendants using iterparse, a method to iterate through - an XML tree without holding entire XML tree in memory. - - Raises - ------ - TypeError - * If `iterparse` is not a dict or its dict value is not list-like. - ParserError - * If `path_or_buffer` is not a physical, decompressed file on disk. - * If no data is returned from selected items in `iterparse`. - - Notes - ----- - Namespace URIs will be removed from return node values. Also, - elements with missing children or attributes in submitted list - will have optional keys filled with None values. - """ - - raise AbstractMethodError(self) - - def _validate_path(self) -> None: - """ - Validate xpath. - - This method checks for syntax, evaluation, or empty nodes return. - - Raises - ------ - SyntaxError - * If xpah is not supported or issues with namespaces. - - ValueError - * If xpah does not return any nodes. - """ - - raise AbstractMethodError(self) - - def _validate_names(self) -> None: - """ - Validate names. - - This method will check if names is a list-like and aligns - with length of parse nodes. - - Raises - ------ - ValueError - * If value is not a list and less then length of nodes. 
- """ - raise AbstractMethodError(self) - - def _parse_doc(self, raw_doc) -> bytes: - """ - Build tree from path_or_buffer. - - This method will parse XML object into tree - either from string/bytes or file location. - """ - raise AbstractMethodError(self) - - -class _EtreeFrameParser(_XMLFrameParser): - """ - Internal class to parse XML into DataFrames with the Python - standard library XML module: `xml.etree.ElementTree`. - """ - - def parse_data(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import XML - - if self.stylesheet is not None: - raise ValueError( - "To use stylesheet, you need lxml installed and selected as parser." - ) - - if self.iterparse is None: - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) - self._validate_path() - - self._validate_names() - - xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() - ) - - return xml_dicts - - def _parse_nodes(self) -> list[dict[str, str | None]]: - elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) dicts: list[dict[str, str | None]] if self.elems_only and self.attrs_only: @@ -375,8 +280,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]: return dicts - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import iterparse + def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]: + """ + Iterparse xml nodes. + + This method will read in local disk, decompressed XML files for elements + and underlying descendants using iterparse, a method to iterate through + an XML tree without holding entire XML tree in memory. + + Raises + ------ + TypeError + * If `iterparse` is not a dict or its dict value is not list-like. + ParserError + * If `path_or_buffer` is not a physical, decompressed file on disk. + * If no data is returned from selected items in `iterparse`. + + Notes + ----- + Namespace URIs will be removed from return node values. 
Also, + elements with missing children or attributes in submitted list + will have optional keys filled with None values. + """ dicts: list[dict[str, str | None]] = [] row: dict[str, str | None] | None = None @@ -433,7 +358,13 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: if curr_elem == row_node and row is not None: dicts.append(row) row = None + elem.clear() + if hasattr(elem, "getprevious"): + while ( + elem.getprevious() is not None and elem.getparent() is not None + ): + del elem.getparent()[0] if dicts == []: raise ParserError("No result from selected items in iterparse.") @@ -446,6 +377,81 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]: return dicts + def _validate_path(self) -> None: + """ + Validate xpath. + + This method checks for syntax, evaluation, or empty nodes return. + + Raises + ------ + SyntaxError + * If xpath is not supported or issues with namespaces. + + ValueError + * If xpath does not return any nodes. + """ + + raise AbstractMethodError(self) + + def _validate_names(self) -> None: + """ + Validate names. + + This method will check if names is a list-like and aligns + with length of parse nodes. + + Raises + ------ + ValueError + * If value is not a list and less than length of nodes. + """ + raise AbstractMethodError(self) + + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> bytes: + """ + Build tree from path_or_buffer. + + This method will parse XML object into tree + either from string/bytes or file location. + """ + raise AbstractMethodError(self) + + +class _EtreeFrameParser(_XMLFrameParser): + """ + Internal class to parse XML into DataFrames with the Python + standard library XML module: `xml.etree.ElementTree`. + """ + + def parse_data(self) -> list[dict[str, str | None]]: + from xml.etree.ElementTree import ( + XML, + iterparse, + ) + + if self.stylesheet is not None: + raise ValueError( + "To use stylesheet, you need lxml installed and selected as parser." 
+ ) + + if self.iterparse is None: + self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self._validate_path() + elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces) + + self._validate_names() + + xml_dicts: list[dict[str, str | None]] = ( + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) + ) + + return xml_dicts + def _validate_path(self) -> None: """ Notes @@ -495,7 +501,9 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self, raw_doc) -> bytes: + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> bytes: from xml.etree.ElementTree import ( XMLParser, parse, @@ -531,7 +539,10 @@ def parse_data(self) -> list[dict[str, str | None]]: validate xpath, names, optionally parse and run XSLT, and parse original or transformed XML and return specific nodes. """ - from lxml.etree import XML + from lxml.etree import ( + XML, + iterparse, + ) if self.iterparse is None: self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) @@ -541,172 +552,18 @@ def parse_data(self) -> list[dict[str, str | None]]: self.xml_doc = XML(self._transform_doc()) self._validate_path() + elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) self._validate_names() xml_dicts: list[dict[str, str | None]] = ( - self._parse_nodes() if self.iterparse is None else self._iterparse_nodes() + self._parse_nodes(elems) + if self.iterparse is None + else self._iterparse_nodes(iterparse) ) return xml_dicts - def _parse_nodes(self) -> list[dict[str, str | None]]: - elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces) - dicts: list[dict[str, str | None]] - - if self.elems_only and self.attrs_only: - raise ValueError("Either element or attributes can be parsed not both.") - - elif self.elems_only: - if self.names: - dicts = [ - { - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - 
**{ - nm: ch.text.strip() if ch.text else None - for nm, ch in zip(self.names, el.xpath("*")) - }, - } - for el in elems - ] - else: - dicts = [ - { - ch.tag: ch.text.strip() if ch.text else None - for ch in el.xpath("*") - } - for el in elems - ] - - elif self.attrs_only: - dicts = [el.attrib for el in elems] - - else: - if self.names: - dicts = [ - { - **el.attrib, - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - **{ - nm: ch.text.strip() if ch.text else None - for nm, ch in zip(self.names, el.xpath("*")) - }, - } - for el in elems - ] - else: - dicts = [ - { - **el.attrib, - **( - {el.tag: el.text.strip()} - if el.text and not el.text.isspace() - else {} - ), - **{ - ch.tag: ch.text.strip() if ch.text else None - for ch in el.xpath("*") - }, - } - for el in elems - ] - - if self.namespaces or "}" in list(dicts[0].keys())[0]: - dicts = [ - {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} - for d in dicts - ] - - keys = list(dict.fromkeys([k for d in dicts for k in d.keys()])) - dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts] - - if self.names: - dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts] - - return dicts - - def _iterparse_nodes(self) -> list[dict[str, str | None]]: - from lxml.etree import iterparse - - dicts: list[dict[str, str | None]] = [] - row: dict[str, str | None] | None = None - - if not isinstance(self.iterparse, dict): - raise TypeError( - f"{type(self.iterparse).__name__} is not a valid type for iterparse" - ) - - row_node = next(iter(self.iterparse.keys())) if self.iterparse else "" - if not is_list_like(self.iterparse[row_node]): - raise TypeError( - f"{type(self.iterparse[row_node])} is not a valid type " - "for value in iterparse" - ) - - if ( - not isinstance(self.path_or_buffer, str) - or is_url(self.path_or_buffer) - or is_fsspec_url(self.path_or_buffer) - or self.path_or_buffer.startswith((" None: msg = ( @@ -748,7 +605,9 @@ def 
_validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self, raw_doc) -> bytes: + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> bytes: from lxml.etree import ( XMLParser, fromstring,