pandas-dev · Jul 1, 2022
diff --git a/‎pandas/io/xml.py
Lines changed: 119 additions & 260 deletions b/‎pandas/io/xml.py
Lines changed: 119 additions & 260 deletions
@@ -7,6 +7,7 @@
 import io
 from typing import (
     Any,
+    Callable,
     Sequence,
 )
 
@@ -177,7 +178,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
 
         raise AbstractMethodError(self)
 
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
+    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
         """
         Parse xml nodes.
 
@@ -197,102 +198,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
         will have optional keys filled with None values.
         """
 
-        raise AbstractMethodError(self)
-
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        """
-        Iterparse xml nodes.
-
-        This method will read in local disk, decompressed XML files for elements
-        and underlying descendants using iterparse, a method to iterate through
-        an XML tree without holding entire XML tree in memory.
-
-        Raises
-        ------
-        TypeError
-            * If `iterparse` is not a dict or its dict value is not list-like.
-        ParserError
-            * If `path_or_buffer` is not a physical, decompressed file on disk.
-            * If no data is returned from selected items in `iterparse`.
-
-        Notes
-        -----
-        Namespace URIs will be removed from return node values. Also,
-        elements with missing children or attributes in submitted list
-        will have optional keys filled with None values.
-        """
-
-        raise AbstractMethodError(self)
-
-    def _validate_path(self) -> None:
-        """
-        Validate xpath.
-
-        This method checks for syntax, evaluation, or empty nodes return.
-
-        Raises
-        ------
-        SyntaxError
-            * If xpah is not supported or issues with namespaces.
-
-        ValueError
-            * If xpah does not return any nodes.
-        """
-
-        raise AbstractMethodError(self)
-
-    def _validate_names(self) -> None:
-        """
-        Validate names.
-
-        This method will check if names is a list-like and aligns
-        with length of parse nodes.
-
-        Raises
-        ------
-        ValueError
-            * If value is not a list and less then length of nodes.
-        """
-        raise AbstractMethodError(self)
-
-    def _parse_doc(self, raw_doc) -> bytes:
-        """
-        Build tree from path_or_buffer.
-
-        This method will parse XML object into tree
-        either from string/bytes or file location.
-        """
-        raise AbstractMethodError(self)
-
-
-class _EtreeFrameParser(_XMLFrameParser):
-    """
-    Internal class to parse XML into DataFrames with the Python
-    standard library XML module: `xml.etree.ElementTree`.
-    """
-
-    def parse_data(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import XML
-
-        if self.stylesheet is not None:
-            raise ValueError(
-                "To use stylesheet, you need lxml installed and selected as parser."
-            )
-
-        if self.iterparse is None:
-            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
-            self._validate_path()
-
-        self._validate_names()
-
-        xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
-        )
-
-        return xml_dicts
-
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
-        elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
         dicts: list[dict[str, str | None]]
 
         if self.elems_only and self.attrs_only:
@@ -375,8 +280,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import iterparse
+    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
+        """
+        Iterparse xml nodes.
+
+        This method will read in local disk, decompressed XML files for elements
+        and underlying descendants using iterparse, a method to iterate through
+        an XML tree without holding entire XML tree in memory.
+
+        Raises
+        ------
+        TypeError
+            * If `iterparse` is not a dict or its dict value is not list-like.
+        ParserError
+            * If `path_or_buffer` is not a physical, decompressed file on disk.
+            * If no data is returned from selected items in `iterparse`.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes in submitted list
+        will have optional keys filled with None values.
+        """
 
         dicts: list[dict[str, str | None]] = []
         row: dict[str, str | None] | None = None
@@ -433,7 +358,13 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
                 if curr_elem == row_node and row is not None:
                     dicts.append(row)
                     row = None
+
                 elem.clear()
+                if hasattr(elem, "getprevious"):
+                    while (
+                        elem.getprevious() is not None and elem.getparent() is not None
+                    ):
+                        del elem.getparent()[0]
 
         if dicts == []:
             raise ParserError("No result from selected items in iterparse.")
@@ -446,6 +377,81 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
 
         return dicts
 
+    def _validate_path(self) -> None:
+        """
+        Validate xpath.
+
+        This method checks for syntax, evaluation, or empty nodes return.
+
+        Raises
+        ------
+        SyntaxError
+            * If xpath is not supported or there are issues with namespaces.
+
+        ValueError
+            * If xpath does not return any nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_names(self) -> None:
+        """
+        Validate names.
+
+        This method will check if names is a list-like and aligns
+        with length of parse nodes.
+
+        Raises
+        ------
+        ValueError
+            * If value is not a list and less than length of nodes.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> bytes:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
+        raise AbstractMethodError(self)
+
+
+class _EtreeFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into DataFrames with the Python
+    standard library XML module: `xml.etree.ElementTree`.
+    """
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import (
+            XML,
+            iterparse,
+        )
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+            )
+
+        if self.iterparse is None:
+            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+            self._validate_path()
+            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+
+        self._validate_names()
+
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
+        )
+
+        return xml_dicts
+
     def _validate_path(self) -> None:
         """
         Notes
@@ -495,7 +501,9 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self, raw_doc) -> bytes:
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> bytes:
         from xml.etree.ElementTree import (
             XMLParser,
             parse,
@@ -531,7 +539,10 @@ def parse_data(self) -> list[dict[str, str | None]]:
         validate xpath, names, optionally parse and run XSLT,
         and parse original or transformed XML and return specific nodes.
         """
-        from lxml.etree import XML
+        from lxml.etree import (
+            XML,
+            iterparse,
+        )
 
         if self.iterparse is None:
             self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
@@ -541,172 +552,18 @@ def parse_data(self) -> list[dict[str, str | None]]:
                 self.xml_doc = XML(self._transform_doc())
 
             self._validate_path()
+            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
 
         self._validate_names()
 
         xml_dicts: list[dict[str, str | None]] = (
-            self._parse_nodes() if self.iterparse is None else self._iterparse_nodes()
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
         )
 
         return xml_dicts
 
-    def _parse_nodes(self) -> list[dict[str, str | None]]:
-        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
-        dicts: list[dict[str, str | None]]
-
-        if self.elems_only and self.attrs_only:
-            raise ValueError("Either element or attributes can be parsed not both.")
-
-        elif self.elems_only:
-            if self.names:
-                dicts = [
-                    {
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            nm: ch.text.strip() if ch.text else None
-                            for nm, ch in zip(self.names, el.xpath("*"))
-                        },
-                    }
-                    for el in elems
-                ]
-            else:
-                dicts = [
-                    {
-                        ch.tag: ch.text.strip() if ch.text else None
-                        for ch in el.xpath("*")
-                    }
-                    for el in elems
-                ]
-
-        elif self.attrs_only:
-            dicts = [el.attrib for el in elems]
-
-        else:
-            if self.names:
-                dicts = [
-                    {
-                        **el.attrib,
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            nm: ch.text.strip() if ch.text else None
-                            for nm, ch in zip(self.names, el.xpath("*"))
-                        },
-                    }
-                    for el in elems
-                ]
-            else:
-                dicts = [
-                    {
-                        **el.attrib,
-                        **(
-                            {el.tag: el.text.strip()}
-                            if el.text and not el.text.isspace()
-                            else {}
-                        ),
-                        **{
-                            ch.tag: ch.text.strip() if ch.text else None
-                            for ch in el.xpath("*")
-                        },
-                    }
-                    for el in elems
-                ]
-
-        if self.namespaces or "}" in list(dicts[0].keys())[0]:
-            dicts = [
-                {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
-                for d in dicts
-            ]
-
-        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
-        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
-        if self.names:
-            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
-
-        return dicts
-
-    def _iterparse_nodes(self) -> list[dict[str, str | None]]:
-        from lxml.etree import iterparse
-
-        dicts: list[dict[str, str | None]] = []
-        row: dict[str, str | None] | None = None
-
-        if not isinstance(self.iterparse, dict):
-            raise TypeError(
-                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
-            )
-
-        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
-        if not is_list_like(self.iterparse[row_node]):
-            raise TypeError(
-                f"{type(self.iterparse[row_node])} is not a valid type "
-                "for value in iterparse"
-            )
-
-        if (
-            not isinstance(self.path_or_buffer, str)
-            or is_url(self.path_or_buffer)
-            or is_fsspec_url(self.path_or_buffer)
-            or self.path_or_buffer.startswith(("<?xml", "<"))
-            or infer_compression(self.path_or_buffer, "infer") is not None
-        ):
-            raise ParserError(
-                "iterparse is designed for large XML files that are fully extracted on "
-                "local disk and not as compressed files or online sources."
-            )
-
-        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
-            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
-
-            if event == "start":
-                if curr_elem == row_node:
-                    row = {}
-
-            if row is not None:
-                if self.names:
-                    for col, nm in zip(self.iterparse[row_node], self.names):
-                        if curr_elem == col:
-                            elem_val = elem.text.strip() if elem.text else None
-                            if elem_val not in row.values() and nm not in row:
-                                row[nm] = elem_val
-                        if col in elem.attrib:
-                            if elem.attrib[col] not in row.values() and nm not in row:
-                                row[nm] = elem.attrib[col]
-                else:
-                    for col in self.iterparse[row_node]:
-                        if curr_elem == col:
-                            row[col] = elem.text.strip() if elem.text else None
-                        if col in elem.attrib:
-                            row[col] = elem.attrib[col]
-
-            if event == "end":
-                if curr_elem == row_node and row is not None:
-                    dicts.append(row)
-                    row = None
-
-                elem.clear()
-                while elem.getprevious() is not None and elem.getparent() is not None:
-                    del elem.getparent()[0]
-
-        if dicts == []:
-            raise ParserError("No result from selected items in iterparse.")
-
-        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
-        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
-        if self.names:
-            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]
-
-        return dicts
-
     def _validate_path(self) -> None:
 
         msg = (
@@ -748,7 +605,9 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self, raw_doc) -> bytes:
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> bytes:
         from lxml.etree import (
             XMLParser,
             fromstring,