Skip to content

TYP: Add typing for remaining IO XML methods with conditional for lxml #40340

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 16, 2021
Merged
4 changes: 2 additions & 2 deletions pandas/io/formats/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ class EtreeXMLFormatter(BaseXMLFormatter):
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
"""

def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

self.validate_columns()
Expand Down Expand Up @@ -452,7 +452,7 @@ class LxmlXMLFormatter(BaseXMLFormatter):
modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
"""

def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

self.validate_columns()
Expand Down
110 changes: 61 additions & 49 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
)
from pandas.io.parsers import TextParser

lxml = import_optional_dependency("lxml.etree", errors="ignore")


class _XMLFrameParser:
"""
Expand Down Expand Up @@ -90,7 +92,6 @@ class _XMLFrameParser:
To subclass this class effectively you must override the following methods:
* :func:`parse_data`
* :func:`_parse_nodes`
* :func:`_parse_doc`
* :func:`_validate_names`
* :func:`_validate_path`

Expand All @@ -111,7 +112,7 @@ def __init__(
stylesheet,
compression,
storage_options,
):
) -> None:
self.path_or_buffer = path_or_buffer
self.xpath = xpath
self.namespaces = namespaces
Expand Down Expand Up @@ -187,16 +188,6 @@ def _validate_names(self) -> None:
"""
raise AbstractMethodError(self)

def _parse_doc(self):
"""
Build tree from io.

This method will parse io object into tree for parsing
conditionally by its specific object type.
"""

raise AbstractMethodError(self)


class _EtreeFrameParser(_XMLFrameParser):
"""
Expand All @@ -209,7 +200,7 @@ class _EtreeFrameParser(_XMLFrameParser):
ElementTree,
)

def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

def parse_data(self) -> List[Dict[str, Optional[str]]]:
Expand Down Expand Up @@ -357,6 +348,12 @@ def _validate_names(self) -> None:
)

def _parse_doc(self) -> Union[Element, ElementTree]:
"""
Build tree from path_or_buffer.

This method will parse XML object into tree
either from string/bytes or file location.
"""
from xml.etree.ElementTree import (
XMLParser,
parse,
Expand All @@ -383,7 +380,7 @@ class _LxmlFrameParser(_XMLFrameParser):
XPath 1.0 and XSLT 1.0.
"""

def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

def parse_data(self) -> List[Dict[str, Optional[str]]]:
Expand Down Expand Up @@ -491,21 +488,6 @@ def _parse_nodes(self) -> List[Dict[str, Optional[str]]]:

return dicts

def _transform_doc(self):
"""
Transform original tree using stylesheet.

This method will transform the original XML using an XSLT script into
an ideally flatter XML document for easier parsing and migration
to a DataFrame.
"""
from lxml.etree import XSLT

transformer = XSLT(self.xsl_doc)
new_doc = transformer(self.xml_doc)

return new_doc

def _validate_path(self) -> None:

msg = (
Expand Down Expand Up @@ -553,31 +535,62 @@ def _validate_names(self) -> None:
f"{type(self.names).__name__} is not a valid type for names"
)

def _parse_doc(self, raw_doc):
if lxml is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm i find this odd that this is not in a function

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I modify this entirely which did not pass ci/code_checks

from lxml.etree import (
XMLParser,
fromstring,
parse,
Element,
ElementTree,
)

handle_data = get_data_from_filepath(
filepath_or_buffer=raw_doc,
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
)
def _parse_doc(self, raw_doc) -> Union[Element, ElementTree]:
"""
Build tree from path_or_buffer.

with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
This method will parse XML object into tree
either from string/bytes or file location.
"""

if isinstance(xml_data, io.StringIO):
doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
else:
doc = parse(xml_data, parser=curr_parser)
from lxml.etree import (
XMLParser,
fromstring,
parse,
)

handle_data = get_data_from_filepath(
filepath_or_buffer=raw_doc,
encoding=self.encoding,
compression=self.compression,
storage_options=self.storage_options,
)

with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)

if isinstance(xml_data, io.StringIO):
doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
else:
doc = parse(xml_data, parser=curr_parser)

return doc

def _transform_doc(self) -> Element:
"""
Transform original tree using stylesheet.

This method will transform the original XML using an XSLT script into
an ideally flatter XML document for easier parsing and migration
to a DataFrame.
"""
from lxml.etree import (
XML,
XSLT,
)

transformer = XSLT(self.xsl_doc)
new_doc = transformer(self.xml_doc)

return doc
return XML(bytes(new_doc))


def get_data_from_filepath(
Expand Down Expand Up @@ -694,7 +707,6 @@ def _parse(
* If parser is not lxml or etree.
"""

lxml = import_optional_dependency("lxml.etree", errors="ignore")
p: Union[_EtreeFrameParser, _LxmlFrameParser]

if parser == "lxml":
Expand Down