-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
TYP: type read_xml and deprecate passing positional arguments #45133
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
baa3ab2
ab8f1c6
9ff4878
6998251
0b85c7d
3accd12
1da9569
a2348a2
ba355ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -5,19 +5,24 @@ | |||
from __future__ import annotations | ||||
|
||||
import io | ||||
from typing import Sequence | ||||
|
||||
from pandas._typing import ( | ||||
CompressionOptions, | ||||
FilePath, | ||||
ReadBuffer, | ||||
StorageOptions, | ||||
XMLParsers, | ||||
) | ||||
from pandas.compat._optional import import_optional_dependency | ||||
from pandas.errors import ( | ||||
AbstractMethodError, | ||||
ParserError, | ||||
) | ||||
from pandas.util._decorators import doc | ||||
from pandas.util._decorators import ( | ||||
deprecate_nonkeyword_arguments, | ||||
doc, | ||||
) | ||||
|
||||
from pandas.core.dtypes.common import is_list_like | ||||
|
||||
|
@@ -98,17 +103,17 @@ class _XMLFrameParser: | |||
|
||||
def __init__( | ||||
self, | ||||
path_or_buffer, | ||||
xpath, | ||||
namespaces, | ||||
elems_only, | ||||
attrs_only, | ||||
names, | ||||
encoding, | ||||
stylesheet, | ||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | ||||
xpath: str, | ||||
twoertwein marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
namespaces: dict | None, | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not familiar with how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not familiar enough either, found only cases like this, but not 100% sure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. namespaces is passed to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also please fix this existing typo in the doc-string (two "s" at the end): Line 57 in 3accd12
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Namespaces map string prefixes to URIs so can only be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||||
elems_only: bool, | ||||
attrs_only: bool, | ||||
names: Sequence[str] | None, | ||||
twoertwein marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
encoding: str | None, | ||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | ||||
compression: CompressionOptions, | ||||
storage_options: StorageOptions, | ||||
) -> None: | ||||
): | ||||
self.path_or_buffer = path_or_buffer | ||||
self.xpath = xpath | ||||
self.namespaces = namespaces | ||||
|
@@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser): | |||
XPath 1.0 and XSLT 1.0. | ||||
""" | ||||
|
||||
def __init__(self, *args, **kwargs) -> None: | ||||
super().__init__(*args, **kwargs) | ||||
|
||||
def parse_data(self) -> list[dict[str, str | None]]: | ||||
""" | ||||
Parse xml data. | ||||
|
@@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes: | |||
curr_parser = XMLParser(encoding=self.encoding) | ||||
|
||||
if isinstance(xml_data, io.StringIO): | ||||
if self.encoding is None: | ||||
raise TypeError( | ||||
"Can not pass encoding None when input is StringIO." | ||||
) | ||||
|
||||
doc = fromstring( | ||||
xml_data.getvalue().encode(self.encoding), parser=curr_parser | ||||
) | ||||
|
@@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes: | |||
|
||||
def get_data_from_filepath( | ||||
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], | ||||
encoding, | ||||
encoding: str | None, | ||||
compression: CompressionOptions, | ||||
storage_options: StorageOptions, | ||||
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: | ||||
|
@@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes. | |||
|
||||
|
||||
def _parse( | ||||
path_or_buffer, | ||||
xpath, | ||||
namespaces, | ||||
elems_only, | ||||
attrs_only, | ||||
names, | ||||
encoding, | ||||
parser, | ||||
stylesheet, | ||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | ||||
xpath: str, | ||||
namespaces: dict | None, | ||||
elems_only: bool, | ||||
attrs_only: bool, | ||||
names: Sequence[str] | None, | ||||
encoding: str | None, | ||||
parser: XMLParsers, | ||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | ||||
compression: CompressionOptions, | ||||
storage_options: StorageOptions, | ||||
**kwargs, | ||||
|
@@ -686,11 +693,11 @@ def _parse( | |||
* If parser is not lxml or etree. | ||||
""" | ||||
|
||||
lxml = import_optional_dependency("lxml.etree", errors="ignore") | ||||
|
||||
p: _EtreeFrameParser | _LxmlFrameParser | ||||
|
||||
if parser == "lxml": | ||||
lxml = import_optional_dependency("lxml.etree", errors="ignore") | ||||
|
||||
if lxml is not None: | ||||
p = _LxmlFrameParser( | ||||
path_or_buffer, | ||||
|
@@ -728,19 +735,23 @@ def _parse( | |||
return _data_to_frame(data=data_dicts, **kwargs) | ||||
|
||||
|
||||
@deprecate_nonkeyword_arguments( | ||||
version=None, allowed_args=["path_or_buffer"], stacklevel=2 | ||||
) | ||||
@doc( | ||||
storage_options=_shared_docs["storage_options"], | ||||
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", | ||||
) | ||||
def read_xml( | ||||
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], | ||||
xpath: str | None = "./*", | ||||
namespaces: dict | list[dict] | None = None, | ||||
elems_only: bool | None = False, | ||||
attrs_only: bool | None = False, | ||||
names: list[str] | None = None, | ||||
xpath: str = "./*", | ||||
namespaces: dict | None = None, | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. list[dict] does not work, this raises There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As mentioned below, namespaces are distinct prefix/URI pairs within an XML doc, so should only be |
||||
elems_only: bool = False, | ||||
attrs_only: bool = False, | ||||
names: Sequence[str] | None = None, | ||||
# encoding can not be None for lxml and StringIO input | ||||
encoding: str | None = "utf-8", | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can type this correctly after enforcing the positional deprecation |
||||
parser: str | None = "lxml", | ||||
parser: XMLParsers = "lxml", | ||||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, | ||||
compression: CompressionOptions = "infer", | ||||
storage_options: StorageOptions = None, | ||||
|
@@ -765,7 +776,7 @@ def read_xml( | |||
expressions. For more complex XPath, use ``lxml`` which requires | ||||
installation. | ||||
|
||||
namespaces : dict, optional | ||||
namespaces : dict, list of dicts, optional | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again namespaces will not be a list of dicts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep removed, forgot this |
||||
The namespaces defined in XML document as dicts with key being | ||||
namespace prefix and value the URI. There is no need to include all | ||||
namespaces in XML, only the ones used in ``xpath`` expression. | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -769,6 +769,18 @@ def test_stylesheet_file(datapath): | |
tm.assert_frame_equal(df_kml, df_style) | ||
|
||
|
||
def test_read_xml_passing_as_positional_deprecated(datapath): | ||
# GH#45133 | ||
kml = datapath("io", "data", "xml", "cta_rail_lines.kml") | ||
|
||
with tm.assert_produces_warning(FutureWarning, match="keyword-only"): | ||
read_xml( | ||
kml, | ||
".//k:Placemark", | ||
namespaces={"k": "http://www.opengis.net/kml/2.2"}, | ||
) | ||
|
||
|
||
@td.skip_if_no("lxml") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good test for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added and moved |
||
def test_stylesheet_file_like(datapath, mode): | ||
kml = datapath("io", "data", "xml", "cta_rail_lines.kml") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice addition for this type. Maybe inspired by new IO XML module.