Skip to content

TYP: type read_xml and deprecate passing positional arguments #45133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ Other Deprecations
- Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
- Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`)
- Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`):
- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`):
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
- Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`)
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
XMLParsers = Literal["lxml", "etree"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice addition for this type. Maybe inspired by new IO XML module.



# types in DataFrameFormatter
Expand Down
73 changes: 42 additions & 31 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,24 @@
from __future__ import annotations

import io
from typing import Sequence

from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
StorageOptions,
XMLParsers,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
AbstractMethodError,
ParserError,
)
from pandas.util._decorators import doc
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)

from pandas.core.dtypes.common import is_list_like

Expand Down Expand Up @@ -98,17 +103,17 @@ class _XMLFrameParser:

def __init__(
self,
path_or_buffer,
xpath,
namespaces,
elems_only,
attrs_only,
names,
encoding,
stylesheet,
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str,
namespaces: dict[str, str] | None,
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
) -> None:
):
self.path_or_buffer = path_or_buffer
self.xpath = xpath
self.namespaces = namespaces
Expand Down Expand Up @@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser):
XPath 1.0 and XSLT 1.0.
"""

def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)

def parse_data(self) -> list[dict[str, str | None]]:
"""
Parse xml data.
Expand Down Expand Up @@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes:
curr_parser = XMLParser(encoding=self.encoding)

if isinstance(xml_data, io.StringIO):
if self.encoding is None:
raise TypeError(
"Can not pass encoding None when input is StringIO."
)

doc = fromstring(
xml_data.getvalue().encode(self.encoding), parser=curr_parser
)
Expand All @@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes:

def get_data_from_filepath(
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
encoding,
encoding: str | None,
compression: CompressionOptions,
storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
Expand Down Expand Up @@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes.


def _parse(
path_or_buffer,
xpath,
namespaces,
elems_only,
attrs_only,
names,
encoding,
parser,
stylesheet,
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str,
namespaces: dict[str, str] | None,
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
storage_options: StorageOptions,
**kwargs,
Expand All @@ -686,11 +693,11 @@ def _parse(
* If parser is not lxml or etree.
"""

lxml = import_optional_dependency("lxml.etree", errors="ignore")

p: _EtreeFrameParser | _LxmlFrameParser

if parser == "lxml":
lxml = import_optional_dependency("lxml.etree", errors="ignore")

if lxml is not None:
p = _LxmlFrameParser(
path_or_buffer,
Expand Down Expand Up @@ -728,19 +735,23 @@ def _parse(
return _data_to_frame(data=data_dicts, **kwargs)


@deprecate_nonkeyword_arguments(
version=None, allowed_args=["path_or_buffer"], stacklevel=2
)
@doc(
storage_options=_shared_docs["storage_options"],
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
xpath: str | None = "./*",
namespaces: dict | list[dict] | None = None,
elems_only: bool | None = False,
attrs_only: bool | None = False,
names: list[str] | None = None,
xpath: str = "./*",
namespaces: dict[str, str] | None = None,
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
# encoding can not be None for lxml and StringIO input
encoding: str | None = "utf-8",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can type this correctly after enforcing the positional deprecation

parser: str | None = "lxml",
parser: XMLParsers = "lxml",
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath):
tm.assert_frame_equal(df_lxml, df_etree)


@td.skip_if_no("lxml")
def test_wrong_encoding_for_lxml():
# GH#45133
data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
with pytest.raises(TypeError, match="encoding None"):
read_xml(StringIO(data), parser="lxml", encoding=None)


def test_none_encoding_etree():
# GH#45133
data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
result = read_xml(StringIO(data), parser="etree", encoding=None)
expected = DataFrame({"a": ["c"]})
tm.assert_frame_equal(result, expected)


# PARSER


Expand Down Expand Up @@ -769,6 +795,19 @@ def test_stylesheet_file(datapath):
tm.assert_frame_equal(df_kml, df_style)


def test_read_xml_passing_as_positional_deprecated(datapath, parser):
# GH#45133
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
read_xml(
kml,
".//k:Placemark",
namespaces={"k": "http://www.opengis.net/kml/2.2"},
parser=parser,
)


@td.skip_if_no("lxml")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good test for None encoding. Can this new test be moved few lines earlier under previous ENCODING section (lines ~700-730). I wonder if we should also test for etree with None expecting success with tm.assert_frame_equal. etree tends to be more active in API so may change defaults with future versions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added and moved

def test_stylesheet_file_like(datapath, mode):
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
Expand Down