Skip to content

Commit a2aa477

Browse files
authored
TYP: type read_xml and deprecate passing positional arguments (#45133)
1 parent 490b189 commit a2aa477

File tree

4 files changed

+83
-31
lines changed

4 files changed

+83
-31
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ Other Deprecations
611611
- Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`)
612612
- Deprecated passing a non-boolean argument to ``sort`` in :func:`concat` (:issue:`41518`)
613613
- Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`)
614+
- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`)
614615
- Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`)
615616
- Deprecated the behavior of :func:`to_datetime` with the string "now" with ``utc=False``; in a future version this will match ``Timestamp("now")``, which in turn matches :meth:`Timestamp.now` returning the local time (:issue:`18705`)
616617
- Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`)

pandas/_typing.py

+1
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ def closed(self) -> bool:
246246
CompressionOptions = Optional[
247247
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
248248
]
249+
XMLParsers = Literal["lxml", "etree"]
249250

250251

251252
# types in DataFrameFormatter

pandas/io/xml.py

+42-31
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,24 @@
55
from __future__ import annotations
66

77
import io
8+
from typing import Sequence
89

910
from pandas._typing import (
1011
CompressionOptions,
1112
FilePath,
1213
ReadBuffer,
1314
StorageOptions,
15+
XMLParsers,
1416
)
1517
from pandas.compat._optional import import_optional_dependency
1618
from pandas.errors import (
1719
AbstractMethodError,
1820
ParserError,
1921
)
20-
from pandas.util._decorators import doc
22+
from pandas.util._decorators import (
23+
deprecate_nonkeyword_arguments,
24+
doc,
25+
)
2126

2227
from pandas.core.dtypes.common import is_list_like
2328

@@ -98,17 +103,17 @@ class _XMLFrameParser:
98103

99104
def __init__(
100105
self,
101-
path_or_buffer,
102-
xpath,
103-
namespaces,
104-
elems_only,
105-
attrs_only,
106-
names,
107-
encoding,
108-
stylesheet,
106+
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
107+
xpath: str,
108+
namespaces: dict[str, str] | None,
109+
elems_only: bool,
110+
attrs_only: bool,
111+
names: Sequence[str] | None,
112+
encoding: str | None,
113+
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
109114
compression: CompressionOptions,
110115
storage_options: StorageOptions,
111-
) -> None:
116+
):
112117
self.path_or_buffer = path_or_buffer
113118
self.xpath = xpath
114119
self.namespaces = namespaces
@@ -371,9 +376,6 @@ class _LxmlFrameParser(_XMLFrameParser):
371376
XPath 1.0 and XSLT 1.0.
372377
"""
373378

374-
def __init__(self, *args, **kwargs) -> None:
375-
super().__init__(*args, **kwargs)
376-
377379
def parse_data(self) -> list[dict[str, str | None]]:
378380
"""
379381
Parse xml data.
@@ -544,6 +546,11 @@ def _parse_doc(self, raw_doc) -> bytes:
544546
curr_parser = XMLParser(encoding=self.encoding)
545547

546548
if isinstance(xml_data, io.StringIO):
549+
if self.encoding is None:
550+
raise TypeError(
551+
"Can not pass encoding None when input is StringIO."
552+
)
553+
547554
doc = fromstring(
548555
xml_data.getvalue().encode(self.encoding), parser=curr_parser
549556
)
@@ -570,7 +577,7 @@ def _transform_doc(self) -> bytes:
570577

571578
def get_data_from_filepath(
572579
filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
573-
encoding,
580+
encoding: str | None,
574581
compression: CompressionOptions,
575582
storage_options: StorageOptions,
576583
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
@@ -658,15 +665,15 @@ class that build Data Frame and infers specific dtypes.
658665

659666

660667
def _parse(
661-
path_or_buffer,
662-
xpath,
663-
namespaces,
664-
elems_only,
665-
attrs_only,
666-
names,
667-
encoding,
668-
parser,
669-
stylesheet,
668+
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
669+
xpath: str,
670+
namespaces: dict[str, str] | None,
671+
elems_only: bool,
672+
attrs_only: bool,
673+
names: Sequence[str] | None,
674+
encoding: str | None,
675+
parser: XMLParsers,
676+
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
670677
compression: CompressionOptions,
671678
storage_options: StorageOptions,
672679
**kwargs,
@@ -686,11 +693,11 @@ def _parse(
686693
* If parser is not lxml or etree.
687694
"""
688695

689-
lxml = import_optional_dependency("lxml.etree", errors="ignore")
690-
691696
p: _EtreeFrameParser | _LxmlFrameParser
692697

693698
if parser == "lxml":
699+
lxml = import_optional_dependency("lxml.etree", errors="ignore")
700+
694701
if lxml is not None:
695702
p = _LxmlFrameParser(
696703
path_or_buffer,
@@ -728,19 +735,23 @@ def _parse(
728735
return _data_to_frame(data=data_dicts, **kwargs)
729736

730737

738+
@deprecate_nonkeyword_arguments(
739+
version=None, allowed_args=["path_or_buffer"], stacklevel=2
740+
)
731741
@doc(
732742
storage_options=_shared_docs["storage_options"],
733743
decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
734744
)
735745
def read_xml(
736746
path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
737-
xpath: str | None = "./*",
738-
namespaces: dict | list[dict] | None = None,
739-
elems_only: bool | None = False,
740-
attrs_only: bool | None = False,
741-
names: list[str] | None = None,
747+
xpath: str = "./*",
748+
namespaces: dict[str, str] | None = None,
749+
elems_only: bool = False,
750+
attrs_only: bool = False,
751+
names: Sequence[str] | None = None,
752+
# encoding cannot be None for lxml and StringIO input
742753
encoding: str | None = "utf-8",
743-
parser: str | None = "lxml",
754+
parser: XMLParsers = "lxml",
744755
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
745756
compression: CompressionOptions = "infer",
746757
storage_options: StorageOptions = None,

pandas/tests/io/xml/test_xml.py

+39
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath):
729729
tm.assert_frame_equal(df_lxml, df_etree)
730730

731731

732+
@td.skip_if_no("lxml")
733+
def test_wrong_encoding_for_lxml():
734+
# GH#45133
735+
data = """<data>
736+
<row>
737+
<a>c</a>
738+
</row>
739+
</data>
740+
"""
741+
with pytest.raises(TypeError, match="encoding None"):
742+
read_xml(StringIO(data), parser="lxml", encoding=None)
743+
744+
745+
def test_none_encoding_etree():
746+
# GH#45133
747+
data = """<data>
748+
<row>
749+
<a>c</a>
750+
</row>
751+
</data>
752+
"""
753+
result = read_xml(StringIO(data), parser="etree", encoding=None)
754+
expected = DataFrame({"a": ["c"]})
755+
tm.assert_frame_equal(result, expected)
756+
757+
732758
# PARSER
733759

734760

@@ -769,6 +795,19 @@ def test_stylesheet_file(datapath):
769795
tm.assert_frame_equal(df_kml, df_style)
770796

771797

798+
def test_read_xml_passing_as_positional_deprecated(datapath, parser):
799+
# GH#45133
800+
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
801+
802+
with tm.assert_produces_warning(FutureWarning, match="keyword-only"):
803+
read_xml(
804+
kml,
805+
".//k:Placemark",
806+
namespaces={"k": "http://www.opengis.net/kml/2.2"},
807+
parser=parser,
808+
)
809+
810+
772811
@td.skip_if_no("lxml")
773812
def test_stylesheet_file_like(datapath, mode):
774813
kml = datapath("io", "data", "xml", "cta_rail_lines.kml")

0 commit comments

Comments
 (0)