From baa3ab2edc82d6ef29cc1a7248cfd31e97d046e1 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 21:05:15 +0100 Subject: [PATCH 1/8] TYP: Type read_xml and adjust type hints --- pandas/_typing.py | 1 + pandas/io/xml.py | 59 +++++++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 159d57fb27c89..5c8f0be39f712 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -246,6 +246,7 @@ def closed(self) -> bool: CompressionOptions = Optional[ Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict] ] +XMLParsers = Literal["lxml", "etree"] # types in DataFrameFormatter diff --git a/pandas/io/xml.py b/pandas/io/xml.py index d72f02fa817ce..f520ad982b404 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,12 +5,14 @@ from __future__ import annotations import io +from typing import Iterable from pandas._typing import ( CompressionOptions, FilePath, ReadBuffer, StorageOptions, + XMLParsers, ) from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -98,17 +100,17 @@ class _XMLFrameParser: def __init__( self, - path_or_buffer, - xpath, - namespaces, - elems_only, - attrs_only, - names, - encoding, - stylesheet, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict | list[dict] | None, + elems_only: bool, + attrs_only: bool, + names: Iterable[str] | None, + encoding: str | None, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, storage_options: StorageOptions, - ) -> None: + ): self.path_or_buffer = path_or_buffer self.xpath = xpath self.namespaces = namespaces @@ -371,9 +373,6 @@ class _LxmlFrameParser(_XMLFrameParser): XPath 1.0 and XSLT 1.0. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - def parse_data(self) -> list[dict[str, str | None]]: """ Parse xml data. @@ -570,7 +569,7 @@ def _transform_doc(self) -> bytes: def get_data_from_filepath( filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str], - encoding, + encoding: str | None, compression: CompressionOptions, storage_options: StorageOptions, ) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: @@ -658,15 +657,15 @@ class that build Data Frame and infers specific dtypes. def _parse( - path_or_buffer, - xpath, - namespaces, - elems_only, - attrs_only, - names, - encoding, - parser, - stylesheet, + path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], + xpath: str, + namespaces: dict | list[dict] | None, + elems_only: bool, + attrs_only: bool, + names: Iterable[str] | None, + encoding: str | None, + parser: XMLParsers, + stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, storage_options: StorageOptions, **kwargs, @@ -686,11 +685,11 @@ def _parse( * If parser is not lxml or etree. """ - lxml = import_optional_dependency("lxml.etree", errors="ignore") - p: _EtreeFrameParser | _LxmlFrameParser if parser == "lxml": + lxml = import_optional_dependency("lxml.etree", errors="ignore") + if lxml is not None: p = _LxmlFrameParser( path_or_buffer, @@ -734,13 +733,13 @@ def _parse( ) def read_xml( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], - xpath: str | None = "./*", + xpath: str = "./*", namespaces: dict | list[dict] | None = None, - elems_only: bool | None = False, - attrs_only: bool | None = False, - names: list[str] | None = None, + elems_only: bool = False, + attrs_only: bool = False, + names: Iterable[str] | None = None, encoding: str | None = "utf-8", - parser: str | None = "lxml", + parser: XMLParsers = "lxml", stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, @@ -765,7 +764,7 @@ def read_xml( expressions. For more complex XPath, use ``lxml`` which requires installation. - namespaces : dict, optional + namespaces : dict, list of dicts, optional The namespaces defined in XML document as dicts with key being namespace prefix and value the URI. There is no need to include all namespaces in XML, only the ones used in ``xpath`` expression. From ab8f1c61bca0d6e0fde384eb85259f46f028383c Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 22:00:13 +0100 Subject: [PATCH 2/8] Fix typing issues --- pandas/io/xml.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index f520ad982b404..9ded3864d05ae 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,7 +5,7 @@ from __future__ import annotations import io -from typing import Iterable +from typing import Sequence from pandas._typing import ( CompressionOptions, @@ -102,10 +102,10 @@ def __init__( self, path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str, - namespaces: dict | list[dict] | None, + namespaces: dict | None, elems_only: bool, attrs_only: bool, - names: Iterable[str] | None, + names: Sequence[str] | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, @@ -543,6 +543,11 @@ def _parse_doc(self, raw_doc) -> bytes: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): + if self.encoding is None: + raise TypeError( + "Can not pass encoding None when input is StringIO." + ) + doc = fromstring( xml_data.getvalue().encode(self.encoding), parser=curr_parser ) @@ -659,10 +664,10 @@ class that build Data Frame and infers specific dtypes. def _parse( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str, - namespaces: dict | list[dict] | None, + namespaces: dict | None, elems_only: bool, attrs_only: bool, - names: Iterable[str] | None, + names: Sequence[str] | None, encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, @@ -734,10 +739,11 @@ def _parse( def read_xml( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str = "./*", - namespaces: dict | list[dict] | None = None, + namespaces: dict | None = None, elems_only: bool = False, attrs_only: bool = False, - names: Iterable[str] | None = None, + names: Sequence[str] | None = None, + # encoding can not be None for lxml and StringIO input encoding: str | None = "utf-8", parser: XMLParsers = "lxml", stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None, From 9ff4878e5edb2a109656d2b9ddfc4e501f6302c8 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 22:06:30 +0100 Subject: [PATCH 3/8] Deprecate positional --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/xml.py | 8 +++++++- pandas/tests/io/xml/test_xml.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 3924191bebcfd..3899d518db95a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -602,6 +602,7 @@ Other Deprecations - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`) - Deprecated passing non boolean argument to sort in :func:`concat` (:issue:`41518`) - Deprecated passing arguments as positional for :func:`read_fwf` other than ``filepath_or_buffer`` (:issue:`41485`): +- Deprecated passing arguments as positional for :func:`read_xml` other than ``path_or_buffer`` (:issue:`45133`): - Deprecated passing ``skipna=None`` for :meth:`DataFrame.mad` and :meth:`Series.mad`, pass ``skipna=True`` instead (:issue:`44580`) - Deprecated :meth:`DateOffset.apply`, use ``offset + other`` instead (:issue:`44522`) - Deprecated parameter ``names`` in :meth:`Index.copy` (:issue:`44916`) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 9ded3864d05ae..d70920a36bf04 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -19,7 +19,10 @@ AbstractMethodError, ParserError, ) -from pandas.util._decorators import doc +from pandas.util._decorators import ( + deprecate_nonkeyword_arguments, + doc, +) from pandas.core.dtypes.common import is_list_like @@ -732,6 +735,9 @@ def _parse( return _data_to_frame(data=data_dicts, **kwargs) +@deprecate_nonkeyword_arguments( + version=None, allowed_args=["path_or_buffer"], stacklevel=2 +) @doc( storage_options=_shared_docs["storage_options"], decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b72111ec6cf1e..266bdbbf49653 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -769,6 +769,18 @@ def test_stylesheet_file(datapath): tm.assert_frame_equal(df_kml, df_style) +def test_read_xml_passing_as_positional_deprecated(datapath): + # GH#45133 + kml = datapath("io", "data", "xml", "cta_rail_lines.kml") + + with tm.assert_produces_warning(FutureWarning, match="keyword-only"): + read_xml( + kml, + ".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + ) + + @td.skip_if_no("lxml") def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") From 69982517730869d60e21c10f19f22358ee241e8b Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 22:08:54 +0100 Subject: [PATCH 4/8] Adjust comment --- pandas/io/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index d70920a36bf04..bb1e0eb134b41 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -776,7 +776,7 @@ def read_xml( expressions. For more complex XPath, use ``lxml`` which requires installation. - namespaces : dict, list of dicts, optional + namespaces : dict, optional The namespaces defined in XML document as dicts with key being namespace prefix and value the URI. There is no need to include all namespaces in XML, only the ones used in ``xpath`` expression. From 0b85c7d6e2a70b64d49c46ee5b9a6c33c41e9574 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 22:42:01 +0100 Subject: [PATCH 5/8] Fix test --- pandas/tests/io/xml/test_xml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 266bdbbf49653..5634b64a5faff 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -769,6 +769,7 @@ def test_stylesheet_file(datapath): tm.assert_frame_equal(df_kml, df_style) +@td.skip_if_no("lxml") def test_read_xml_passing_as_positional_deprecated(datapath): # GH#45133 kml = datapath("io", "data", "xml", "cta_rail_lines.kml") From 3accd12b0211014695733cce2dd3797c840c2a92 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 30 Dec 2021 23:15:27 +0100 Subject: [PATCH 6/8] Add test --- pandas/tests/io/xml/test_xml.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 5634b64a5faff..be29d34854d2a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -782,6 +782,19 @@ def test_read_xml_passing_as_positional_deprecated(datapath): ) +@td.skip_if_no("lxml") +def test_wrong_encoding_for_lxml(): + # GH#45133 + data = """ + + c + + +""" + with pytest.raises(TypeError, match="encoding None"): + read_xml(StringIO(data), parser="lxml", encoding=None) + + @td.skip_if_no("lxml") def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") From 1da956908cfddc7d5c5b2f7118b1b8f82979dc6a Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 31 Dec 2021 22:47:46 +0100 Subject: [PATCH 7/8] Add test and adjust type hint --- pandas/io/xml.py | 6 ++--- pandas/tests/io/xml/test_xml.py | 39 ++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index bb1e0eb134b41..ad87b18bd1683 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -105,7 +105,7 @@ def __init__( self, path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str, - namespaces: dict | None, + namespaces: dict[str, str] | None, elems_only: bool, attrs_only: bool, names: Sequence[str] | None, @@ -667,7 +667,7 @@ class that build Data Frame and infers specific dtypes. def _parse( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str, - namespaces: dict | None, + namespaces: dict[str, str] | None, elems_only: bool, attrs_only: bool, names: Sequence[str] | None, @@ -745,7 +745,7 @@ def _parse( def read_xml( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str = "./*", - namespaces: dict | None = None, + namespaces: dict[str, str] | None = None, elems_only: bool = False, attrs_only: bool = False, names: Sequence[str] | None = None, diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index be29d34854d2a..e02bf1b685c05 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -729,6 +729,32 @@ def test_parser_consistency_with_encoding(datapath): tm.assert_frame_equal(df_lxml, df_etree) +@td.skip_if_no("lxml") +def test_wrong_encoding_for_lxml(): + # GH#45133 + data = """ + + c + + +""" + with pytest.raises(TypeError, match="encoding None"): + read_xml(StringIO(data), parser="lxml", encoding=None) + + +def test_none_encoding_etree(): + # GH#45133 + data = """ + + c + + +""" + result = read_xml(StringIO(data), parser="etree", encoding=None) + expected = DataFrame({"a": ["c"]}) + tm.assert_frame_equal(result, expected) + + # PARSER @@ -782,19 +808,6 @@ def test_read_xml_passing_as_positional_deprecated(datapath): ) -@td.skip_if_no("lxml") -def test_wrong_encoding_for_lxml(): - # GH#45133 - data = """ - - c - - -""" - with pytest.raises(TypeError, match="encoding None"): - read_xml(StringIO(data), parser="lxml", encoding=None) - - @td.skip_if_no("lxml") def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") From ba355bacf47fe3b2f858167dfeaec4e04576ba79 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 1 Jan 2022 12:26:13 +0100 Subject: [PATCH 8/8] Adjust dep test --- pandas/tests/io/xml/test_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index e02bf1b685c05..8809c423a29ba 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -795,8 +795,7 @@ def test_stylesheet_file(datapath): tm.assert_frame_equal(df_kml, df_style) -@td.skip_if_no("lxml") -def test_read_xml_passing_as_positional_deprecated(datapath): +def test_read_xml_passing_as_positional_deprecated(datapath, parser): # GH#45133 kml = datapath("io", "data", "xml", "cta_rail_lines.kml") @@ -805,6 +804,7 @@ def test_read_xml_passing_as_positional_deprecated(datapath): kml, ".//k:Placemark", namespaces={"k": "http://www.opengis.net/kml/2.2"}, + parser=parser, )