From f347e8e7781ea794de9e23d963972b80fb715d91 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Thu, 22 Jun 2023 19:47:39 -0400 Subject: [PATCH 01/17] Updating documentation and adding deprecation logic for read_xml. --- doc/source/whatsnew/v2.1.0.rst | 2 + pandas/io/xml.py | 20 ++++++++-- pandas/tests/io/xml/test_xml.py | 51 +++++++++++++++++--------- pandas/tests/io/xml/test_xml_dtypes.py | 30 ++++++++------- 4 files changed, 69 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2436d91690ed3..6fdedf97d3e09 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,6 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) +- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) @@ -305,6 +306,7 @@ Deprecations - Deprecated the "method" and "limit" keywords on :meth:`Series.fillna`, :meth:`DataFrame.fillna`, :meth:`SeriesGroupBy.fillna`, :meth:`DataFrameGroupBy.fillna`, and :meth:`Resampler.fillna`, use ``obj.bfill()`` or ``obj.ffill()`` instead (:issue:`53394`) - Deprecated the ``method`` and ``limit`` keywords in :meth:`DataFrame.replace` and :meth:`Series.replace` (:issue:`33302`) - Deprecated values "pad", "ffill", "bfill", "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate`, use ``obj.ffill()`` or ``obj.bfill()`` instead (:issue:`53581`) +- .. --------------------------------------------------------------------------- .. _whatsnew_210.performance: diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 2aec361d46b99..bb46088296f47 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -12,6 +12,7 @@ Callable, Sequence, ) +import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -20,6 +21,7 @@ ParserError, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -894,6 +896,9 @@ def read_xml( string or a path. The string can further be a URL. Valid URL schemes include http, ftp, s3, and file. + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. XPath should return a collection of elements and not a single @@ -1068,7 +1073,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml) + >>> df = pd.read_xml(SringIO(xml)) >>> df shape degrees sides 0 square 360 4.0 @@ -1082,7 +1087,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml, xpath=".//row") + >>> df = pd.read_xml(StringIO(xml), xpath=".//row") >>> df shape degrees sides 0 square 360 4.0 @@ -1108,7 +1113,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(xml, + >>> df = pd.read_xml(StringIO(xml), ... xpath="//doc:row", ... namespaces={{"doc": "https://example.com"}}) >>> df @@ -1119,6 +1124,15 @@ def read_xml( """ check_dtype_backend(dtype_backend) + if isinstance(path_or_buffer, str) and "\n" in path_or_buffer: + warnings.warn( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return _parse( path_or_buffer=path_or_buffer, xpath=xpath, diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index b0e806caecc80..f83079acd611c 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -391,6 +391,11 @@ def test_file_buffered_reader_string(xml_books, parser, mode): with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f: xml_obj = f.read() + if mode == "rb": + xml_obj = StringIO(xml_obj.decode()) + elif mode == "r": + xml_obj = StringIO(xml_obj) + df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( @@ -411,6 +416,11 @@ def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode): next(f) xml_obj = f.read() + if mode == "rb": + xml_obj = StringIO(xml_obj.decode()) + elif mode == "r": + xml_obj = StringIO(xml_obj) + df_str = read_xml(xml_obj, parser=parser) df_expected = DataFrame( @@ -580,7 +590,7 @@ def test_bad_xpath_lxml(xml_books): def test_default_namespace(parser): df_nmsp = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser=parser, @@ -606,7 +616,7 @@ def test_default_namespace(parser): def test_prefix_namespace(parser): df_nmsp = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser=parser, @@ -630,14 +640,14 @@ def test_prefix_namespace(parser): @td.skip_if_no("lxml") def test_consistency_default_namespace(): df_lxml = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//ns:row", namespaces={"ns": "http://example.com"}, parser="lxml", ) df_etree = read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", @@ -649,14 +659,14 @@ def test_consistency_default_namespace(): @td.skip_if_no("lxml") def test_consistency_prefix_namespace(): df_lxml = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="lxml", ) df_etree = read_xml( - xml_prefix_nmsp, + StringIO(xml_prefix_nmsp), xpath=".//doc:row", namespaces={"doc": "http://example.com"}, parser="etree", @@ -693,7 +703,7 @@ def test_none_namespace_prefix(key): TypeError, match=("empty namespace prefix is not supported in XPath") ): read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), xpath=".//kml:Placemark", namespaces={key: "http://www.opengis.net/kml/2.2"}, parser="lxml", @@ -782,7 +792,7 @@ def test_empty_attrs_only(parser): ValueError, match=("xpath does not return any nodes or attributes"), ): - read_xml(xml, xpath="./row", attrs_only=True, parser=parser) + read_xml(StringIO(xml), xpath="./row", attrs_only=True, parser=parser) def test_empty_elems_only(parser): @@ -797,7 +807,7 @@ def test_empty_elems_only(parser): ValueError, match=("xpath does not return any nodes or attributes"), ): - read_xml(xml, xpath="./row", elems_only=True, parser=parser) + read_xml(StringIO(xml), xpath="./row", elems_only=True, parser=parser) @td.skip_if_no("lxml") @@ -822,8 +832,8 @@ def test_attribute_centric_xml(): """ - df_lxml = read_xml(xml, xpath=".//station") - df_etree = read_xml(xml, xpath=".//station", parser="etree") + df_lxml = read_xml(StringIO(xml), xpath=".//station") + df_etree = read_xml(StringIO(xml), xpath=".//station", parser="etree") df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]}) df_iter_et = read_xml_iterparse( @@ -875,7 +885,10 @@ def test_repeat_names(parser): """ df_xpath = read_xml( - xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"] + StringIO(xml), + xpath=".//shape", + parser=parser, + names=["type_dim", "shape", "type_edge"], ) df_iter = read_xml_iterparse( @@ -917,7 +930,9 @@ def test_repeat_values_new_names(parser): ellipse """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"]) + df_xpath = read_xml( + StringIO(xml), xpath=".//shape", parser=parser, names=["name", "group"] + ) df_iter = read_xml_iterparse( xml, @@ -960,7 +975,7 @@ def test_repeat_elements(parser): """ df_xpath = read_xml( - xml, + StringIO(xml), xpath=".//shape", parser=parser, names=["name", "family", "degrees", "sides"], @@ -1532,7 +1547,7 @@ def test_comment(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1568,7 +1583,7 @@ def test_dtd(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1604,7 +1619,7 @@ def test_processing_instruction(parser): """ - df_xpath = read_xml(xml, xpath=".//shape", parser=parser) + df_xpath = read_xml(StringIO(xml), xpath=".//shape", parser=parser) df_iter = read_xml_iterparse( xml, parser=parser, iterparse={"shape": ["name", "type"]} @@ -1808,7 +1823,7 @@ def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): string_array_na = ArrowStringArray(pa.array(["x", None])) with pd.option_context("mode.string_storage", string_storage): - result = read_xml(data, parser=parser, dtype_backend=dtype_backend) + result = read_xml(StringIO(data), parser=parser, dtype_backend=dtype_backend) expected = DataFrame( { diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 911b540dbc380..fb24902efc0f5 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -1,5 +1,7 @@ from __future__ import annotations +from io import StringIO + import pytest from pandas.errors import ParserWarning @@ -81,7 +83,7 @@ def read_xml_iterparse(data, **kwargs): def test_dtype_single_str(parser): - df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"degrees": "str"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -102,7 +104,7 @@ def test_dtype_single_str(parser): def test_dtypes_all_str(parser): - df_result = read_xml(xml_dates, dtype="string", parser=parser) + df_result = read_xml(StringIO(xml_dates), dtype="string", parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -126,7 +128,7 @@ def test_dtypes_all_str(parser): def test_dtypes_with_names(parser): df_result = read_xml( - xml_dates, + StringIO(xml_dates), names=["Col1", "Col2", "Col3", "Col4"], dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64[ns]"}, parser=parser, @@ -153,7 +155,7 @@ def test_dtypes_with_names(parser): def test_dtype_nullable_int(parser): - df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"sides": "Int64"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -174,7 +176,7 @@ def test_dtype_nullable_int(parser): def test_dtype_float(parser): - df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + df_result = read_xml(StringIO(xml_types), dtype={"degrees": "float"}, parser=parser) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -214,7 +216,7 @@ def test_both_dtype_converters(parser): with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): df_result = read_xml( - xml_types, + StringIO(xml_types), dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser, @@ -235,7 +237,9 @@ def test_both_dtype_converters(parser): def test_converters_str(parser): - df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + df_result = read_xml( + StringIO(xml_types), converters={"degrees": str}, parser=parser + ) df_iter = read_xml_iterparse( xml_types, parser=parser, @@ -258,7 +262,7 @@ def test_converters_str(parser): def test_converters_date(parser): convert_to_datetime = lambda x: to_datetime(x) df_result = read_xml( - xml_dates, converters={"date": convert_to_datetime}, parser=parser + StringIO(xml_dates), converters={"date": convert_to_datetime}, parser=parser ) df_iter = read_xml_iterparse( xml_dates, @@ -305,7 +309,7 @@ def test_callable_str_converters(xml_books, parser, iterparse): def test_parse_dates_column_name(parser): - df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -327,7 +331,7 @@ def test_parse_dates_column_name(parser): def test_parse_dates_column_index(parser): - df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=[3], parser=parser) df_iter = read_xml_iterparse( xml_dates, parser=parser, @@ -349,7 +353,7 @@ def test_parse_dates_column_index(parser): def test_parse_dates_true(parser): - df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + df_result = read_xml(StringIO(xml_dates), parse_dates=True, parser=parser) df_iter = read_xml_iterparse( xml_dates, @@ -401,7 +405,7 @@ def test_parse_dates_dictionary(parser): """ df_result = read_xml( - xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser + StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser ) df_iter = read_xml_iterparse( xml, @@ -459,7 +463,7 @@ def test_day_first_parse_dates(parser): with tm.assert_produces_warning( UserWarning, match="Parsing dates in %d/%m/%Y format" ): - df_result = read_xml(xml, parse_dates=["date"], parser=parser) + df_result = read_xml(StringIO(xml), parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( xml, parse_dates=["date"], From 296b45afa14280b1ff68cbb2943f680a6141c9f7 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Thu, 22 Jun 2023 20:47:38 -0400 Subject: [PATCH 02/17] Fixing documentation issue and adding unit test --- doc/source/whatsnew/v1.5.0.rst | 3 ++- pandas/tests/io/xml/test_xml.py | 40 ++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9653226b96196..44728e7e552ab 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -221,6 +221,7 @@ apply converter methods, and parse dates (:issue:`43567`). .. ipython:: python + from io import StringIO xml_dates = """ @@ -244,7 +245,7 @@ apply converter methods, and parse dates (:issue:`43567`). """ df = pd.read_xml( - xml_dates, + StringIO(xml_dates), dtype={'sides': 'Int64'}, converters={'degrees': str}, parse_dates=['date'] diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index f83079acd611c..dfaf609360467 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -247,6 +247,44 @@ ) +def test_literal_xml_deprecation(): + # GH 53785 + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + read_xml( + """ + + + x + 1 + 4.0 + x + 2 + 4.0 + + True + False + + + y + 2 + 5.0 + + + + + False + + + """ + ) + + @pytest.fixture(params=["rb", "r"]) def mode(request): return request.param @@ -1361,7 +1399,7 @@ def test_string_error(parser): ParserError, match=("iterparse is designed for large XML files") ): read_xml( - xml_default_nmsp, + StringIO(xml_default_nmsp), parser=parser, iterparse={"row": ["shape", "degrees", "sides", "date"]}, ) From 69cdc1afee4ecdc04d96ba5fbe318c50e78876a9 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Fri, 23 Jun 2023 17:51:57 -0400 Subject: [PATCH 03/17] Updating unit tests and documentation. --- doc/source/user_guide/io.rst | 13 +++++++------ pandas/tests/io/xml/test_xml.py | 15 +-------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 84a78ace8d7c7..59f8ddd9853bd 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2920,6 +2920,7 @@ Read an XML string: .. ipython:: python + from io import StringIO xml = """ @@ -2942,7 +2943,7 @@ Read an XML string: """ - df = pd.read_xml(xml) + df = pd.read_xml(StringIO(xml)) df Read a URL with no options: @@ -2962,7 +2963,7 @@ as a string: f.write(xml) with open(file_path, "r") as f: - df = pd.read_xml(f.read()) + df = pd.read_xml(StringIO(f.read())) df Read in the content of the "books.xml" as instance of ``StringIO`` or @@ -3053,7 +3054,7 @@ For example, below XML contains a namespace with prefix, ``doc``, and URI at """ - df = pd.read_xml(xml, + df = pd.read_xml(StringIO(xml), xpath="//doc:row", namespaces={"doc": "https://example.com"}) df @@ -3083,7 +3084,7 @@ But assigning *any* temporary name to correct URI allows parsing by nodes. """ - df = pd.read_xml(xml, + df = pd.read_xml(StringIO(xml), xpath="//pandas:row", namespaces={"pandas": "https://example.com"}) df @@ -3118,7 +3119,7 @@ However, if XPath does not reference node names such as default, ``/*``, then """ - df = pd.read_xml(xml, xpath="./row") + df = pd.read_xml(StringIO(xml), xpath="./row") df shows the attribute ``sides`` on ``shape`` element was not parsed as @@ -3219,7 +3220,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``: """ - df = pd.read_xml(xml, stylesheet=xsl) + df = pd.read_xml(StringIO(xml), stylesheet=xsl) df For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index dfaf609360467..fc76715ea97e7 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -248,7 +248,7 @@ def test_literal_xml_deprecation(): - # GH 53785 + # GH 53809 msg = ( "Passing literal xml to 'read_xml' is deprecated and " "will be removed in a future version. To read from a " @@ -1392,19 +1392,6 @@ def test_empty_stylesheet(val): # ITERPARSE - - -def test_string_error(parser): - with pytest.raises( - ParserError, match=("iterparse is designed for large XML files") - ): - read_xml( - StringIO(xml_default_nmsp), - parser=parser, - iterparse={"row": ["shape", "degrees", "sides", "date"]}, - ) - - def test_file_like_iterparse(xml_books, parser, mode): with open(xml_books, mode, encoding="utf-8" if mode == "r" else None) as f: if mode == "r" and parser == "lxml": From 0f0f38b3cdef4cdcbfe25e4d41efb45f5d051050 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Fri, 23 Jun 2023 21:57:31 -0400 Subject: [PATCH 04/17] Fixing unit tests and documentation issues --- pandas/io/xml.py | 2 +- pandas/tests/io/xml/test_xml.py | 30 +++--------------------------- 2 files changed, 4 insertions(+), 28 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index bb46088296f47..b35f409d9032b 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1073,7 +1073,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(SringIO(xml)) + >>> df = pd.read_xml(StringIO(xml)) >>> df shape degrees sides 0 square 360 4.0 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index fc76715ea97e7..637af6e4a5f95 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -246,6 +246,8 @@ } ) +td.skip_if_no("lxml") + def test_literal_xml_deprecation(): # GH 53809 @@ -256,33 +258,7 @@ def test_literal_xml_deprecation(): ) with tm.assert_produces_warning(FutureWarning, match=msg): - read_xml( - """ - - - x - 1 - 4.0 - x - 2 - 4.0 - - True - False - - - y - 2 - 5.0 - - - - - False - - - """ - ) + read_xml(xml_default_nmsp) @pytest.fixture(params=["rb", "r"]) From 2c848acc59b4ba69082f7580bae6ee9b810629bd Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Fri, 23 Jun 2023 22:34:05 -0400 Subject: [PATCH 05/17] Fixing unit tests and documentation issues --- pandas/tests/io/xml/test_xml.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 637af6e4a5f95..893e5ec229bdd 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -246,9 +246,8 @@ } ) -td.skip_if_no("lxml") - +@td.skip_if_no("lxml") def test_literal_xml_deprecation(): # GH 53809 msg = ( @@ -258,7 +257,7 @@ def test_literal_xml_deprecation(): ) with tm.assert_produces_warning(FutureWarning, match=msg): - read_xml(xml_default_nmsp) + read_xml(xml_default_nmsp, parser="etree") @pytest.fixture(params=["rb", "r"]) From b8a582c97a0e75459eb069d61bed85918ee12a7c Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Fri, 23 Jun 2023 23:27:59 -0400 Subject: [PATCH 06/17] Fixing unit tests and documentation issues --- pandas/io/xml.py | 6 +++--- pandas/tests/io/xml/test_xml.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index b35f409d9032b..a60586368a0bf 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1073,7 +1073,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(StringIO(xml)) + >>> df = pd.read_xml(io.StringIO(xml)) >>> df shape degrees sides 0 square 360 4.0 @@ -1087,7 +1087,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(StringIO(xml), xpath=".//row") + >>> df = pd.read_xml(io.StringIO(xml), xpath=".//row") >>> df shape degrees sides 0 square 360 4.0 @@ -1113,7 +1113,7 @@ def read_xml( ... ... ''' - >>> df = pd.read_xml(StringIO(xml), + >>> df = pd.read_xml(io.StringIO(xml), ... xpath="//doc:row", ... namespaces={{"doc": "https://example.com"}}) >>> df diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 893e5ec229bdd..fb8802d5be312 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -257,7 +257,7 @@ def test_literal_xml_deprecation(): ) with tm.assert_produces_warning(FutureWarning, match=msg): - read_xml(xml_default_nmsp, parser="etree") + read_xml(xml_default_nmsp) @pytest.fixture(params=["rb", "r"]) From 92bc6fa7bf5a4173a5c5027147ec381a1c635951 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Sat, 24 Jun 2023 00:06:58 -0400 Subject: [PATCH 07/17] Fixing import error in documentation --- pandas/io/xml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a60586368a0bf..a85e82448538a 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1054,6 +1054,7 @@ def read_xml( Examples -------- + >>> import io >>> xml = ''' ... ... From 8bbd7c425c8232bfa188035452dcb3d2147c2540 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 18:32:48 -0400 Subject: [PATCH 08/17] Updated deprecation logic per reviewer recommendations. --- pandas/io/xml.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a85e82448538a..ac46b5f8cc552 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -32,6 +32,7 @@ file_exists, get_handle, infer_compression, + is_file_like, is_fsspec_url, is_url, stringify_path, @@ -898,6 +899,7 @@ def read_xml( .. deprecated:: 2.1.0 Passing html literal strings is deprecated. + Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead. xpath : str, optional, default './\*' The XPath to parse required set of nodes for migration to DataFrame. @@ -1125,7 +1127,13 @@ def read_xml( """ check_dtype_backend(dtype_backend) - if isinstance(path_or_buffer, str) and "\n" in path_or_buffer: + if ( + isinstance(path_or_buffer, str) + and not is_file_like(path_or_buffer) + and "\n" in path_or_buffer + ): + with open("/home/richard/Desktop/file.txt", "a+") as fil: + fil.write(f"{path_or_buffer}\n\n\n") warnings.warn( "Passing literal xml to 'read_xml' is deprecated and " "will be removed in a future version. To read from a " From 5aece784cf4d4d5419c1460a2979f6ae1d53de79 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 18:45:52 -0400 Subject: [PATCH 09/17] Updating deprecation logic and documentation per reviewer recommendations. --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c7a5bd59861ae..c224fd822c887 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,7 +298,7 @@ Deprecations - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - Deprecated falling back to filling when ``value`` is not specified in :meth:`DataFrame.replace` and :meth:`Series.replace` with non-dict-like ``to_replace`` (:issue:`33302`) - Deprecated literal json input to :func:`read_json`. Wrap literal json string input in ``io.StringIO`` instead. (:issue:`53409`) -- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` instead. (:issue:`53767`) +- Deprecated literal string input to :func:`read_xml`. Wrap literal string/bytes input in ``io.StringIO`` / ``io.BytesIO`` instead. (:issue:`53767`) - Deprecated option "mode.use_inf_as_na", convert inf entries to ``NaN`` before instead (:issue:`51684`) - Deprecated parameter ``obj`` in :meth:`GroupBy.get_group` (:issue:`53545`) - Deprecated positional indexing on :class:`Series` with :meth:`Series.__getitem__` and :meth:`Series.__setitem__`, in a future version ``ser[item]`` will *always* interpret ``item`` as a label, not a position (:issue:`50617`) From 6f15924698badb860367df4e059dd306ce2a6cce Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 19:19:40 -0400 Subject: [PATCH 10/17] Fixing logic error --- pandas/io/xml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ac46b5f8cc552..7a8de97e9e288 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1132,8 +1132,6 @@ def read_xml( and not is_file_like(path_or_buffer) and "\n" in path_or_buffer ): - with open("/home/richard/Desktop/file.txt", "a+") as fil: - fil.write(f"{path_or_buffer}\n\n\n") warnings.warn( "Passing literal xml to 'read_xml' is deprecated and " "will be removed in a future version. To read from a " From 00f7b158aa8a4b908469529be9c0ddce31957e01 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 21:52:54 -0400 Subject: [PATCH 11/17] Fixing implementation per reviewer recommendations. --- pandas/io/xml.py | 7 ++++--- pandas/tests/io/xml/test_xml.py | 36 +++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 7a8de97e9e288..d545821f218b5 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1128,9 +1128,10 @@ def read_xml( check_dtype_backend(dtype_backend) if ( - isinstance(path_or_buffer, str) - and not is_file_like(path_or_buffer) - and "\n" in path_or_buffer + not is_file_like(path_or_buffer) + and not file_exists(path_or_buffer) + and not is_url(path_or_buffer) + and not is_fsspec_url(path_or_buffer) ): warnings.warn( "Passing literal xml to 'read_xml' is deprecated and " diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index fb8802d5be312..df00ff2b5d825 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -452,7 +452,7 @@ def test_file_buffered_reader_no_xml_declaration(xml_books, parser, mode): def test_string_charset(parser): txt = "<中文標籤>12" - df_str = read_xml(txt, parser=parser) + df_str = read_xml(StringIO(txt), parser=parser) df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) @@ -510,34 +510,48 @@ def test_empty_string_lxml(val): ] ) with pytest.raises(XMLSyntaxError, match=msg): - read_xml(val, parser="lxml") + if isinstance(val, str): + read_xml(StringIO(val), parser="lxml") + else: + read_xml(BytesIO(val), parser="lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_string_etree(val): with pytest.raises(ParseError, match="no element found"): - read_xml(val, parser="etree") + if isinstance(val, str): + read_xml(StringIO(val), parser="etree") + else: + read_xml(BytesIO(val), parser="etree") @td.skip_if_no("lxml") def test_wrong_file_path_lxml(): - from lxml.etree import XMLSyntaxError - + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) filename = os.path.join("data", "html", "books.xml") with pytest.raises( - XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FutureWarning, + match=msg, ): read_xml(filename, parser="lxml") def test_wrong_file_path_etree(): + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) filename = os.path.join("data", "html", "books.xml") with pytest.raises( - ParseError, - match=("not well-formed"), + FutureWarning, + match=msg, ): read_xml(filename, parser="etree") @@ -1223,8 +1237,8 @@ def test_style_charset(): """ - df_orig = read_xml(xml) - df_style = read_xml(xml, stylesheet=xsl) + df_orig = read_xml(StringIO(xml)) + df_style = read_xml(StringIO(xml), stylesheet=xsl) tm.assert_frame_equal(df_orig, df_style) From 20e7ef2fb83256a426ab09caa5707ae21fb75f46 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 23:11:02 -0400 Subject: [PATCH 12/17] Updating implementation per reviewer recommendations. --- pandas/tests/io/xml/test_xml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index df00ff2b5d825..b6024b2bc11e6 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -1370,13 +1370,14 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): @td.skip_if_no("lxml") @pytest.mark.parametrize("val", ["", b""]) def test_empty_stylesheet(val): - from lxml.etree import XMLSyntaxError - + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object." + ) kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises( - XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found") - ): + with pytest.raises(FutureWarning, match=msg): read_xml(kml, stylesheet=val) From 526c2240ce79ac8145b604d109d7089a85614be7 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Mon, 26 Jun 2023 23:31:44 -0400 Subject: [PATCH 13/17] Cleaning up the deprecation logic a bit. --- pandas/io/xml.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index d545821f218b5..44c77f8e215ee 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1127,11 +1127,13 @@ def read_xml( """ check_dtype_backend(dtype_backend) - if ( - not is_file_like(path_or_buffer) - and not file_exists(path_or_buffer) - and not is_url(path_or_buffer) - and not is_fsspec_url(path_or_buffer) + if not any( + [ + is_file_like(path_or_buffer), + file_exists(path_or_buffer), + is_url(path_or_buffer), + is_fsspec_url(path_or_buffer), + ] ): warnings.warn( "Passing literal xml to 'read_xml' is deprecated and " From 65f88b978e91c27d50d918ca8183b0d5761ad41a Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Tue, 27 Jun 2023 18:13:27 -0400 Subject: [PATCH 14/17] Updating implementation per reviewer recommendations. --- pandas/io/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 44c77f8e215ee..0adcedce24049 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -1127,7 +1127,7 @@ def read_xml( """ check_dtype_backend(dtype_backend) - if not any( + if isinstance(path_or_buffer, str) and not any( [ is_file_like(path_or_buffer), file_exists(path_or_buffer), From b7e1fb6418ad15822dd6f5f135a1e7bc82e5db11 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Sat, 8 Jul 2023 23:18:17 -0400 Subject: [PATCH 15/17] Updating unit tests --- pandas/tests/io/xml/test_xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 7879da01ff016..1a64d9910d8bf 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -313,7 +313,7 @@ def test_parser_consistency_file(xml_books): def test_parser_consistency_url(parser, httpserver): httpserver.serve_content(content=xml_default_nmsp) - df_xpath = read_xml(xml_default_nmsp, parser=parser) + df_xpath = read_xml(StringIO(xml_default_nmsp), parser=parser) df_iter = read_xml( BytesIO(xml_default_nmsp.encode()), parser=parser, @@ -1872,7 +1872,7 @@ def test_online_stylesheet(): """ df_xsl = read_xml( - xml, + StringIO(xml), xpath=".//tr[td and position() <= 6]", names=["title", "artist"], stylesheet=xsl, From 14d2cb159a2b12549c3c65ebf9dc61c719abb394 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Sun, 9 Jul 2023 12:09:00 -0400 Subject: [PATCH 16/17] Fixing discrepancy in doc string. --- pandas/io/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a7bb041a3028e..75e3ccdb66c0e 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -898,7 +898,7 @@ def read_xml( include http, ftp, s3, and file. .. deprecated:: 2.1.0 - Passing html literal strings is deprecated. + Passing xml literal strings is deprecated. Wrap literal xml input in ``io.StringIO`` or ``io.BytesIO`` instead. xpath : str, optional, default './\*' From c215a94098d4dbc314cbb64e75246fa032a59a01 Mon Sep 17 00:00:00 2001 From: Richard Howe Date: Tue, 11 Jul 2023 14:37:16 -0400 Subject: [PATCH 17/17] Updating implementation based on reviewer recommendations. --- pandas/io/xml.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 75e3ccdb66c0e..a58437fdeb8dc 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -805,6 +805,22 @@ def _parse( p: _EtreeFrameParser | _LxmlFrameParser + if isinstance(path_or_buffer, str) and not any( + [ + is_file_like(path_or_buffer), + file_exists(path_or_buffer), + is_url(path_or_buffer), + is_fsspec_url(path_or_buffer), + ] + ): + warnings.warn( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. To read from a " + "literal string, wrap it in a 'StringIO' object.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if parser == "lxml": lxml = import_optional_dependency("lxml.etree", errors="ignore") @@ -1127,22 +1143,6 @@ def read_xml( """ check_dtype_backend(dtype_backend) - if isinstance(path_or_buffer, str) and not any( - [ - is_file_like(path_or_buffer), - file_exists(path_or_buffer), - is_url(path_or_buffer), - is_fsspec_url(path_or_buffer), - ] - ): - warnings.warn( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return _parse( path_or_buffer=path_or_buffer, xpath=xpath,