From e778d07dc8af665662d1e2ed9d8f2db7d0c03095 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sun, 16 Jan 2022 22:59:36 -0600 Subject: [PATCH 1/5] ENH: Add dtypes/converters arguments for pandas.read_xml --- doc/source/whatsnew/v1.4.0.rst | 41 +++++ pandas/io/xml.py | 93 +++++++++- pandas/tests/io/xml/test_xml.py | 317 +++++++++++++++++++++++++++++++- 3 files changed, 448 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 32ca3f6945d7f..d57ddad0b48c2 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -184,6 +184,47 @@ representation of :class:`DataFrame` objects (:issue:`4889`). df df.to_dict(orient='tight') +.. _whatsnew_140.read_xml_dtypes: + +read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, +apply converter methods, and parse dates. + +.. ipython:: python + + xml_dates = """ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + + """ + + df = pd.read_xml( + xml_dates, + dtype={'sides': 'Int64'}, + converters={'degrees': str}, + parse_dates=['date'] + ) + df + df.dtypes + .. _whatsnew_140.enhancements.other: Other enhancements diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ad87b18bd1683..7add71df0e872 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,10 +5,14 @@ from __future__ import annotations import io -from typing import Sequence +from typing import ( + Callable, + Sequence, +) from pandas._typing import ( CompressionOptions, + DtypeArg, FilePath, ReadBuffer, StorageOptions, @@ -67,6 +71,23 @@ class _XMLFrameParser: names : list Column names for Data Frame of parsed XML data. + dtype : dict + Data type for data or columns. E.g. {{'a': np.float64, + 'b': np.int32, 'c': 'Int64'}} + + .. 
versionadded:: 1.4.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels. + + .. versionadded:: 1.4.0 + + parse_dates : bool or list of int or names or list of lists or dict + Converts either index or select columns to datetimes + + .. versionadded:: 1.4.0 + encoding : str Encoding of xml object or document. @@ -109,6 +130,13 @@ def __init__( elems_only: bool, attrs_only: bool, names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: dict[str, Callable] | None, + parse_dates: bool + | list[int | str] + | list[list[int | str]] + | dict[str, list[int | str]] + | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, @@ -120,6 +148,9 @@ def __init__( self.elems_only = elems_only self.attrs_only = attrs_only self.names = names + self.dtype = dtype + self.converters = converters + self.parse_dates = parse_dates self.encoding = encoding self.stylesheet = stylesheet self.is_style = None @@ -671,6 +702,13 @@ def _parse( elems_only: bool, attrs_only: bool, names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: dict[str, Callable] | None, + parse_dates: bool + | list[int | str] + | list[list[int | str]] + | dict[str, list[int | str]] + | None, encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, @@ -706,6 +744,9 @@ def _parse( elems_only, attrs_only, names, + dtype, + converters, + parse_dates, encoding, stylesheet, compression, @@ -722,6 +763,9 @@ def _parse( elems_only, attrs_only, names, + dtype, + converters, + parse_dates, encoding, stylesheet, compression, @@ -732,7 +776,13 @@ def _parse( data_dicts = p.parse_data() - return _data_to_frame(data=data_dicts, **kwargs) + return _data_to_frame( + data=data_dicts, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, + **kwargs, + ) @deprecate_nonkeyword_arguments( @@ 
-749,6 +799,13 @@ def read_xml( elems_only: bool = False, attrs_only: bool = False, names: Sequence[str] | None = None, + dtype: DtypeArg | None = None, + converters: dict[str, Callable] | None = None, + parse_dates: bool + | list[int | str] + | list[list[int | str]] + | dict[str, list[int | str]] + | None = None, # encoding can not be None for lxml and StringIO input encoding: str | None = "utf-8", parser: XMLParsers = "lxml", @@ -799,6 +856,35 @@ def read_xml( Column names for DataFrame of parsed XML data. Use this parameter to rename original element names and distinguish same named elements. + dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 1.4.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. + + .. versionadded:: 1.4.0 + + parse_dates : bool or list of int or names or list of lists or dict, default False + The behavior is as follows: + + * boolean. If True -> try parsing the index. + * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo' + + .. versionadded:: 1.4.0 + encoding : str, optional, default 'utf-8' Encoding of XML document. 
@@ -942,6 +1028,9 @@ def read_xml( elems_only=elems_only, attrs_only=attrs_only, names=names, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, encoding=encoding, parser=parser, stylesheet=stylesheet, diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 8809c423a29ba..aa1a3418666f0 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -13,9 +13,14 @@ import pytest from pandas.compat._optional import import_optional_dependency +from pandas.errors import ParserWarning import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, + to_datetime, +) import pandas._testing as tm from pandas.io.xml import read_xml @@ -231,6 +236,48 @@ } ) +xml_types = """\ + + + + square + 00360 + 4.0 + + + circle + 00360 + + + + triangle + 00180 + 3.0 + +""" + +xml_dates = """ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + @pytest.fixture(params=["rb", "r"]) def mode(request): @@ -687,6 +734,274 @@ def test_names_option_wrong_type(datapath, parser): read_xml(filename, names="Col1, Col2, Col3", parser=parser) +# DTYPE + + +def test_dtype_single_str(parser): + df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtypes_all_str(parser): + df_result = read_xml(xml_dates, dtype="string") + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": ["4.0", None, "3.0"], + "date": ["2020-01-01", "2021-01-01", "2022-01-01"], + }, + dtype="string", + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_nullable_int(parser): + df_result = 
read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": Series([4.0, float("nan"), 3.0]).astype("Int64"), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_float(parser): + df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": Series([360, 360, 180]).astype("float"), + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_dtype(parser): + with pytest.raises( + ValueError, match=('Unable to parse string "square" at position 0') + ): + read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) + + +def test_both_dtype_converters(parser): + df_result = read_xml( + xml_types, dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): + read_xml( + xml_types, + dtype={"degrees": "str"}, + converters={"degrees": str}, + parser=parser, + ) + + +# CONVERTERS + + +def test_converters_str(parser): + df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_converters_date(parser): + convert_to_datetime = lambda x: to_datetime(x) + df_result = read_xml(xml_dates, converters={"date": convert_to_datetime}) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 
180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_converters_type(parser): + with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): + read_xml(xml_types, converters={"degrees", str}, parser=parser) + + +def test_callable_func_converters(parser): + with pytest.raises(TypeError, match=("'float' object is not callable")): + read_xml(xml_types, converters={"degrees": float()}, parser=parser) + + +def test_callable_str_converters(parser): + with pytest.raises(TypeError, match=("'str' object is not callable")): + read_xml(xml_types, converters={"degrees": "float"}, parser=parser) + + +# PARSE DATES + + +def test_parse_dates_column_name(parser): + df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_column_index(parser): + df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_dictionary(parser): + xml = """ + + + square + 360 + 4.0 + 2020 + 12 + 31 + + + circle + 360 + + 2021 + 12 + 31 + + + triangle + 180 + 3.0 + 2022 + 12 + 31 + +""" + + df_result = read_xml( + xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser + ) + + df_expected = DataFrame( + { + "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + "shape": ["square", "circle", "triangle"], + "degrees": [360, 
360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_day_first_parse_dates(parser): + xml = """\ + + + + square + 00360 + 4.0 + 31/12/2020 + + + circle + 00360 + + 31/12/2021 + + + triangle + 00180 + 3.0 + 31/12/2022 + +""" + + df_result = read_xml(xml, parse_dates=["date"], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + with tm.assert_produces_warning( + UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" + ): + read_xml(xml, parse_dates=["date"], parser=parser) + + +def test_wrong_parse_dates_type(parser): + with pytest.raises( + TypeError, match=("Only booleans, lists, and dictionaries are accepted") + ): + read_xml(xml_dates, parse_dates={"date"}, parser=parser) + + # ENCODING From ef88558de74c779613bd205677086ba02882e4c9 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 17 Jan 2022 00:01:30 -0600 Subject: [PATCH 2/5] Fix missing fixture param on tests --- pandas/tests/io/xml/test_xml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index aa1a3418666f0..f4024fb51208a 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -752,7 +752,7 @@ def test_dtype_single_str(parser): def test_dtypes_all_str(parser): - df_result = read_xml(xml_dates, dtype="string") + df_result = read_xml(xml_dates, dtype="string", parser=parser) df_expected = DataFrame( { @@ -845,7 +845,9 @@ def test_converters_str(parser): def test_converters_date(parser): convert_to_datetime = lambda x: to_datetime(x) - df_result = read_xml(xml_dates, converters={"date": convert_to_datetime}) + df_result = read_xml( + xml_dates, converters={"date": 
convert_to_datetime}, parser=parser + ) df_expected = DataFrame( { From 6787a5912dd69a694af3ac908d660e267afb0508 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 17 Jan 2022 11:26:06 -0600 Subject: [PATCH 3/5] Update whats_new version, move dtypes tests to new file and add tests --- doc/source/whatsnew/v1.4.0.rst | 41 --- doc/source/whatsnew/v1.5.0.rst | 42 +++ pandas/io/xml.py | 14 +- pandas/tests/io/xml/test_xml.py | 319 +-------------------- pandas/tests/io/xml/test_xml_dtypes.py | 368 +++++++++++++++++++++++++ 5 files changed, 418 insertions(+), 366 deletions(-) create mode 100644 pandas/tests/io/xml/test_xml_dtypes.py diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d57ddad0b48c2..32ca3f6945d7f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -184,47 +184,6 @@ representation of :class:`DataFrame` objects (:issue:`4889`). df df.to_dict(orient='tight') -.. _whatsnew_140.read_xml_dtypes: - -read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, -apply converter methods, and parse dates. - -.. ipython:: python - - xml_dates = """ - - - square - 00360 - 4.0 - 2020-01-01 - - - circle - 00360 - - 2021-01-01 - - - triangle - 00180 - 3.0 - 2022-01-01 - - """ - - df = pd.read_xml( - xml_dates, - dtype={'sides': 'Int64'}, - converters={'degrees': str}, - parse_dates=['date'] - ) - df - df.dtypes - .. 
_whatsnew_140.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 290f2e0ae08b6..7a7ab197df08d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -82,6 +82,48 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +.. _whatsnew_140.read_xml_dtypes: + +read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, +apply converter methods, and parse dates. + +.. ipython:: python + + xml_dates = """ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + + """ + + df = pd.read_xml( + xml_dates, + dtype={'sides': 'Int64'}, + converters={'degrees': str}, + parse_dates=['date'] + ) + df + df.dtypes + .. _whatsnew_150.api_breaking.other: Other API changes diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 7add71df0e872..1af76c75b7dc8 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -75,18 +75,18 @@ class _XMLFrameParser: Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, 'c': 'Int64'}} - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 parse_dates : bool or list of int or names or list of lists or dict Converts either index or select columns to datetimes - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 encoding : str Encoding of xml object or document. @@ -864,16 +864,16 @@ def read_xml( If converters are specified, they will be applied INSTEAD of dtype conversion. - .. versionadded:: 1.4.0 + .. 
versionadded:: 1.5.0 converters : dict, optional Dict of functions for converting values in certain columns. Keys can either be integers or column labels. - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 parse_dates : bool or list of int or names or list of lists or dict, default False - The behavior is as follows: + Identifiers to parse index or columns to datetime. The behavior is as follows: * boolean. If True -> try parsing the index. * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 @@ -883,7 +883,7 @@ def read_xml( * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' - .. versionadded:: 1.4.0 + .. versionadded:: 1.5.0 encoding : str, optional, default 'utf-8' Encoding of XML document. diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index f4024fb51208a..8809c423a29ba 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -13,14 +13,9 @@ import pytest from pandas.compat._optional import import_optional_dependency -from pandas.errors import ParserWarning import pandas.util._test_decorators as td -from pandas import ( - DataFrame, - Series, - to_datetime, -) +from pandas import DataFrame import pandas._testing as tm from pandas.io.xml import read_xml @@ -236,48 +231,6 @@ } ) -xml_types = """\ - - - - square - 00360 - 4.0 - - - circle - 00360 - - - - triangle - 00180 - 3.0 - -""" - -xml_dates = """ - - - square - 00360 - 4.0 - 2020-01-01 - - - circle - 00360 - - 2021-01-01 - - - triangle - 00180 - 3.0 - 2022-01-01 - -""" - @pytest.fixture(params=["rb", "r"]) def mode(request): @@ -734,276 +687,6 @@ def test_names_option_wrong_type(datapath, parser): read_xml(filename, names="Col1, Col2, Col3", parser=parser) -# DTYPE - - -def test_dtype_single_str(parser): - df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": ["00360", "00360", "00180"], 
- "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_dtypes_all_str(parser): - df_result = read_xml(xml_dates, dtype="string", parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": ["00360", "00360", "00180"], - "sides": ["4.0", None, "3.0"], - "date": ["2020-01-01", "2021-01-01", "2022-01-01"], - }, - dtype="string", - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_dtype_nullable_int(parser): - df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": Series([4.0, float("nan"), 3.0]).astype("Int64"), - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_dtype_float(parser): - df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": Series([360, 360, 180]).astype("float"), - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_wrong_dtype(parser): - with pytest.raises( - ValueError, match=('Unable to parse string "square" at position 0') - ): - read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) - - -def test_both_dtype_converters(parser): - df_result = read_xml( - xml_types, dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser - ) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": ["00360", "00360", "00180"], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): - read_xml( - xml_types, - dtype={"degrees": "str"}, - converters={"degrees": str}, - parser=parser, - ) - - -# CONVERTERS - - -def test_converters_str(parser): - df_result = 
read_xml(xml_types, converters={"degrees": str}, parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": ["00360", "00360", "00180"], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_converters_date(parser): - convert_to_datetime = lambda x: to_datetime(x) - df_result = read_xml( - xml_dates, converters={"date": convert_to_datetime}, parser=parser - ) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_wrong_converters_type(parser): - with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): - read_xml(xml_types, converters={"degrees", str}, parser=parser) - - -def test_callable_func_converters(parser): - with pytest.raises(TypeError, match=("'float' object is not callable")): - read_xml(xml_types, converters={"degrees": float()}, parser=parser) - - -def test_callable_str_converters(parser): - with pytest.raises(TypeError, match=("'str' object is not callable")): - read_xml(xml_types, converters={"degrees": "float"}, parser=parser) - - -# PARSE DATES - - -def test_parse_dates_column_name(parser): - df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_parse_dates_column_index(parser): - df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": 
to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_parse_dates_dictionary(parser): - xml = """ - - - square - 360 - 4.0 - 2020 - 12 - 31 - - - circle - 360 - - 2021 - 12 - 31 - - - triangle - 180 - 3.0 - 2022 - 12 - 31 - -""" - - df_result = read_xml( - xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser - ) - - df_expected = DataFrame( - { - "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - -def test_day_first_parse_dates(parser): - xml = """\ - - - - square - 00360 - 4.0 - 31/12/2020 - - - circle - 00360 - - 31/12/2021 - - - triangle - 00180 - 3.0 - 31/12/2022 - -""" - - df_result = read_xml(xml, parse_dates=["date"], parser=parser) - - df_expected = DataFrame( - { - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - "date": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), - } - ) - - tm.assert_frame_equal(df_result, df_expected) - - with tm.assert_produces_warning( - UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" - ): - read_xml(xml, parse_dates=["date"], parser=parser) - - -def test_wrong_parse_dates_type(parser): - with pytest.raises( - TypeError, match=("Only booleans, lists, and dictionaries are accepted") - ): - read_xml(xml_dates, parse_dates={"date"}, parser=parser) - - # ENCODING diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py new file mode 100644 index 0000000000000..8fbdff898a9d1 --- /dev/null +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -0,0 +1,368 @@ +from __future__ import annotations + +import pytest + +from pandas.errors import ParserWarning +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Series, + 
to_datetime, +) +import pandas._testing as tm + +from pandas.io.xml import read_xml + + +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) +def parser(request): + return request.param + + +xml_types = """\ + + + + square + 00360 + 4.0 + + + circle + 00360 + + + + triangle + 00180 + 3.0 + +""" + +xml_dates = """ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + + +# DTYPE + + +def test_dtype_single_str(parser): + df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtypes_all_str(parser): + df_result = read_xml(xml_dates, dtype="string", parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": ["4.0", None, "3.0"], + "date": ["2020-01-01", "2021-01-01", "2022-01-01"], + }, + dtype="string", + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtypes_with_names(parser): + df_result = read_xml( + xml_dates, + names=["Col1", "Col2", "Col3", "Col4"], + dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "Col1": ["square", "circle", "triangle"], + "Col2": Series(["00360", "00360", "00180"]).astype("string"), + "Col3": Series([4.0, float("nan"), 3.0]).astype("Int64"), + "Col4": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_nullable_int(parser): + df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": Series([4.0, float("nan"), 
3.0]).astype("Int64"), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_float(parser): + df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": Series([360, 360, 180]).astype("float"), + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_dtype(parser): + with pytest.raises( + ValueError, match=('Unable to parse string "square" at position 0') + ): + read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) + + +def test_both_dtype_converters(parser): + df_result = read_xml( + xml_types, dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): + read_xml( + xml_types, + dtype={"degrees": "str"}, + converters={"degrees": str}, + parser=parser, + ) + + +# CONVERTERS + + +def test_converters_str(parser): + df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_converters_date(parser): + convert_to_datetime = lambda x: to_datetime(x) + df_result = read_xml( + xml_dates, converters={"date": convert_to_datetime}, parser=parser + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def 
test_wrong_converters_type(parser): + with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): + read_xml(xml_types, converters={"degrees", str}, parser=parser) + + +def test_callable_func_converters(parser): + with pytest.raises(TypeError, match=("'float' object is not callable")): + read_xml(xml_types, converters={"degrees": float()}, parser=parser) + + +def test_callable_str_converters(parser): + with pytest.raises(TypeError, match=("'str' object is not callable")): + read_xml(xml_types, converters={"degrees": "float"}, parser=parser) + + +# PARSE DATES + + +def test_parse_dates_column_name(parser): + df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_column_index(parser): + df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_true(parser): + df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": ["2020-01-01", "2021-01-01", "2022-01-01"], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_dictionary(parser): + xml = """ + + + square + 360 + 4.0 + 2020 + 12 + 31 + + + circle + 360 + + 2021 + 12 + 31 + + + triangle + 180 + 3.0 + 2022 + 12 + 31 + +""" + + df_result = read_xml( + xml, parse_dates={"date_end": ["year", "month", 
"day"]}, parser=parser + ) + + df_expected = DataFrame( + { + "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_day_first_parse_dates(parser): + xml = """\ + + + + square + 00360 + 4.0 + 31/12/2020 + + + circle + 00360 + + 31/12/2021 + + + triangle + 00180 + 3.0 + 31/12/2022 + +""" + + df_result = read_xml(xml, parse_dates=["date"], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + with tm.assert_produces_warning( + UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" + ): + read_xml(xml, parse_dates=["date"], parser=parser) + + +def test_wrong_parse_dates_type(parser): + with pytest.raises( + TypeError, match=("Only booleans, lists, and dictionaries are accepted") + ): + read_xml(xml_dates, parse_dates={"date"}, parser=parser) From d3ffe37b316e38ca1744f13e54a9b2886057ee47 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Mon, 17 Jan 2022 19:51:08 -0600 Subject: [PATCH 4/5] Fix tests with warnings assertions --- pandas/tests/io/xml/test_xml_dtypes.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 8fbdff898a9d1..801461ed4288a 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -152,10 +152,6 @@ def test_wrong_dtype(parser): def test_both_dtype_converters(parser): - df_result = read_xml( - xml_types, dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser - ) - df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -164,16 
+160,16 @@ def test_both_dtype_converters(parser): } ) - tm.assert_frame_equal(df_result, df_expected) - with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): - read_xml( + df_result = read_xml( xml_types, dtype={"degrees": "str"}, converters={"degrees": str}, parser=parser, ) + tm.assert_frame_equal(df_result, df_expected) + # CONVERTERS @@ -342,8 +338,6 @@ def test_day_first_parse_dates(parser): """ - df_result = read_xml(xml, parse_dates=["date"], parser=parser) - df_expected = DataFrame( { "shape": ["square", "circle", "triangle"], @@ -353,12 +347,11 @@ def test_day_first_parse_dates(parser): } ) - tm.assert_frame_equal(df_result, df_expected) - with tm.assert_produces_warning( UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" ): - read_xml(xml, parse_dates=["date"], parser=parser) + df_result = read_xml(xml, parse_dates=["date"], parser=parser) + tm.assert_frame_equal(df_result, df_expected) def test_wrong_parse_dates_type(parser): From eabaa4de7c4c2fa8f504b3d323d22c567e72f855 Mon Sep 17 00:00:00 2001 From: Parfait Gasana Date: Sat, 22 Jan 2022 21:52:36 -0600 Subject: [PATCH 5/5] Add new converters and parse_dates typing aliases --- doc/source/whatsnew/v1.5.0.rst | 4 ++-- pandas/_typing.py | 13 +++++++++++-- pandas/io/xml.py | 31 +++++++++---------------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8ac265f5c27ac..495e90c94b10d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -86,13 +86,13 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -.. _whatsnew_140.read_xml_dtypes: +.. 
_whatsnew_150.read_xml_dtypes: read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, -apply converter methods, and parse dates. +applying converter methods, and parsing dates (:issue:`43567`). .. ipython:: python diff --git a/pandas/_typing.py b/pandas/_typing.py index fd099b3897bab..c0383fe50a7e7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -129,6 +129,14 @@ DtypeArg = Union[Dtype, Dict[Hashable, Dtype]] DtypeObj = Union[np.dtype, "ExtensionDtype"] +# converters +ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]] + +# parse_dates +ParseDatesArg = Union[ + bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]] +] + # For functions like rename that convert one label to another Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]] @@ -246,8 +254,6 @@ def closed(self) -> bool: CompressionOptions = Optional[ Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict] ] -XMLParsers = Literal["lxml", "etree"] - # types in DataFrameFormatter FormattersType = Union[ @@ -295,3 +301,6 @@ def closed(self) -> bool: # read_csv engines CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"] + +# read_xml parsers +XMLParsers = Literal["lxml", "etree"] diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 1af76c75b7dc8..8e463c94340c8 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -5,15 +5,14 @@ from __future__ import annotations import io -from typing import ( - Callable, - Sequence, -) +from typing import Sequence from pandas._typing import ( CompressionOptions, + ConvertersArg, DtypeArg, FilePath, + ParseDatesArg, ReadBuffer, StorageOptions, XMLParsers, @@ -131,12 +130,8 @@ def __init__( attrs_only: bool, names: Sequence[str] | None, dtype: DtypeArg | None, - converters: dict[str, Callable] | None, - parse_dates:
bool - | list[int | str] - | list[list[int | str]] - | dict[str, list[int | str]] - | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, @@ -703,12 +698,8 @@ def _parse( attrs_only: bool, names: Sequence[str] | None, dtype: DtypeArg | None, - converters: dict[str, Callable] | None, - parse_dates: bool - | list[int | str] - | list[list[int | str]] - | dict[str, list[int | str]] - | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, @@ -800,12 +791,8 @@ def read_xml( attrs_only: bool = False, names: Sequence[str] | None = None, dtype: DtypeArg | None = None, - converters: dict[str, Callable] | None = None, - parse_dates: bool - | list[int | str] - | list[list[int | str]] - | dict[str, list[int | str]] - | None = None, + converters: ConvertersArg | None = None, + parse_dates: ParseDatesArg | None = None, # encoding can not be None for lxml and StringIO input encoding: str | None = "utf-8", parser: XMLParsers = "lxml",