diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b32416418a39f..c20369181c4ac 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -88,6 +88,48 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +.. _whatsnew_150.read_xml_dtypes: + +read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, +applying converter methods, and parsing dates (:issue:`43567`). + +.. ipython:: python + + xml_dates = """<?xml version='1.0' encoding='utf-8'?> + <data> + <row> + <shape>square</shape> + <degrees>00360</degrees> + <sides>4.0</sides> + <date>2020-01-01</date> + </row> + <row> + <shape>circle</shape> + <degrees>00360</degrees> + <sides/> + <date>2021-01-01</date> + </row> + <row> + <shape>triangle</shape> + <degrees>00180</degrees> + <sides>3.0</sides> + <date>2022-01-01</date> + </row> + </data>""" + + df = pd.read_xml( + xml_dates, + dtype={'sides': 'Int64'}, + converters={'degrees': str}, + parse_dates=['date'] + ) + df + df.dtypes + .. 
_whatsnew_150.api_breaking.other: Other API changes diff --git a/pandas/_typing.py b/pandas/_typing.py index fd099b3897bab..c0383fe50a7e7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -129,6 +129,14 @@ DtypeArg = Union[Dtype, Dict[Hashable, Dtype]] DtypeObj = Union[np.dtype, "ExtensionDtype"] +# converters +ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]] + +# parse_dates +ParseDatesArg = Union[ + bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]] +] + # For functions like rename that convert one label to another Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]] @@ -246,8 +254,6 @@ def closed(self) -> bool: CompressionOptions = Optional[ Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict] ] -XMLParsers = Literal["lxml", "etree"] - # types in DataFrameFormatter FormattersType = Union[ @@ -295,3 +301,6 @@ def closed(self) -> bool: # read_csv engines CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"] + +# read_xml parsers +XMLParsers = Literal["lxml", "etree"] diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ad87b18bd1683..8e463c94340c8 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -9,7 +9,10 @@ from pandas._typing import ( CompressionOptions, + ConvertersArg, + DtypeArg, FilePath, + ParseDatesArg, ReadBuffer, StorageOptions, XMLParsers, @@ -67,6 +70,23 @@ class _XMLFrameParser: names : list Column names for Data Frame of parsed XML data. + dtype : Type name or dict of column -> type + Data type for data or columns. E.g. {{'a': np.float64, + 'b': np.int32, 'c': 'Int64'}} + + .. versionadded:: 1.5.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels. + + .. versionadded:: 1.5.0 + + parse_dates : bool or list of int or names or list of lists or dict + Convert either the index or selected columns to datetimes. + + .. versionadded:: 1.5.0 + encoding : str Encoding of xml object or document. 
@@ -109,6 +129,9 @@ def __init__( elems_only: bool, attrs_only: bool, names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, encoding: str | None, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, compression: CompressionOptions, @@ -120,6 +143,9 @@ def __init__( self.elems_only = elems_only self.attrs_only = attrs_only self.names = names + self.dtype = dtype + self.converters = converters + self.parse_dates = parse_dates self.encoding = encoding self.stylesheet = stylesheet self.is_style = None @@ -671,6 +697,9 @@ def _parse( elems_only: bool, attrs_only: bool, names: Sequence[str] | None, + dtype: DtypeArg | None, + converters: ConvertersArg | None, + parse_dates: ParseDatesArg | None, encoding: str | None, parser: XMLParsers, stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, @@ -706,6 +735,9 @@ def _parse( elems_only, attrs_only, names, + dtype, + converters, + parse_dates, encoding, stylesheet, compression, @@ -722,6 +754,9 @@ def _parse( elems_only, attrs_only, names, + dtype, + converters, + parse_dates, encoding, stylesheet, compression, @@ -732,7 +767,13 @@ def _parse( data_dicts = p.parse_data() - return _data_to_frame(data=data_dicts, **kwargs) + return _data_to_frame( + data=data_dicts, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, + **kwargs, + ) @deprecate_nonkeyword_arguments( @@ -749,6 +790,9 @@ def read_xml( elems_only: bool = False, attrs_only: bool = False, names: Sequence[str] | None = None, + dtype: DtypeArg | None = None, + converters: ConvertersArg | None = None, + parse_dates: ParseDatesArg | None = None, # encoding can not be None for lxml and StringIO input encoding: str | None = "utf-8", parser: XMLParsers = "lxml", @@ -799,6 +843,35 @@ def read_xml( Column names for DataFrame of parsed XML data. Use this parameter to rename original element names and distinguish same named elements. 
+ dtype : Type name or dict of column -> type, optional + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} + Use `str` or `object` together with suitable `na_values` settings + to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 1.5.0 + + converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels. + + .. versionadded:: 1.5.0 + + parse_dates : bool or list of int or names or list of lists or dict, default None + Identifiers to parse index or columns to datetime. The behavior is as follows: + + * boolean. If True -> try parsing the index. + * list of int or names, e.g. [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists, e.g. [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + result 'foo'. + + .. versionadded:: 1.5.0 + encoding : str, optional, default 'utf-8' Encoding of XML document. 
@@ -942,6 +1015,9 @@ def read_xml( elems_only=elems_only, attrs_only=attrs_only, names=names, + dtype=dtype, + converters=converters, + parse_dates=parse_dates, encoding=encoding, parser=parser, stylesheet=stylesheet, diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py new file mode 100644 index 0000000000000..801461ed4288a --- /dev/null +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -0,0 +1,361 @@ +from __future__ import annotations + +import pytest + +from pandas.errors import ParserWarning +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Series, + to_datetime, +) +import pandas._testing as tm + +from pandas.io.xml import read_xml + + +@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"]) +def parser(request): + return request.param + + +xml_types = """\ + + + + square + 00360 + 4.0 + + + circle + 00360 + + + + triangle + 00180 + 3.0 + +""" + +xml_dates = """ + + + square + 00360 + 4.0 + 2020-01-01 + + + circle + 00360 + + 2021-01-01 + + + triangle + 00180 + 3.0 + 2022-01-01 + +""" + + +# DTYPE + + +def test_dtype_single_str(parser): + df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtypes_all_str(parser): + df_result = read_xml(xml_dates, dtype="string", parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": ["4.0", None, "3.0"], + "date": ["2020-01-01", "2021-01-01", "2022-01-01"], + }, + dtype="string", + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtypes_with_names(parser): + df_result = read_xml( + xml_dates, + names=["Col1", "Col2", "Col3", "Col4"], + dtype={"Col2": "string", "Col3": "Int64", 
"Col4": "datetime64"}, + parser=parser, + ) + + df_expected = DataFrame( + { + "Col1": ["square", "circle", "triangle"], + "Col2": Series(["00360", "00360", "00180"]).astype("string"), + "Col3": Series([4.0, float("nan"), 3.0]).astype("Int64"), + "Col4": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_nullable_int(parser): + df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": Series([4.0, float("nan"), 3.0]).astype("Int64"), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_dtype_float(parser): + df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": Series([360, 360, 180]).astype("float"), + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_dtype(parser): + with pytest.raises( + ValueError, match=('Unable to parse string "square" at position 0') + ): + read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser) + + +def test_both_dtype_converters(parser): + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 3.0], + } + ) + + with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"): + df_result = read_xml( + xml_types, + dtype={"degrees": "str"}, + converters={"degrees": str}, + parser=parser, + ) + + tm.assert_frame_equal(df_result, df_expected) + + +# CONVERTERS + + +def test_converters_str(parser): + df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": ["00360", "00360", "00180"], + "sides": [4.0, float("nan"), 
3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_converters_date(parser): + convert_to_datetime = lambda x: to_datetime(x) + df_result = read_xml( + xml_dates, converters={"date": convert_to_datetime}, parser=parser + ) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_converters_type(parser): + with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")): + read_xml(xml_types, converters={"degrees", str}, parser=parser) + + +def test_callable_func_converters(parser): + with pytest.raises(TypeError, match=("'float' object is not callable")): + read_xml(xml_types, converters={"degrees": float()}, parser=parser) + + +def test_callable_str_converters(parser): + with pytest.raises(TypeError, match=("'str' object is not callable")): + read_xml(xml_types, converters={"degrees": "float"}, parser=parser) + + +# PARSE DATES + + +def test_parse_dates_column_name(parser): + df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_column_index(parser): + df_result = read_xml(xml_dates, parse_dates=[3], parser=parser) + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_true(parser): + df_result = read_xml(xml_dates, parse_dates=True, parser=parser) + 
+ df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": ["2020-01-01", "2021-01-01", "2022-01-01"], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_parse_dates_dictionary(parser): + xml = """ + + + square + 360 + 4.0 + 2020 + 12 + 31 + + + circle + 360 + + 2021 + 12 + 31 + + + triangle + 180 + 3.0 + 2022 + 12 + 31 + +""" + + df_result = read_xml( + xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser + ) + + df_expected = DataFrame( + { + "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + } + ) + + tm.assert_frame_equal(df_result, df_expected) + + +def test_day_first_parse_dates(parser): + xml = """\ + + + + square + 00360 + 4.0 + 31/12/2020 + + + circle + 00360 + + 31/12/2021 + + + triangle + 00180 + 3.0 + 31/12/2022 + +""" + + df_expected = DataFrame( + { + "shape": ["square", "circle", "triangle"], + "degrees": [360, 360, 180], + "sides": [4.0, float("nan"), 3.0], + "date": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), + } + ) + + with tm.assert_produces_warning( + UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format" + ): + df_result = read_xml(xml, parse_dates=["date"], parser=parser) + tm.assert_frame_equal(df_result, df_expected) + + +def test_wrong_parse_dates_type(parser): + with pytest.raises( + TypeError, match=("Only booleans, lists, and dictionaries are accepted") + ): + read_xml(xml_dates, parse_dates={"date"}, parser=parser)