diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index b32416418a39f..c20369181c4ac 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -88,6 +88,48 @@ Optional libraries below the lowest tested version may still work, but are not c
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+
+.. _whatsnew_150.read_xml_dtypes:
+
+read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
+applying converter methods, and parsing dates (:issue:`43567`).
+
+.. ipython:: python
+
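+ xml_dates = """<?xml version='1.0' encoding='utf-8'?>
+ <data>
+   <row>
+     <shape>square</shape>
+     <degrees>00360</degrees>
+     <sides>4.0</sides>
+     <date>2020-01-01</date>
+   </row>
+   <row>
+     <shape>circle</shape>
+     <degrees>00360</degrees>
+     <sides/>
+     <date>2021-01-01</date>
+   </row>
+   <row>
+     <shape>triangle</shape>
+     <degrees>00180</degrees>
+     <sides>3.0</sides>
+     <date>2022-01-01</date>
+   </row>
+ </data>"""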
+
+ df = pd.read_xml(
+ xml_dates,
+ dtype={'sides': 'Int64'},
+ converters={'degrees': str},
+ parse_dates=['date']
+ )
+ df
+ df.dtypes
+
.. _whatsnew_150.api_breaking.other:
Other API changes
diff --git a/pandas/_typing.py b/pandas/_typing.py
index fd099b3897bab..c0383fe50a7e7 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -129,6 +129,14 @@
DtypeArg = Union[Dtype, Dict[Hashable, Dtype]]
DtypeObj = Union[np.dtype, "ExtensionDtype"]
+# converters
+ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]]
+
+# parse_dates
+ParseDatesArg = Union[
+ bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]]
+]
+
# For functions like rename that convert one label to another
Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]]
@@ -246,8 +254,6 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
-XMLParsers = Literal["lxml", "etree"]
-
# types in DataFrameFormatter
FormattersType = Union[
@@ -295,3 +301,6 @@ def closed(self) -> bool:
# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
+
+# read_xml parsers
+XMLParsers = Literal["lxml", "etree"]
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index ad87b18bd1683..8e463c94340c8 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -9,7 +9,10 @@
from pandas._typing import (
CompressionOptions,
+ ConvertersArg,
+ DtypeArg,
FilePath,
+ ParseDatesArg,
ReadBuffer,
StorageOptions,
XMLParsers,
@@ -67,6 +70,23 @@ class _XMLFrameParser:
names : list
Column names for Data Frame of parsed XML data.
+ dtype : dict
+ Data type for data or columns. E.g. {{'a': np.float64,
+ 'b': np.int32, 'c': 'Int64'}}
+
+ .. versionadded:: 1.5.0
+
+ converters : dict, optional
+ Dict of functions for converting values in certain columns. Keys can
+ either be integers or column labels.
+
+ .. versionadded:: 1.5.0
+
+ parse_dates : bool or list of int or names or list of lists or dict
+ Converts either index or selected columns to datetimes.
+
+ .. versionadded:: 1.5.0
+
encoding : str
Encoding of xml object or document.
@@ -109,6 +129,9 @@ def __init__(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
+ dtype: DtypeArg | None,
+ converters: ConvertersArg | None,
+ parse_dates: ParseDatesArg | None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
@@ -120,6 +143,9 @@ def __init__(
self.elems_only = elems_only
self.attrs_only = attrs_only
self.names = names
+ self.dtype = dtype
+ self.converters = converters
+ self.parse_dates = parse_dates
self.encoding = encoding
self.stylesheet = stylesheet
self.is_style = None
@@ -671,6 +697,9 @@ def _parse(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
+ dtype: DtypeArg | None,
+ converters: ConvertersArg | None,
+ parse_dates: ParseDatesArg | None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
@@ -706,6 +735,9 @@ def _parse(
elems_only,
attrs_only,
names,
+ dtype,
+ converters,
+ parse_dates,
encoding,
stylesheet,
compression,
@@ -722,6 +754,9 @@ def _parse(
elems_only,
attrs_only,
names,
+ dtype,
+ converters,
+ parse_dates,
encoding,
stylesheet,
compression,
@@ -732,7 +767,13 @@ def _parse(
data_dicts = p.parse_data()
- return _data_to_frame(data=data_dicts, **kwargs)
+ return _data_to_frame(
+ data=data_dicts,
+ dtype=dtype,
+ converters=converters,
+ parse_dates=parse_dates,
+ **kwargs,
+ )
@deprecate_nonkeyword_arguments(
@@ -749,6 +790,9 @@ def read_xml(
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
+ dtype: DtypeArg | None = None,
+ converters: ConvertersArg | None = None,
+ parse_dates: ParseDatesArg | None = None,
# encoding can not be None for lxml and StringIO input
encoding: str | None = "utf-8",
parser: XMLParsers = "lxml",
@@ -799,6 +843,35 @@ def read_xml(
Column names for DataFrame of parsed XML data. Use this parameter to
rename original element names and distinguish same named elements.
+ dtype : Type name or dict of column -> type, optional
+ Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
+ 'c': 'Int64'}}
+ Use `str` or `object` together with suitable `na_values` settings
+ to preserve and not interpret dtype.
+ If converters are specified, they will be applied INSTEAD
+ of dtype conversion.
+
+ .. versionadded:: 1.5.0
+
+ converters : dict, optional
+ Dict of functions for converting values in certain columns. Keys can either
+ be integers or column labels.
+
+ .. versionadded:: 1.5.0
+
+ parse_dates : bool or list of int or names or list of lists or dict, default None
+ Identifiers to parse index or columns to datetime. The behavior is as follows:
+
+ * boolean. If True -> try parsing the index.
+ * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
+ each as a separate date column.
+ * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
+ a single date column.
+ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
+ result 'foo'
+
+ .. versionadded:: 1.5.0
+
encoding : str, optional, default 'utf-8'
Encoding of XML document.
@@ -942,6 +1015,9 @@ def read_xml(
elems_only=elems_only,
attrs_only=attrs_only,
names=names,
+ dtype=dtype,
+ converters=converters,
+ parse_dates=parse_dates,
encoding=encoding,
parser=parser,
stylesheet=stylesheet,
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
new file mode 100644
index 0000000000000..801461ed4288a
--- /dev/null
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -0,0 +1,361 @@
+from __future__ import annotations
+
+import pytest
+
+from pandas.errors import ParserWarning
+import pandas.util._test_decorators as td
+
+from pandas import (
+ DataFrame,
+ Series,
+ to_datetime,
+)
+import pandas._testing as tm
+
+from pandas.io.xml import read_xml
+
+
+@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"])
+def parser(request):
+ return request.param
+
+
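+xml_types = """\
+<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+  </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+  </row>
+</data>"""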
+
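+xml_dates = """<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+    <date>2020-01-01</date>
+  </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+    <date>2021-01-01</date>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+    <date>2022-01-01</date>
+  </row>
+</data>"""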
+
+
+# DTYPE
+
+
+def test_dtype_single_str(parser):
+ df_result = read_xml(xml_types, dtype={"degrees": "str"}, parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": ["00360", "00360", "00180"],
+ "sides": [4.0, float("nan"), 3.0],
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_dtypes_all_str(parser):
+ df_result = read_xml(xml_dates, dtype="string", parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": ["00360", "00360", "00180"],
+ "sides": ["4.0", None, "3.0"],
+ "date": ["2020-01-01", "2021-01-01", "2022-01-01"],
+ },
+ dtype="string",
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_dtypes_with_names(parser):
+ df_result = read_xml(
+ xml_dates,
+ names=["Col1", "Col2", "Col3", "Col4"],
+ dtype={"Col2": "string", "Col3": "Int64", "Col4": "datetime64"},
+ parser=parser,
+ )
+
+ df_expected = DataFrame(
+ {
+ "Col1": ["square", "circle", "triangle"],
+ "Col2": Series(["00360", "00360", "00180"]).astype("string"),
+ "Col3": Series([4.0, float("nan"), 3.0]).astype("Int64"),
+ "Col4": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_dtype_nullable_int(parser):
+ df_result = read_xml(xml_types, dtype={"sides": "Int64"}, parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": Series([4.0, float("nan"), 3.0]).astype("Int64"),
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_dtype_float(parser):
+ df_result = read_xml(xml_types, dtype={"degrees": "float"}, parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": Series([360, 360, 180]).astype("float"),
+ "sides": [4.0, float("nan"), 3.0],
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_wrong_dtype(parser):
+ with pytest.raises(
+ ValueError, match=('Unable to parse string "square" at position 0')
+ ):
+ read_xml(xml_types, dtype={"shape": "Int64"}, parser=parser)
+
+
+def test_both_dtype_converters(parser):
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": ["00360", "00360", "00180"],
+ "sides": [4.0, float("nan"), 3.0],
+ }
+ )
+
+ with tm.assert_produces_warning(ParserWarning, match="Both a converter and dtype"):
+ df_result = read_xml(
+ xml_types,
+ dtype={"degrees": "str"},
+ converters={"degrees": str},
+ parser=parser,
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+# CONVERTERS
+
+
+def test_converters_str(parser):
+ df_result = read_xml(xml_types, converters={"degrees": str}, parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": ["00360", "00360", "00180"],
+ "sides": [4.0, float("nan"), 3.0],
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_converters_date(parser):
+ convert_to_datetime = lambda x: to_datetime(x)
+ df_result = read_xml(
+ xml_dates, converters={"date": convert_to_datetime}, parser=parser
+ )
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_wrong_converters_type(parser):
+ with pytest.raises(TypeError, match=("Type converters must be a dict or subclass")):
+ read_xml(xml_types, converters={"degrees", str}, parser=parser)
+
+
+def test_callable_func_converters(parser):
+ with pytest.raises(TypeError, match=("'float' object is not callable")):
+ read_xml(xml_types, converters={"degrees": float()}, parser=parser)
+
+
+def test_callable_str_converters(parser):
+ with pytest.raises(TypeError, match=("'str' object is not callable")):
+ read_xml(xml_types, converters={"degrees": "float"}, parser=parser)
+
+
+# PARSE DATES
+
+
+def test_parse_dates_column_name(parser):
+ df_result = read_xml(xml_dates, parse_dates=["date"], parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_parse_dates_column_index(parser):
+ df_result = read_xml(xml_dates, parse_dates=[3], parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ "date": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_parse_dates_true(parser):
+ df_result = read_xml(xml_dates, parse_dates=True, parser=parser)
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ "date": ["2020-01-01", "2021-01-01", "2022-01-01"],
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_parse_dates_dictionary(parser):
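+ xml = """<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <shape>square</shape>
+    <degrees>360</degrees>
+    <sides>4.0</sides>
+    <year>2020</year>
+    <month>12</month>
+    <day>31</day>
+  </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>360</degrees>
+    <sides/>
+    <year>2021</year>
+    <month>12</month>
+    <day>31</day>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>180</degrees>
+    <sides>3.0</sides>
+    <year>2022</year>
+    <month>12</month>
+    <day>31</day>
+  </row>
+</data>"""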
+
+ df_result = read_xml(
+ xml, parse_dates={"date_end": ["year", "month", "day"]}, parser=parser
+ )
+
+ df_expected = DataFrame(
+ {
+ "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]),
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ }
+ )
+
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_day_first_parse_dates(parser):
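+ xml = """\
+<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <shape>square</shape>
+    <degrees>00360</degrees>
+    <sides>4.0</sides>
+    <date>31/12/2020</date>
+  </row>
+  <row>
+    <shape>circle</shape>
+    <degrees>00360</degrees>
+    <sides/>
+    <date>31/12/2021</date>
+  </row>
+  <row>
+    <shape>triangle</shape>
+    <degrees>00180</degrees>
+    <sides>3.0</sides>
+    <date>31/12/2022</date>
+  </row>
+</data>"""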
+
+ df_expected = DataFrame(
+ {
+ "shape": ["square", "circle", "triangle"],
+ "degrees": [360, 360, 180],
+ "sides": [4.0, float("nan"), 3.0],
+ "date": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]),
+ }
+ )
+
+ with tm.assert_produces_warning(
+ UserWarning, match="Parsing '31/12/2020' in DD/MM/YYYY format"
+ ):
+ df_result = read_xml(xml, parse_dates=["date"], parser=parser)
+ tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_wrong_parse_dates_type(parser):
+ with pytest.raises(
+ TypeError, match=("Only booleans, lists, and dictionaries are accepted")
+ ):
+ read_xml(xml_dates, parse_dates={"date"}, parser=parser)