Skip to content

ENH: Add dtypes/converters arguments for pandas.read_xml #45411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 23, 2022
42 changes: 42 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,48 @@ Optional libraries below the lowest tested version may still work, but are not c

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.


.. _whatsnew_150.read_xml_dtypes:

read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
applying converter methods, and parsing dates (:issue:`43567`).

.. ipython:: python

xml_dates = """<?xml version='1.0' encoding='utf-8'?>
<data>
<row>
<shape>square</shape>
<degrees>00360</degrees>
<sides>4.0</sides>
<date>2020-01-01</date>
</row>
<row>
<shape>circle</shape>
<degrees>00360</degrees>
<sides/>
<date>2021-01-01</date>
</row>
<row>
<shape>triangle</shape>
<degrees>00180</degrees>
<sides>3.0</sides>
<date>2022-01-01</date>
</row>
</data>"""

df = pd.read_xml(
xml_dates,
dtype={'sides': 'Int64'},
converters={'degrees': str},
parse_dates=['date']
)
df
df.dtypes

.. _whatsnew_150.api_breaking.other:

Other API changes
Expand Down
13 changes: 11 additions & 2 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@
DtypeArg = Union[Dtype, Dict[Hashable, Dtype]]
DtypeObj = Union[np.dtype, "ExtensionDtype"]

# converters: maps a column key (integer position or column label) to the
# function used to convert the values in that column
ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]]

# parse_dates: a bool (True -> try parsing the index), a list of column
# positions/names (each parsed as a separate date column), a list of lists
# (columns combined and parsed as a single date column), or a dict mapping a
# result name to the columns to parse as date
ParseDatesArg = Union[
bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]]
]

# For functions like rename that convert one label to another
Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]]

Expand Down Expand Up @@ -246,8 +254,6 @@ def closed(self) -> bool:
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]
XMLParsers = Literal["lxml", "etree"]


# types in DataFrameFormatter
FormattersType = Union[
Expand Down Expand Up @@ -295,3 +301,6 @@ def closed(self) -> bool:

# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]

# read_xml parsers
XMLParsers = Literal["lxml", "etree"]
78 changes: 77 additions & 1 deletion pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from pandas._typing import (
CompressionOptions,
ConvertersArg,
DtypeArg,
FilePath,
ParseDatesArg,
ReadBuffer,
StorageOptions,
XMLParsers,
Expand Down Expand Up @@ -67,6 +70,23 @@ class _XMLFrameParser:
names : list
Column names for Data Frame of parsed XML data.

dtype : dict
Data type for data or columns. E.g. {{'a': np.float64,
'b': np.int32, 'c': 'Int64'}}

.. versionadded:: 1.5.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels.

.. versionadded:: 1.5.0

parse_dates : bool or list of int or names or list of lists or dict
Converts either the index or selected columns to datetimes.

.. versionadded:: 1.5.0

encoding : str
Encoding of xml object or document.

Expand Down Expand Up @@ -109,6 +129,9 @@ def __init__(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: ConvertersArg | None,
parse_dates: ParseDatesArg | None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
Expand All @@ -120,6 +143,9 @@ def __init__(
self.elems_only = elems_only
self.attrs_only = attrs_only
self.names = names
self.dtype = dtype
self.converters = converters
self.parse_dates = parse_dates
self.encoding = encoding
self.stylesheet = stylesheet
self.is_style = None
Expand Down Expand Up @@ -671,6 +697,9 @@ def _parse(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: ConvertersArg | None,
parse_dates: ParseDatesArg | None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
Expand Down Expand Up @@ -706,6 +735,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -722,6 +754,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -732,7 +767,13 @@ def _parse(

data_dicts = p.parse_data()

return _data_to_frame(data=data_dicts, **kwargs)
return _data_to_frame(
data=data_dicts,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
**kwargs,
)


@deprecate_nonkeyword_arguments(
Expand All @@ -749,6 +790,9 @@ def read_xml(
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
dtype: DtypeArg | None = None,
converters: ConvertersArg | None = None,
parse_dates: ParseDatesArg | None = None,
# encoding can not be None for lxml and StringIO input
encoding: str | None = "utf-8",
parser: XMLParsers = "lxml",
Expand Down Expand Up @@ -799,6 +843,35 @@ def read_xml(
Column names for DataFrame of parsed XML data. Use this parameter to
rename original element names and distinguish same named elements.

dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Use `str` or `object` to preserve the parsed text values and not
interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 1.5.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.

.. versionadded:: 1.5.0

parse_dates : bool or list of int or names or list of lists or dict, default None
Identifiers to parse index or columns to datetime. The behavior is as follows:

* boolean. If True -> try parsing the index.
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
each as a separate date column.
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
result 'foo'

.. versionadded:: 1.5.0

encoding : str, optional, default 'utf-8'
Encoding of XML document.

Expand Down Expand Up @@ -942,6 +1015,9 @@ def read_xml(
elems_only=elems_only,
attrs_only=attrs_only,
names=names,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
encoding=encoding,
parser=parser,
stylesheet=stylesheet,
Expand Down
Loading