Skip to content

ENH: Add dtypes/converters arguments for pandas.read_xml #45411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jan 23, 2022
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,47 @@ representation of :class:`DataFrame` objects (:issue:`4889`).
df
df.to_dict(orient='tight')

.. _whatsnew_140.read_xml_dtypes:

read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
applying converter methods, and parsing dates.

.. ipython:: python

xml_dates = """<?xml version='1.0' encoding='utf-8'?>
<data>
<row>
<shape>square</shape>
<degrees>00360</degrees>
<sides>4.0</sides>
<date>2020-01-01</date>
</row>
<row>
<shape>circle</shape>
<degrees>00360</degrees>
<sides/>
<date>2021-01-01</date>
</row>
<row>
<shape>triangle</shape>
<degrees>00180</degrees>
<sides>3.0</sides>
<date>2022-01-01</date>
</row>
</data>"""

df = pd.read_xml(
xml_dates,
dtype={'sides': 'Int64'},
converters={'degrees': str},
parse_dates=['date']
)
df
df.dtypes

.. _whatsnew_140.enhancements.other:

Other enhancements
Expand Down
93 changes: 91 additions & 2 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
from __future__ import annotations

import io
from typing import Sequence
from typing import (
Callable,
Sequence,
)

from pandas._typing import (
CompressionOptions,
DtypeArg,
FilePath,
ReadBuffer,
StorageOptions,
Expand Down Expand Up @@ -67,6 +71,23 @@ class _XMLFrameParser:
names : list
Column names for Data Frame of parsed XML data.

dtype : dict
Data type for data or columns. E.g. {{'a': np.float64,
'b': np.int32, 'c': 'Int64'}}

.. versionadded:: 1.4.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels.

.. versionadded:: 1.4.0

parse_dates : bool or list of int or names or list of lists or dict
Converts either index or select columns to datetimes

.. versionadded:: 1.4.0

encoding : str
Encoding of xml object or document.

Expand Down Expand Up @@ -109,6 +130,13 @@ def __init__(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: dict[str, Callable] | None,
parse_dates: bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm I think we have a typing alias for this? e.g. is this what we are doing in csv parsers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did look in pandas._typing. For read_csv, there is no typing for converters and parse_dates:

def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    ...
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    ...
    converters=None,
    ...
    # Datetime Handling
    parse_dates=None,
    ...
):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same with read_excel. Let me know how to handle typing for read_xml. Maybe raise a TYP issue for future PR?

def read_excel(
    io,
    ...
    dtype: DtypeArg | None = None,
    ...
    converters=None,
    ...
    parse_dates=False,
    ...
)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add your alias in _typing and use it? (You can follow up later to use it elsewhere.)

| list[int | str]
| list[list[int | str]]
| dict[str, list[int | str]]
| None,
encoding: str | None,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
compression: CompressionOptions,
Expand All @@ -120,6 +148,9 @@ def __init__(
self.elems_only = elems_only
self.attrs_only = attrs_only
self.names = names
self.dtype = dtype
self.converters = converters
self.parse_dates = parse_dates
self.encoding = encoding
self.stylesheet = stylesheet
self.is_style = None
Expand Down Expand Up @@ -671,6 +702,13 @@ def _parse(
elems_only: bool,
attrs_only: bool,
names: Sequence[str] | None,
dtype: DtypeArg | None,
converters: dict[str, Callable] | None,
parse_dates: bool
| list[int | str]
| list[list[int | str]]
| dict[str, list[int | str]]
| None,
encoding: str | None,
parser: XMLParsers,
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
Expand Down Expand Up @@ -706,6 +744,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -722,6 +763,9 @@ def _parse(
elems_only,
attrs_only,
names,
dtype,
converters,
parse_dates,
encoding,
stylesheet,
compression,
Expand All @@ -732,7 +776,13 @@ def _parse(

data_dicts = p.parse_data()

return _data_to_frame(data=data_dicts, **kwargs)
return _data_to_frame(
data=data_dicts,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
**kwargs,
)


@deprecate_nonkeyword_arguments(
Expand All @@ -749,6 +799,13 @@ def read_xml(
elems_only: bool = False,
attrs_only: bool = False,
names: Sequence[str] | None = None,
dtype: DtypeArg | None = None,
converters: dict[str, Callable] | None = None,
parse_dates: bool
| list[int | str]
| list[list[int | str]]
| dict[str, list[int | str]]
| None = None,
# encoding cannot be None for lxml and StringIO input
encoding: str | None = "utf-8",
parser: XMLParsers = "lxml",
Expand Down Expand Up @@ -799,6 +856,35 @@ def read_xml(
Column names for DataFrame of parsed XML data. Use this parameter to
rename original element names and distinguish same named elements.

dtype : Type name or dict of column -> type, optional
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
'c': 'Int64'}}
Use `str` or `object` together with suitable `na_values` settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.

.. versionadded:: 1.4.0

converters : dict, optional
Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.

.. versionadded:: 1.4.0

parse_dates : bool or list of int or names or list of lists or dict, default False
The behavior is as follows:

* boolean. If True -> try parsing the index.
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
each as a separate date column.
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
a single date column.
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
result 'foo'

.. versionadded:: 1.4.0

encoding : str, optional, default 'utf-8'
Encoding of XML document.

Expand Down Expand Up @@ -942,6 +1028,9 @@ def read_xml(
elems_only=elems_only,
attrs_only=attrs_only,
names=names,
dtype=dtype,
converters=converters,
parse_dates=parse_dates,
encoding=encoding,
parser=parser,
stylesheet=stylesheet,
Expand Down
Loading