Skip to content

Commit 5fd73d9

Browse files
ParfaitG authored and yehoshuadimarsky committed
ENH: Add dtypes/converters arguments for pandas.read_xml (pandas-dev#45411)
1 parent 316c47c commit 5fd73d9

File tree

4 files changed

+491
-3
lines changed

4 files changed

+491
-3
lines changed

doc/source/whatsnew/v1.5.0.rst

+42
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,48 @@ Optional libraries below the lowest tested version may still work, but are not c
8888

8989
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
9090

91+
92+
.. _whatsnew_150.read_xml_dtypes:
93+
94+
read_xml now supports ``dtype``, ``converters``, and ``parse_dates``
95+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
96+
97+
Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns,
98+
applying converter methods, and parsing dates (:issue:`43567`).
99+
100+
.. ipython:: python
101+
102+
xml_dates = """<?xml version='1.0' encoding='utf-8'?>
103+
<data>
104+
<row>
105+
<shape>square</shape>
106+
<degrees>00360</degrees>
107+
<sides>4.0</sides>
108+
<date>2020-01-01</date>
109+
</row>
110+
<row>
111+
<shape>circle</shape>
112+
<degrees>00360</degrees>
113+
<sides/>
114+
<date>2021-01-01</date>
115+
</row>
116+
<row>
117+
<shape>triangle</shape>
118+
<degrees>00180</degrees>
119+
<sides>3.0</sides>
120+
<date>2022-01-01</date>
121+
</row>
122+
</data>"""
123+
124+
df = pd.read_xml(
125+
xml_dates,
126+
dtype={'sides': 'Int64'},
127+
converters={'degrees': str},
128+
parse_dates=['date']
129+
)
130+
df
131+
df.dtypes
132+
91133
.. _whatsnew_150.api_breaking.other:
92134

93135
Other API changes

pandas/_typing.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,14 @@
129129
DtypeArg = Union[Dtype, Dict[Hashable, Dtype]]
130130
DtypeObj = Union[np.dtype, "ExtensionDtype"]
131131

132+
# converters
133+
ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]]
134+
135+
# parse_dates
136+
ParseDatesArg = Union[
137+
bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]]
138+
]
139+
132140
# For functions like rename that convert one label to another
133141
Renamer = Union[Mapping[Hashable, Any], Callable[[Hashable], Hashable]]
134142

@@ -246,8 +254,6 @@ def closed(self) -> bool:
246254
CompressionOptions = Optional[
247255
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
248256
]
249-
XMLParsers = Literal["lxml", "etree"]
250-
251257

252258
# types in DataFrameFormatter
253259
FormattersType = Union[
@@ -295,3 +301,6 @@ def closed(self) -> bool:
295301

296302
# read_csv engines
297303
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
304+
305+
# read_xml parsers
306+
XMLParsers = Literal["lxml", "etree"]

pandas/io/xml.py

+77-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
from pandas._typing import (
1111
CompressionOptions,
12+
ConvertersArg,
13+
DtypeArg,
1214
FilePath,
15+
ParseDatesArg,
1316
ReadBuffer,
1417
StorageOptions,
1518
XMLParsers,
@@ -67,6 +70,23 @@ class _XMLFrameParser:
6770
names : list
6871
Column names for Data Frame of parsed XML data.
6972
73+
dtype : dict
74+
Data type for data or columns. E.g. {{'a': np.float64,
75+
'b': np.int32, 'c': 'Int64'}}
76+
77+
.. versionadded:: 1.5.0
78+
79+
converters : dict, optional
80+
Dict of functions for converting values in certain columns. Keys can
81+
either be integers or column labels.
82+
83+
.. versionadded:: 1.5.0
84+
85+
parse_dates : bool or list of int or names or list of lists or dict
86+
Converts either index or select columns to datetimes.
87+
88+
.. versionadded:: 1.5.0
89+
7090
encoding : str
7191
Encoding of xml object or document.
7292
@@ -109,6 +129,9 @@ def __init__(
109129
elems_only: bool,
110130
attrs_only: bool,
111131
names: Sequence[str] | None,
132+
dtype: DtypeArg | None,
133+
converters: ConvertersArg | None,
134+
parse_dates: ParseDatesArg | None,
112135
encoding: str | None,
113136
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
114137
compression: CompressionOptions,
@@ -120,6 +143,9 @@ def __init__(
120143
self.elems_only = elems_only
121144
self.attrs_only = attrs_only
122145
self.names = names
146+
self.dtype = dtype
147+
self.converters = converters
148+
self.parse_dates = parse_dates
123149
self.encoding = encoding
124150
self.stylesheet = stylesheet
125151
self.is_style = None
@@ -671,6 +697,9 @@ def _parse(
671697
elems_only: bool,
672698
attrs_only: bool,
673699
names: Sequence[str] | None,
700+
dtype: DtypeArg | None,
701+
converters: ConvertersArg | None,
702+
parse_dates: ParseDatesArg | None,
674703
encoding: str | None,
675704
parser: XMLParsers,
676705
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
@@ -706,6 +735,9 @@ def _parse(
706735
elems_only,
707736
attrs_only,
708737
names,
738+
dtype,
739+
converters,
740+
parse_dates,
709741
encoding,
710742
stylesheet,
711743
compression,
@@ -722,6 +754,9 @@ def _parse(
722754
elems_only,
723755
attrs_only,
724756
names,
757+
dtype,
758+
converters,
759+
parse_dates,
725760
encoding,
726761
stylesheet,
727762
compression,
@@ -732,7 +767,13 @@ def _parse(
732767

733768
data_dicts = p.parse_data()
734769

735-
return _data_to_frame(data=data_dicts, **kwargs)
770+
return _data_to_frame(
771+
data=data_dicts,
772+
dtype=dtype,
773+
converters=converters,
774+
parse_dates=parse_dates,
775+
**kwargs,
776+
)
736777

737778

738779
@deprecate_nonkeyword_arguments(
@@ -749,6 +790,9 @@ def read_xml(
749790
elems_only: bool = False,
750791
attrs_only: bool = False,
751792
names: Sequence[str] | None = None,
793+
dtype: DtypeArg | None = None,
794+
converters: ConvertersArg | None = None,
795+
parse_dates: ParseDatesArg | None = None,
752796
# encoding can not be None for lxml and StringIO input
753797
encoding: str | None = "utf-8",
754798
parser: XMLParsers = "lxml",
@@ -799,6 +843,35 @@ def read_xml(
799843
Column names for DataFrame of parsed XML data. Use this parameter to
800844
rename original element names and distinguish same named elements.
801845
846+
dtype : Type name or dict of column -> type, optional
847+
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
848+
'c': 'Int64'}}
849+
Use `str` or `object` together with suitable `na_values` settings
850+
to preserve and not interpret dtype.
851+
If converters are specified, they will be applied INSTEAD
852+
of dtype conversion.
853+
854+
.. versionadded:: 1.5.0
855+
856+
converters : dict, optional
857+
Dict of functions for converting values in certain columns. Keys can either
858+
be integers or column labels.
859+
860+
.. versionadded:: 1.5.0
861+
862+
parse_dates : bool or list of int or names or list of lists or dict, default False
863+
Identifiers to parse index or columns to datetime. The behavior is as follows:
864+
865+
* boolean. If True -> try parsing the index.
866+
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
867+
each as a separate date column.
868+
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
869+
a single date column.
870+
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
871+
result 'foo'
872+
873+
.. versionadded:: 1.5.0
874+
802875
encoding : str, optional, default 'utf-8'
803876
Encoding of XML document.
804877
@@ -942,6 +1015,9 @@ def read_xml(
9421015
elems_only=elems_only,
9431016
attrs_only=attrs_only,
9441017
names=names,
1018+
dtype=dtype,
1019+
converters=converters,
1020+
parse_dates=parse_dates,
9451021
encoding=encoding,
9461022
parser=parser,
9471023
stylesheet=stylesheet,

0 commit comments

Comments
 (0)