-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add dtypes/converters arguments for pandas.read_xml #45411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
e778d07
ef88558
6787a59
b579acd
1af236c
d3ffe37
5c01e3c
2c06278
eabaa4d
53960d3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,6 +85,48 @@ Optional libraries below the lowest tested version may still work, but are not c | |
|
||
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. | ||
|
||
|
||
.. _whatsnew_150.read_xml_dtypes: | ||
|
||
read_xml now supports ``dtype``, ``converters``, and ``parse_dates`` | ||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
Similar to other IO methods, :func:`pandas.read_xml` now supports assigning specific dtypes to columns, | ||
applying converter methods, and parsing dates (:issue:`45411`). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add the issue reference here (this PR number if no issue) |
||
|
||
.. ipython:: python | ||
|
||
xml_dates = """<?xml version='1.0' encoding='utf-8'?> | ||
<data> | ||
<row> | ||
<shape>square</shape> | ||
<degrees>00360</degrees> | ||
<sides>4.0</sides> | ||
<date>2020-01-01</date> | ||
</row> | ||
<row> | ||
<shape>circle</shape> | ||
<degrees>00360</degrees> | ||
<sides/> | ||
<date>2021-01-01</date> | ||
</row> | ||
<row> | ||
<shape>triangle</shape> | ||
<degrees>00180</degrees> | ||
<sides>3.0</sides> | ||
<date>2022-01-01</date> | ||
</row> | ||
</data>""" | ||
|
||
df = pd.read_xml( | ||
xml_dates, | ||
dtype={'sides': 'Int64'}, | ||
converters={'degrees': str}, | ||
parse_dates=['date'] | ||
) | ||
df | ||
df.dtypes | ||
|
||
.. _whatsnew_150.api_breaking.other: | ||
|
||
Other API changes | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,10 +5,14 @@ | |
from __future__ import annotations | ||
|
||
import io | ||
from typing import Sequence | ||
from typing import ( | ||
Callable, | ||
Sequence, | ||
) | ||
|
||
from pandas._typing import ( | ||
CompressionOptions, | ||
DtypeArg, | ||
FilePath, | ||
ReadBuffer, | ||
StorageOptions, | ||
|
@@ -67,6 +71,23 @@ class _XMLFrameParser: | |
names : list | ||
Column names for Data Frame of parsed XML data. | ||
|
||
dtype : dict | ||
Data type for data or columns. E.g. {{'a': np.float64, | ||
'b': np.int32, 'c': 'Int64'}} | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
converters : dict, optional | ||
Dict of functions for converting values in certain columns. Keys can | ||
either be integers or column labels. | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
parse_dates : bool or list of int or names or list of lists or dict | ||
Converts either index or select columns to datetimes | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
encoding : str | ||
Encoding of xml object or document. | ||
|
||
|
@@ -109,6 +130,13 @@ def __init__( | |
elems_only: bool, | ||
attrs_only: bool, | ||
names: Sequence[str] | None, | ||
dtype: DtypeArg | None, | ||
converters: dict[str, Callable] | None, | ||
parse_dates: bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm I think we have a typing alias for this? e.g. is this what we are doing in csv parsers? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did look in def read_csv(
filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
...
# General Parsing Configuration
dtype: DtypeArg | None = None,
...
converters=None,
...
# Datetime Handling
parse_dates=None,
...
): There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same with def read_excel(
io,
...
dtype: DtypeArg | None = None,
...
converters=None,
...
parse_dates=False,
...
) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add your alias in _typing and use it (can followup later to use it elsewhere) |
||
| list[int | str] | ||
| list[list[int | str]] | ||
| dict[str, list[int | str]] | ||
| None, | ||
encoding: str | None, | ||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | ||
compression: CompressionOptions, | ||
|
@@ -120,6 +148,9 @@ def __init__( | |
self.elems_only = elems_only | ||
self.attrs_only = attrs_only | ||
self.names = names | ||
self.dtype = dtype | ||
self.converters = converters | ||
self.parse_dates = parse_dates | ||
self.encoding = encoding | ||
self.stylesheet = stylesheet | ||
self.is_style = None | ||
|
@@ -671,6 +702,13 @@ def _parse( | |
elems_only: bool, | ||
attrs_only: bool, | ||
names: Sequence[str] | None, | ||
dtype: DtypeArg | None, | ||
converters: dict[str, Callable] | None, | ||
parse_dates: bool | ||
| list[int | str] | ||
| list[list[int | str]] | ||
| dict[str, list[int | str]] | ||
| None, | ||
encoding: str | None, | ||
parser: XMLParsers, | ||
stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None, | ||
|
@@ -706,6 +744,9 @@ def _parse( | |
elems_only, | ||
attrs_only, | ||
names, | ||
dtype, | ||
converters, | ||
parse_dates, | ||
encoding, | ||
stylesheet, | ||
compression, | ||
|
@@ -722,6 +763,9 @@ def _parse( | |
elems_only, | ||
attrs_only, | ||
names, | ||
dtype, | ||
converters, | ||
parse_dates, | ||
encoding, | ||
stylesheet, | ||
compression, | ||
|
@@ -732,7 +776,13 @@ def _parse( | |
|
||
data_dicts = p.parse_data() | ||
|
||
return _data_to_frame(data=data_dicts, **kwargs) | ||
return _data_to_frame( | ||
data=data_dicts, | ||
dtype=dtype, | ||
converters=converters, | ||
parse_dates=parse_dates, | ||
**kwargs, | ||
) | ||
|
||
|
||
@deprecate_nonkeyword_arguments( | ||
|
@@ -749,6 +799,13 @@ def read_xml( | |
elems_only: bool = False, | ||
attrs_only: bool = False, | ||
names: Sequence[str] | None = None, | ||
dtype: DtypeArg | None = None, | ||
converters: dict[str, Callable] | None = None, | ||
parse_dates: bool | ||
| list[int | str] | ||
| list[list[int | str]] | ||
| dict[str, list[int | str]] | ||
| None = None, | ||
# encoding can not be None for lxml and StringIO input | ||
encoding: str | None = "utf-8", | ||
parser: XMLParsers = "lxml", | ||
|
@@ -799,6 +856,35 @@ def read_xml( | |
Column names for DataFrame of parsed XML data. Use this parameter to | ||
rename original element names and distinguish same named elements. | ||
|
||
dtype : Type name or dict of column -> type, optional | ||
Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, | ||
'c': 'Int64'}} | ||
Use `str` or `object` together with suitable `na_values` settings | ||
to preserve and not interpret dtype. | ||
If converters are specified, they will be applied INSTEAD | ||
of dtype conversion. | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
converters : dict, optional | ||
Dict of functions for converting values in certain columns. Keys can either | ||
be integers or column labels. | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
parse_dates : bool or list of int or names or list of lists or dict, default False | ||
Identifiers to parse index or columns to datetime. The behavior is as follows: | ||
|
||
* boolean. If True -> try parsing the index. | ||
* list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 | ||
each as a separate date column. | ||
* list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as | ||
a single date column. | ||
* dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call | ||
result 'foo' | ||
|
||
.. versionadded:: 1.5.0 | ||
|
||
encoding : str, optional, default 'utf-8' | ||
Encoding of XML document. | ||
|
||
|
@@ -942,6 +1028,9 @@ def read_xml( | |
elems_only=elems_only, | ||
attrs_only=attrs_only, | ||
names=names, | ||
dtype=dtype, | ||
converters=converters, | ||
parse_dates=parse_dates, | ||
encoding=encoding, | ||
parser=parser, | ||
stylesheet=stylesheet, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change to 150