Skip to content

Commit 209de28

Browse files
authored
ENH: Add use_nullable_dtypes to read_xml (#50500)
* ENH: Add use_nullable_dtypes to read_xml
* Add gh ref
* Move import
* Remove import
1 parent 3058713 commit 209de28

File tree

4 files changed

+101
-2
lines changed

4 files changed

+101
-2
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
3939
* :func:`read_fwf`
4040
* :func:`read_excel`
4141
* :func:`read_html`
42+
* :func:`read_xml`
4243
* :func:`read_sql`
4344
* :func:`read_sql_query`
4445
* :func:`read_sql_table`
@@ -49,6 +50,7 @@ to select the nullable dtypes implementation.
4950
* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
5051
* :func:`read_excel`
5152
* :func:`read_html`
53+
* :func:`read_xml`
5254
* :func:`read_parquet`
5355
* :func:`read_orc`
5456

pandas/_libs/ops.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr,
292292
result[i] = 1
293293
elif val in false_vals:
294294
result[i] = 0
295-
elif is_nan(val):
295+
elif is_nan(val) or val is None:
296296
mask[i] = 1
297297
result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
298298
has_na = True

pandas/io/xml.py

+17
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,7 @@ def _parse(
774774
iterparse: dict[str, list[str]] | None,
775775
compression: CompressionOptions,
776776
storage_options: StorageOptions,
777+
use_nullable_dtypes: bool = False,
777778
**kwargs,
778779
) -> DataFrame:
779780
"""
@@ -843,6 +844,7 @@ def _parse(
843844
dtype=dtype,
844845
converters=converters,
845846
parse_dates=parse_dates,
847+
use_nullable_dtypes=use_nullable_dtypes,
846848
**kwargs,
847849
)
848850

@@ -869,6 +871,7 @@ def read_xml(
869871
iterparse: dict[str, list[str]] | None = None,
870872
compression: CompressionOptions = "infer",
871873
storage_options: StorageOptions = None,
874+
use_nullable_dtypes: bool = False,
872875
) -> DataFrame:
873876
r"""
874877
Read XML document into a ``DataFrame`` object.
@@ -980,6 +983,19 @@ def read_xml(
980983
981984
{storage_options}
982985
986+
use_nullable_dtypes : bool = False
987+
Whether or not to use nullable dtypes as default when reading data. If
988+
set to True, nullable dtypes are used for all dtypes that have a nullable
989+
implementation, even if no nulls are present.
990+
991+
The nullable dtype implementation can be configured by calling
992+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
993+
numpy-backed nullable dtypes or
994+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
995+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
996+
997+
.. versionadded:: 2.0
998+
983999
Returns
9841000
-------
9851001
df
@@ -1113,4 +1129,5 @@ def read_xml(
11131129
iterparse=iterparse,
11141130
compression=compression,
11151131
storage_options=storage_options,
1132+
use_nullable_dtypes=use_nullable_dtypes,
11161133
)

pandas/tests/io/xml/test_xml.py

+81-1
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,17 @@
2121
)
2222
import pandas.util._test_decorators as td
2323

24-
from pandas import DataFrame
24+
import pandas as pd
25+
from pandas import (
26+
NA,
27+
DataFrame,
28+
Series,
29+
)
2530
import pandas._testing as tm
31+
from pandas.core.arrays import (
32+
ArrowStringArray,
33+
StringArray,
34+
)
2635

2736
from pandas.io.common import get_handle
2837
from pandas.io.xml import read_xml
@@ -1702,3 +1711,74 @@ def test_s3_parser_consistency():
17021711
)
17031712

17041713
tm.assert_frame_equal(df_lxml, df_etree)
1714+
1715+
1716+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
    """Check ``read_xml(..., use_nullable_dtypes=True)`` against expected dtypes.

    The document has two rows and nine columns covering strings, integers,
    floats and booleans, each with and without empty elements, plus one column
    (``g``) that is empty in every row.  The result is compared with a frame
    built from nullable arrays matching the active ``mode.string_storage`` and
    ``mode.dtype_backend`` options.

    Parameters
    ----------
    parser
        Passed through to ``read_xml(parser=...)``; presumably supplied by a
        fixture defined outside this view.
    string_storage
        Value set for ``mode.string_storage``; ``"python"`` selects
        ``StringArray`` for the expected strings, anything else selects
        ``ArrowStringArray``.  Presumably supplied by a fixture.
    dtype_backend
        Value set for ``mode.dtype_backend`` (``"pandas"`` or ``"pyarrow"``).
    """
    # GH#50500
    # pyarrow is optional: skip instead of failing when either option needs it.
    if string_storage == "pyarrow" or dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")

    data = """<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
<a>x</a>
<b>1</b>
<c>4.0</c>
<d>x</d>
<e>2</e>
<f>4.0</f>
<g></g>
<h>True</h>
<i>False</i>
</row>
<row>
<a>y</a>
<b>2</b>
<c>5.0</c>
<d></d>
<e></e>
<f></f>
<g></g>
<h>False</h>
<i></i>
</row>
</data>"""

    # Expected string columns depend on the configured string storage.
    if string_storage == "python":
        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
    else:
        # `pa` is bound here because the importorskip guard above ran.
        string_array = ArrowStringArray(pa.array(["x", "y"]))
        string_array_na = ArrowStringArray(pa.array(["x", None]))

    # Both options only need to be active while the document is parsed.
    with pd.option_context(
        "mode.string_storage", string_storage
    ), pd.option_context("mode.dtype_backend", dtype_backend):
        result = read_xml(data, parser=parser, use_nullable_dtypes=True)

    expected = DataFrame(
        {
            "a": string_array,
            "b": Series([1, 2], dtype="Int64"),
            "c": Series([4.0, 5.0], dtype="Float64"),
            "d": string_array_na,
            "e": Series([2, NA], dtype="Int64"),
            "f": Series([4.0, NA], dtype="Float64"),
            "g": Series([NA, NA], dtype="Int64"),
            "h": Series([True, False], dtype="boolean"),
            "i": Series([False, NA], dtype="boolean"),
        }
    )

    if dtype_backend == "pyarrow":
        from pandas.arrays import ArrowExtensionArray

        # Re-wrap every expected column as a pyarrow-backed extension array.
        expected = DataFrame(
            {
                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                for col in expected.columns
            }
        )
        # The all-empty column is built directly from a list of None values
        # rather than converted from the Int64 column above.
        expected["g"] = ArrowExtensionArray(pa.array([None, None]))

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)