ENH: Add use_nullable_dtypes to read_xml (#50500)

phofl · web-flow · commit 209de2803c5d · 2023-01-04T11:11:43.000-08:00
* ENH: Add use_nullable_dtypes to read_xml

* Add gh ref

* Move import

* Remove import
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -39,6 +39,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_fwf`
 * :func:`read_excel`
 * :func:`read_html`
+* :func:`read_xml`
 * :func:`read_sql`
 * :func:`read_sql_query`
 * :func:`read_sql_table`
@@ -49,6 +50,7 @@ to select the nullable dtypes implementation.
 * :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
 * :func:`read_excel`
 * :func:`read_html`
+* :func:`read_xml`
 * :func:`read_parquet`
 * :func:`read_orc`
 
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
@@ -292,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr,
             result[i] = 1
         elif val in false_vals:
             result[i] = 0
-        elif is_nan(val):
+        elif is_nan(val) or val is None:
             mask[i] = 1
             result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
             has_na = True
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -774,6 +774,7 @@ def _parse(
     iterparse: dict[str, list[str]] | None,
     compression: CompressionOptions,
     storage_options: StorageOptions,
+    use_nullable_dtypes: bool = False,
     **kwargs,
 ) -> DataFrame:
     """
@@ -843,6 +844,7 @@ def _parse(
         dtype=dtype,
         converters=converters,
         parse_dates=parse_dates,
+        use_nullable_dtypes=use_nullable_dtypes,
         **kwargs,
     )
 
@@ -869,6 +871,7 @@ def read_xml(
     iterparse: dict[str, list[str]] | None = None,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ) -> DataFrame:
     r"""
     Read XML document into a ``DataFrame`` object.
@@ -980,6 +983,19 @@ def read_xml(
 
     {storage_options}
 
+    use_nullable_dtypes : bool = False
+        Whether or not to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        The nullable dtype implementation can be configured by calling
+        ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+        numpy-backed nullable dtypes or
+        ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+        pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
+
     Returns
     -------
     df
@@ -1113,4 +1129,5 @@ def read_xml(
         iterparse=iterparse,
         compression=compression,
         storage_options=storage_options,
+        use_nullable_dtypes=use_nullable_dtypes,
     )
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -21,8 +21,17 @@
 )
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame
+import pandas as pd
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+)
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 from pandas.io.common import get_handle
 from pandas.io.xml import read_xml
@@ -1702,3 +1711,74 @@ def test_s3_parser_consistency():
     )
 
     tm.assert_frame_equal(df_lxml, df_etree)
+
+
+@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
+    # GH#50500
+    if string_storage == "pyarrow" or dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+    data = """<?xml version='1.0' encoding='utf-8'?>
+<data xmlns="http://example.com">
+<row>
+  <a>x</a>
+  <b>1</b>
+  <c>4.0</c>
+  <d>x</d>
+  <e>2</e>
+  <f>4.0</f>
+  <g></g>
+  <h>True</h>
+  <i>False</i>
+</row>
+<row>
+  <a>y</a>
+  <b>2</b>
+  <c>5.0</c>
+  <d></d>
+  <e></e>
+  <f></f>
+  <g></g>
+  <h>False</h>
+  <i></i>
+</row>
+</data>"""
+
+    if string_storage == "python":
+        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
+        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
+
+    else:
+        string_array = ArrowStringArray(pa.array(["x", "y"]))
+        string_array_na = ArrowStringArray(pa.array(["x", None]))
+
+    with pd.option_context("mode.string_storage", string_storage):
+        with pd.option_context("mode.dtype_backend", dtype_backend):
+            result = read_xml(data, parser=parser, use_nullable_dtypes=True)
+
+    expected = DataFrame(
+        {
+            "a": string_array,
+            "b": Series([1, 2], dtype="Int64"),
+            "c": Series([4.0, 5.0], dtype="Float64"),
+            "d": string_array_na,
+            "e": Series([2, NA], dtype="Int64"),
+            "f": Series([4.0, NA], dtype="Float64"),
+            "g": Series([NA, NA], dtype="Int64"),
+            "h": Series([True, False], dtype="boolean"),
+            "i": Series([False, NA], dtype="boolean"),
+        }
+    )
+
+    if dtype_backend == "pyarrow":
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+        expected["g"] = ArrowExtensionArray(pa.array([None, None]))
+
+    tm.assert_frame_equal(result, expected)