pandas-dev · jreback · Jan 1, 2022 · Dec 30, 2021 · Dec 30, 2021 · Dec 30, 2021
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2948,8 +2948,8 @@ def to_xml(
         root_name: str | None = "data",
         row_name: str | None = "row",
         na_rep: str | None = None,
-        attr_cols: str | list[str] | None = None,
-        elem_cols: str | list[str] | None = None,
+        attr_cols: list[str] | None = None,
+        elem_cols: list[str] | None = None,
         namespaces: dict[str | None, str] | None = None,
         prefix: str | None = None,
         encoding: str = "utf-8",

diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
@@ -10,7 +10,6 @@
 from pandas._typing import (
     CompressionOptions,
     FilePath,
-    ReadBuffer,
     StorageOptions,
     WriteBuffer,
 )
@@ -96,8 +95,8 @@ class BaseXMLFormatter:
     def __init__(
         self,
         frame: DataFrame,
-        path_or_buffer: FilePath | WriteBuffer[bytes] | None = None,
-        index: bool | None = True,
+        path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        index: bool = True,
         root_name: str | None = "data",
         row_name: str | None = "row",
         na_rep: str | None = None,
@@ -108,7 +107,7 @@ def __init__(
         encoding: str = "utf-8",
         xml_declaration: bool | None = True,
         pretty_print: bool | None = True,
-        stylesheet: FilePath | ReadBuffer[str] | None = None,
+        stylesheet: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
         compression: CompressionOptions = "infer",
         storage_options: StorageOptions = None,
     ) -> None:
@@ -132,6 +131,11 @@ def __init__(
         self.orig_cols = self.frame.columns.tolist()
         self.frame_dicts = self.process_dataframe()
 
+        self.validate_columns()
+        self.validate_encoding()
+        self.prefix_uri = self.get_prefix_uri()
+        self.handle_indexes()
+
     def build_tree(self) -> bytes:
         """
         Build tree from  data.
@@ -189,8 +193,8 @@ def process_dataframe(self) -> dict[int | str, dict[str, Any]]:
         if self.index:
             df = df.reset_index()
 
-        if self.na_rep:
-            df = df.replace({None: self.na_rep, float("nan"): self.na_rep})
+        if self.na_rep is not None:
+            df = df.fillna(self.na_rep)
 
         return df.to_dict(orient="index")
 
@@ -247,17 +251,37 @@ def other_namespaces(self) -> dict:
 
         return nmsp_dict
 
-    def build_attribs(self) -> None:
+    def build_attribs(self, d: dict[str, Any], elem_row: Any) -> None:
         """
         Create attributes of row.
 
         This method adds attributes using attr_cols to row element and
         works with tuples for multindex or hierarchical columns.
         """
 
-        raise AbstractMethodError(self)
+        if not self.attr_cols:
+            return
 
-    def build_elems(self) -> None:
+        for col in self.attr_cols:
+            attr_name = self._get_flat_col_name(col)
+            try:
+                val = None if isna(d[col]) else str(d[col])
+                if val is not None:
+                    elem_row.attrib[attr_name] = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
+    def _get_flat_col_name(self, col: str | tuple) -> str:
+        flat_col = col
+        if isinstance(col, tuple):
+            flat_col = (
+                "".join([str(c) for c in col]).strip()
+                if "" in col
+                else "_".join([str(c) for c in col]).strip()
+            )
+        return f"{self.prefix_uri}{flat_col}"
+
+    def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
         """
         Create child elements of row.
 
@@ -267,6 +291,19 @@ def build_elems(self) -> None:
 
         raise AbstractMethodError(self)
 
+    def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None:
+
+        if not self.elem_cols:
+            return
+
+        for col in self.elem_cols:
+            elem_name = self._get_flat_col_name(col)
+            try:
+                val = None if isna(d[col]) or d[col] == "" else str(d[col])
+                sub_element_cls(elem_row, elem_name).text = val
+            except KeyError:
+                raise KeyError(f"no valid column, {col}")
+
     def write_output(self) -> str | None:
         xml_doc = self.build_tree()
 
@@ -291,14 +328,6 @@ class EtreeXMLFormatter(BaseXMLFormatter):
     modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
     """
 
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-        self.validate_columns()
-        self.validate_encoding()
-        self.handle_indexes()
-        self.prefix_uri = self.get_prefix_uri()
-
     def build_tree(self) -> bytes:
         from xml.etree.ElementTree import (
             Element,
@@ -311,16 +340,15 @@ def build_tree(self) -> bytes:
         )
 
         for d in self.frame_dicts.values():
-            self.d = d
-            self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
+            elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
 
             if not self.attr_cols and not self.elem_cols:
-                self.elem_cols = list(self.d.keys())
-                self.build_elems()
+                self.elem_cols = list(d.keys())
+                self.build_elems(d, elem_row)
 
             else:
-                self.build_attribs()
-                self.build_elems()
+                self.build_attribs(d, elem_row)
+                self.build_elems(d, elem_row)
 
         self.out_xml = tostring(self.root, method="xml", encoding=self.encoding)
 
@@ -357,56 +385,10 @@ def get_prefix_uri(self) -> str:
 
         return uri
 
-    def build_attribs(self) -> None:
-        if not self.attr_cols:
-            return
-
-        for col in self.attr_cols:
-            flat_col = col
-            if isinstance(col, tuple):
-                flat_col = (
-                    "".join([str(c) for c in col]).strip()
-                    if "" in col
-                    else "_".join([str(c) for c in col]).strip()
-                )
-
-            attr_name = f"{self.prefix_uri}{flat_col}"
-            try:
-                val = (
-                    None
-                    if self.d[col] is None or self.d[col] != self.d[col]
-                    else str(self.d[col])
-                )
-                if val is not None:
-                    self.elem_row.attrib[attr_name] = val
-            except KeyError:
-                raise KeyError(f"no valid column, {col}")
-
-    def build_elems(self) -> None:
+    def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
         from xml.etree.ElementTree import SubElement
 
-        if not self.elem_cols:
-            return
-
-        for col in self.elem_cols:
-            flat_col = col
-            if isinstance(col, tuple):
-                flat_col = (
-                    "".join([str(c) for c in col]).strip()
-                    if "" in col
-                    else "_".join([str(c) for c in col]).strip()
-                )
-
-            elem_name = f"{self.prefix_uri}{flat_col}"
-            try:
-                val = (
-                    None
-                    if self.d[col] in [None, ""] or self.d[col] != self.d[col]
-                    else str(self.d[col])
-                )
-                SubElement(self.elem_row, elem_name).text = val
-            except KeyError:
-                raise KeyError(f"no valid column, {col}")
+        self._build_elems(SubElement, d, elem_row)
 
     def prettify_tree(self) -> bytes:
         """
@@ -458,12 +440,7 @@ class LxmlXMLFormatter(BaseXMLFormatter):
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
 
-        self.validate_columns()
-        self.validate_encoding()
-        self.prefix_uri = self.get_prefix_uri()
-
         self.convert_empty_str_key()
-        self.handle_indexes()
 
     def build_tree(self) -> bytes:
         """
@@ -481,16 +458,15 @@ def build_tree(self) -> bytes:
         self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
 
         for d in self.frame_dicts.values():
-            self.d = d
-            self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
+            elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
 
             if not self.attr_cols and not self.elem_cols:
-                self.elem_cols = list(self.d.keys())
-                self.build_elems()
+                self.elem_cols = list(d.keys())
+                self.build_elems(d, elem_row)
 
             else:
-                self.build_attribs()
-                self.build_elems()
+                self.build_attribs(d, elem_row)
+                self.build_elems(d, elem_row)
 
         self.out_xml = tostring(
             self.root,
@@ -529,54 +505,10 @@ def get_prefix_uri(self) -> str:
 
         return uri
 
-    def build_attribs(self) -> None:
-        if not self.attr_cols:
-            return
-
-        for col in self.attr_cols:
-            flat_col = col
-            if isinstance(col, tuple):
-                flat_col = (
-                    "".join([str(c) for c in col]).strip()
-                    if "" in col
-                    else "_".join([str(c) for c in col]).strip()
-                )
-
-            attr_name = f"{self.prefix_uri}{flat_col}"
-            try:
-                val = (
-                    None
-                    if self.d[col] is None or self.d[col] != self.d[col]
-                    else str(self.d[col])
-                )
-                if val is not None:
-                    self.elem_row.attrib[attr_name] = val
-            except KeyError:
-                raise KeyError(f"no valid column, {col}")
-
-    def build_elems(self) -> None:
+    def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
         from lxml.etree import SubElement
 
-        if not self.elem_cols:
-            return
-
-        for col in self.elem_cols:
-            flat_col = col
-            if isinstance(col, tuple):
-                flat_col = (
-                    "".join([str(c) for c in col]).strip()
-                    if "" in col
-                    else "_".join([str(c) for c in col]).strip()
-                )
-
-            elem_name = f"{self.prefix_uri}{flat_col}"
-            try:
-                val = (
-                    None if isna(self.d[col]) or self.d[col] == "" else str(self.d[col])
-                )
-                SubElement(self.elem_row, elem_name).text = val
-            except KeyError:
-                raise KeyError(f"no valid column, {col}")
+        self._build_elems(SubElement, d, elem_row)
 
     def transform_doc(self) -> bytes:
         """

diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -11,6 +11,7 @@
     FilePath,
     ReadBuffer,
     StorageOptions,
+    WriteBuffer,
 )
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
@@ -569,11 +570,18 @@ def _transform_doc(self) -> bytes:
 
 
 def get_data_from_filepath(
-    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
+    filepath_or_buffer: FilePath
+    | bytes
+    | ReadBuffer[bytes]
+    | ReadBuffer[str]
+    | WriteBuffer[bytes]
+    | WriteBuffer[str],
     encoding,
     compression: CompressionOptions,
     storage_options: StorageOptions,
-) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
+) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str] | WriteBuffer[
+    bytes
+] | WriteBuffer[str]:
     """
     Extract raw XML data.
 
@@ -605,10 +613,7 @@ def get_data_from_filepath(
             storage_options=storage_options,
         ) as handle_obj:
             filepath_or_buffer = (
-                # error: Incompatible types in assignment (expression has type
-                # "Union[str, IO[str]]", variable has type "Union[Union[str,
-                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
-                handle_obj.handle.read()  # type: ignore[assignment]
+                handle_obj.handle.read()
                 if hasattr(handle_obj.handle, "read")
                 else handle_obj.handle
             )

diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
@@ -1309,7 +1309,7 @@ def test_filename_and_suffix_comp(parser, compression_only):
 
 
 @td.skip_if_no("lxml")
-def test_ea_dtypes(any_numeric_ea_dtype):
+def test_ea_dtypes(any_numeric_ea_dtype, parser):
     # GH#43903
     expected = """<?xml version='1.0' encoding='utf-8'?>
 <data>
@@ -1319,8 +1319,8 @@ def test_ea_dtypes(any_numeric_ea_dtype):
   </row>
 </data>"""
     df = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype)
-    result = df.to_xml()
-    assert result.strip() == expected
+    result = df.to_xml(parser=parser)
+    assert equalize_decl(result).strip() == expected
 
 
 def test_unsuported_compression(datapath, parser):