Backport PR pandas-dev#47905: BUG: Fix read_xml raising syntax error when reading XML with Chinese tags

ParfaitG · ParfaitG · commit 4003bea01c17 · 2022-08-01T17:02:24.000-05:00
diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst
@@ -26,6 +26,7 @@ Bug fixes
 ~~~~~~~~~
 - The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
 - Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
+- Bug in :func:`read_xml` where reading XML files with Chinese character tags would raise ``XMLSyntaxError`` (:issue:`47902`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
@@ -8,6 +8,7 @@
 from typing import Sequence
 
 from pandas._typing import (
+    TYPE_CHECKING,
     CompressionOptions,
     FilePath,
     ReadBuffer,
@@ -38,6 +39,14 @@
 )
 from pandas.io.parsers import TextParser
 
+if TYPE_CHECKING:
+    from xml.etree.ElementTree import Element
+
+    from lxml.etree import (
+        _Element,
+        _XSLTResultTree,
+    )
+
 
 @doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
 class _XMLFrameParser:
@@ -189,7 +198,7 @@ def _validate_names(self) -> None:
         """
         raise AbstractMethodError(self)
 
-    def _parse_doc(self, raw_doc) -> bytes:
+    def _parse_doc(self, raw_doc) -> Element | _Element:
         """
         Build tree from path_or_buffer.
 
@@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
     """
 
     def parse_data(self) -> list[dict[str, str | None]]:
-        from xml.etree.ElementTree import XML
-
         if self.stylesheet is not None:
             raise ValueError(
                 "To use stylesheet, you need lxml installed and selected as parser."
             )
 
-        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+        self.xml_doc = self._parse_doc(self.path_or_buffer)
 
         self._validate_path()
         self._validate_names()
@@ -348,11 +355,12 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self, raw_doc) -> bytes:
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> Element:
         from xml.etree.ElementTree import (
             XMLParser,
             parse,
-            tostring,
         )
 
         handle_data = get_data_from_filepath(
@@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:
 
         with preprocess_data(handle_data) as xml_data:
             curr_parser = XMLParser(encoding=self.encoding)
-            r = parse(xml_data, parser=curr_parser)
+            doc = parse(xml_data, parser=curr_parser)
 
-        return tostring(r.getroot())
+        return doc.getroot()
 
 
 class _LxmlFrameParser(_XMLFrameParser):
@@ -384,13 +392,14 @@ def parse_data(self) -> list[dict[str, str | None]]:
         validate xpath, names, optionally parse and run XSLT,
         and parse original or transformed XML and return specific nodes.
         """
-        from lxml.etree import XML
+        self.xml_doc = self._parse_doc(self.path_or_buffer)
 
-        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+        if self.stylesheet:
+            self.xsl_doc = self._parse_doc(self.stylesheet)
+            self.xml_doc = self._transform_doc()
 
-        if self.stylesheet is not None:
-            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
-            self.xml_doc = XML(self._transform_doc())
+        self._validate_path()
+        self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
 
         self._validate_path()
         self._validate_names()
@@ -527,12 +536,13 @@ def _validate_names(self) -> None:
                     f"{type(self.names).__name__} is not a valid type for names"
                 )
 
-    def _parse_doc(self, raw_doc) -> bytes:
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> _Element:
         from lxml.etree import (
             XMLParser,
             fromstring,
             parse,
-            tostring,
         )
 
         handle_data = get_data_from_filepath(
@@ -557,9 +567,9 @@ def _parse_doc(self, raw_doc) -> bytes:
             else:
                 doc = parse(xml_data, parser=curr_parser)
 
-        return tostring(doc)
+        return doc
 
-    def _transform_doc(self) -> bytes:
+    def _transform_doc(self) -> _XSLTResultTree:
         """
         Transform original tree using stylesheet.
 
@@ -572,7 +582,7 @@ def _transform_doc(self) -> bytes:
         transformer = XSLT(self.xsl_doc)
         new_doc = transformer(self.xml_doc)
 
-        return bytes(new_doc)
+        return new_doc
 
 
 def get_data_from_filepath(
diff --git a/pandas/tests/io/data/xml/doc_ch_utf.xml b/pandas/tests/io/data/xml/doc_ch_utf.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE fragmentdoc [
+  <!ELEMENT qafragment (qa+)>
+  <!ELEMENT qa ( q, a )>
+  <!ELEMENT q ( #PCDATA | title | name)*>
+  <!ATTLIST q speaker CDATA #REQUIRED>
+  <!ELEMENT a ( #PCDATA | title | name)*>
+  <!ATTLIST a speaker CDATA #REQUIRED>
+  <!ELEMENT name (#PCDATA)>
+  <!ELEMENT title (#PCDATA)>
+  <!ENTITY C4-4F71 "Sorry, this is Big5 only">
+]>
+
+<qafragment>
+	<qa>
+		<問 speaker="Opponent">問  若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正</問>
+		<答 speaker="吉藏">答  邪既無量 正亦多途  大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
+		故<title>大品經</title> <name>善吉</name> 致問 何等是菩薩道 何等非菩薩道
+		<name>佛</name>答云  有所得非菩薩道 無所得是菩薩道</答>
+	</qa>
+	<qa>
+		<問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不</問>
+		<a speaker="吉藏">答 性執是有得 假名是無得  今破有得申無得 即是破性執申假名也</a>
+	</qa>
+	<qa>
+		<問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶</問>
+		<答 speaker="吉藏">答  不例  有無皆是性 所以須雙破 既分性假異 故有破不破</答>
+	</qa>
+</qafragment>
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -358,6 +358,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
     tm.assert_frame_equal(df_str, df_expected)
 
 
+def test_string_charset(parser):
+    txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
+
+    df_str = read_xml(txt, parser=parser)
+
+    df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
+
+    tm.assert_frame_equal(df_str, df_expected)
+
+
+def test_file_charset(datapath, parser):
+    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")
+
+    df_file = read_xml(datapath(xml_file), parser=parser)
+
+    df_expected = DataFrame(
+        {
+            "問": [
+                "問  若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
+                "問 既破有得申無得 亦應但破性執申假名以不",
+                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
+            ],
+            "答": [
+                "答  邪既無量 正亦多途  大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
+                None,
+                "答  不例  有無皆是性 所以須雙破 既分性假異 故有破不破",
+            ],
+            "a": [None, "答 性執是有得 假名是無得  今破有得申無得 即是破性執申假名也", None],
+        }
+    )
+
+    tm.assert_frame_equal(df_file, df_expected)
+
+
 def test_file_handle_close(datapath, parser):
     xml_file = datapath("io", "data", "xml", "books.xml")
 
@@ -878,6 +912,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
     tm.assert_frame_equal(df_kml, df_style)
 
 
+@td.skip_if_no("lxml")
+def test_style_charset():
+    xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
+
+    xsl = """\
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output omit-xml-declaration="yes" indent="yes"/>
+ <xsl:strip-space elements="*"/>
+
+ <xsl:template match="node()|@*">
+     <xsl:copy>
+       <xsl:apply-templates select="node()|@*"/>
+     </xsl:copy>
+ </xsl:template>
+
+ <xsl:template match="中文標籤">
+     <根>
+       <xsl:apply-templates />
+     </根>
+ </xsl:template>
+
+</xsl:stylesheet>"""
+
+    df_orig = read_xml(xml)
+    df_style = read_xml(xml, stylesheet=xsl)
+
+    tm.assert_frame_equal(df_orig, df_style)
+
+
 @td.skip_if_no("lxml")
 def test_not_stylesheet(datapath):
     from lxml.etree import XSLTParseError