diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst
index 6bd7378e05404..89487bfde94a5 100644
--- a/doc/source/whatsnew/v1.4.4.rst
+++ b/doc/source/whatsnew/v1.4.4.rst
@@ -26,6 +26,7 @@ Bug fixes
~~~~~~~~~
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
+- Bug in :func:`read_xml` where reading XML files with Chinese character tags would raise ``XMLSyntaxError`` (:issue:`47902`)
.. ---------------------------------------------------------------------------
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index ad87b18bd1683..6cef494ab0aa3 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -8,6 +8,7 @@
from typing import Sequence
from pandas._typing import (
+ TYPE_CHECKING,
CompressionOptions,
FilePath,
ReadBuffer,
@@ -38,6 +39,14 @@
)
from pandas.io.parsers import TextParser
+if TYPE_CHECKING:
+ from xml.etree.ElementTree import Element
+
+ from lxml.etree import (
+ _Element,
+ _XSLTResultTree,
+ )
+
@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
class _XMLFrameParser:
@@ -189,7 +198,7 @@ def _validate_names(self) -> None:
"""
raise AbstractMethodError(self)
- def _parse_doc(self, raw_doc) -> bytes:
+ def _parse_doc(self, raw_doc) -> Element | _Element:
"""
Build tree from path_or_buffer.
@@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
"""
def parse_data(self) -> list[dict[str, str | None]]:
- from xml.etree.ElementTree import XML
-
if self.stylesheet is not None:
raise ValueError(
"To use stylesheet, you need lxml installed and selected as parser."
)
- self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+ self.xml_doc = self._parse_doc(self.path_or_buffer)
self._validate_path()
self._validate_names()
@@ -348,11 +355,12 @@ def _validate_names(self) -> None:
f"{type(self.names).__name__} is not a valid type for names"
)
- def _parse_doc(self, raw_doc) -> bytes:
+ def _parse_doc(
+ self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+ ) -> Element:
from xml.etree.ElementTree import (
XMLParser,
parse,
- tostring,
)
handle_data = get_data_from_filepath(
@@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:
with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
- r = parse(xml_data, parser=curr_parser)
+ doc = parse(xml_data, parser=curr_parser)
- return tostring(r.getroot())
+ return doc.getroot()
class _LxmlFrameParser(_XMLFrameParser):
@@ -384,13 +392,11 @@ def parse_data(self) -> list[dict[str, str | None]]:
validate xpath, names, optionally parse and run XSLT,
and parse original or transformed XML and return specific nodes.
"""
- from lxml.etree import XML
-
- self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+ self.xml_doc = self._parse_doc(self.path_or_buffer)
if self.stylesheet is not None:
- self.xsl_doc = XML(self._parse_doc(self.stylesheet))
- self.xml_doc = XML(self._transform_doc())
+ self.xsl_doc = self._parse_doc(self.stylesheet)
+ self.xml_doc = self._transform_doc()
self._validate_path()
self._validate_names()
@@ -527,12 +533,13 @@ def _validate_names(self) -> None:
f"{type(self.names).__name__} is not a valid type for names"
)
- def _parse_doc(self, raw_doc) -> bytes:
+ def _parse_doc(
+ self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+ ) -> _Element:
from lxml.etree import (
XMLParser,
fromstring,
parse,
- tostring,
)
handle_data = get_data_from_filepath(
@@ -557,9 +564,9 @@ def _parse_doc(self, raw_doc) -> bytes:
else:
doc = parse(xml_data, parser=curr_parser)
- return tostring(doc)
+ return doc
- def _transform_doc(self) -> bytes:
+ def _transform_doc(self) -> _XSLTResultTree:
"""
Transform original tree using stylesheet.
@@ -572,7 +579,7 @@ def _transform_doc(self) -> bytes:
transformer = XSLT(self.xsl_doc)
new_doc = transformer(self.xml_doc)
- return bytes(new_doc)
+ return new_doc
def get_data_from_filepath(
diff --git a/pandas/tests/io/data/xml/doc_ch_utf.xml b/pandas/tests/io/data/xml/doc_ch_utf.xml
new file mode 100644
index 0000000000000..fde215b89646b
--- /dev/null
+++ b/pandas/tests/io/data/xml/doc_ch_utf.xml
@@ -0,0 +1,29 @@
+
+
+
+
+
+
+
+
+
+
+]>
+
+
+
+ <問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正問>
+ <答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
+ 故大品經 善吉 致問 何等是菩薩道 何等非菩薩道
+ 佛答云 有所得非菩薩道 無所得是菩薩道答>
+
+
+ <問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不問>
+ 答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也
+
+
+ <問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶問>
+ <答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破答>
+
+
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index c3d4d635a36be..8ce70f998f337 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -358,6 +358,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
tm.assert_frame_equal(df_str, df_expected)
+def test_string_charset(parser):
+ txt = "<中文標籤>12
中文標籤>"
+
+ df_str = read_xml(txt, parser=parser)
+
+ df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
+
+ tm.assert_frame_equal(df_str, df_expected)
+
+
+def test_file_charset(datapath, parser):
+    # GH 47902: XML tags written in Chinese characters must parse from a file.
+    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")
+    df_file = read_xml(xml_file, parser=parser)
+
+    df_expected = DataFrame(
+        {
+            "問": [
+                "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
+                "問 既破有得申無得 亦應但破性執申假名以不",
+                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
+            ],
+            "答": [
+                "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
+                None,
+                "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
+            ],
+            "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
+        }
+    )
+
+    tm.assert_frame_equal(df_file, df_expected)
+
+
def test_file_handle_close(datapath, parser):
xml_file = datapath("io", "data", "xml", "books.xml")
@@ -878,6 +912,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
tm.assert_frame_equal(df_kml, df_style)
+@td.skip_if_no("lxml")
+def test_style_charset():
+ xml = "<中文標籤>12
中文標籤>"
+
+ xsl = """\
+
+
+
+
+
+
+
+
+
+
+
+ <根>
+
+ 根>
+
+
+"""
+
+ df_orig = read_xml(xml)
+ df_style = read_xml(xml, stylesheet=xsl)
+
+ tm.assert_frame_equal(df_orig, df_style)
+
+
@td.skip_if_no("lxml")
def test_not_stylesheet(datapath):
from lxml.etree import XSLTParseError