diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst
index 6bd7378e05404..89487bfde94a5 100644
--- a/doc/source/whatsnew/v1.4.4.rst
+++ b/doc/source/whatsnew/v1.4.4.rst
@@ -26,6 +26,7 @@ Bug fixes
~~~~~~~~~
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
+- Bug in :func:`read_xml` raising ``XMLSyntaxError`` when reading XML files with Chinese character tags (:issue:`47902`)
.. ---------------------------------------------------------------------------
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 9b6eb31dafc07..d52482fe2ef5a 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -12,6 +12,7 @@
)
from pandas._typing import (
+ TYPE_CHECKING,
CompressionOptions,
ConvertersArg,
DtypeArg,
@@ -46,6 +47,14 @@
)
from pandas.io.parsers import TextParser
+if TYPE_CHECKING:
+ from xml.etree.ElementTree import Element
+
+ from lxml.etree import (
+ _Element,
+ _XSLTResultTree,
+ )
+
@doc(
storage_options=_shared_docs["storage_options"],
@@ -410,7 +419,7 @@ def _validate_names(self) -> None:
def _parse_doc(
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> bytes:
+ ) -> Element | _Element:
"""
Build tree from path_or_buffer.
@@ -427,10 +436,7 @@ class _EtreeFrameParser(_XMLFrameParser):
"""
def parse_data(self) -> list[dict[str, str | None]]:
- from xml.etree.ElementTree import (
- XML,
- iterparse,
- )
+ from xml.etree.ElementTree import iterparse
if self.stylesheet is not None:
raise ValueError(
@@ -438,7 +444,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
)
if self.iterparse is None:
- self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+ self.xml_doc = self._parse_doc(self.path_or_buffer)
self._validate_path()
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
@@ -503,11 +509,10 @@ def _validate_names(self) -> None:
def _parse_doc(
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> bytes:
+ ) -> Element:
from xml.etree.ElementTree import (
XMLParser,
parse,
- tostring,
)
handle_data = get_data_from_filepath(
@@ -519,9 +524,9 @@ def _parse_doc(
with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
- r = parse(xml_data, parser=curr_parser)
+ doc = parse(xml_data, parser=curr_parser)
- return tostring(r.getroot())
+ return doc.getroot()
class _LxmlFrameParser(_XMLFrameParser):
@@ -539,17 +544,14 @@ def parse_data(self) -> list[dict[str, str | None]]:
validate xpath, names, optionally parse and run XSLT,
and parse original or transformed XML and return specific nodes.
"""
- from lxml.etree import (
- XML,
- iterparse,
- )
+ from lxml.etree import iterparse
if self.iterparse is None:
- self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
+ self.xml_doc = self._parse_doc(self.path_or_buffer)
if self.stylesheet:
- self.xsl_doc = XML(self._parse_doc(self.stylesheet))
- self.xml_doc = XML(self._transform_doc())
+ self.xsl_doc = self._parse_doc(self.stylesheet)
+ self.xml_doc = self._transform_doc()
self._validate_path()
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
@@ -607,12 +609,11 @@ def _validate_names(self) -> None:
def _parse_doc(
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> bytes:
+ ) -> _Element:
from lxml.etree import (
XMLParser,
fromstring,
parse,
- tostring,
)
handle_data = get_data_from_filepath(
@@ -637,9 +638,9 @@ def _parse_doc(
else:
doc = parse(xml_data, parser=curr_parser)
- return tostring(doc)
+ return doc
- def _transform_doc(self) -> bytes:
+ def _transform_doc(self) -> _XSLTResultTree:
"""
Transform original tree using stylesheet.
@@ -652,7 +653,7 @@ def _transform_doc(self) -> bytes:
transformer = XSLT(self.xsl_doc)
new_doc = transformer(self.xml_doc)
- return bytes(new_doc)
+ return new_doc
def get_data_from_filepath(
diff --git a/pandas/tests/io/data/xml/doc_ch_utf.xml b/pandas/tests/io/data/xml/doc_ch_utf.xml
new file mode 100644
index 0000000000000..fde215b89646b
--- /dev/null
+++ b/pandas/tests/io/data/xml/doc_ch_utf.xml
@@ -0,0 +1,29 @@
+
+
+
+
+
+
+
+
+
+
+]>
+
+
+
+ <問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正</問>
+ <答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
+ 故大品經 善吉 致問 何等是菩薩道 何等非菩薩道
+ 佛答云 有所得非菩薩道 無所得是菩薩道</答>
+
+
+ <問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不</問>
+ 答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也
+
+
+ <問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶</問>
+ <答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破</答>
+
+
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 410c5f6703dcd..fd4ba87bd302c 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -423,6 +423,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
tm.assert_frame_equal(df_str, df_expected)
+def test_string_charset(parser):
+    """Check that read_xml parses an XML string whose root tag is non-ASCII."""
+    # One row with two child elements; the root tag uses Chinese characters.
+    txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
+
+    df_str = read_xml(txt, parser=parser)
+
+    df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
+
+    tm.assert_frame_equal(df_str, df_expected)
+
+
+def test_file_charset(datapath, parser):
+    """Check that read_xml parses the doc_ch_utf.xml fixture with Chinese tags."""
+    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")
+
+    # xml_file is already resolved by datapath; do not resolve it a second time.
+    df_file = read_xml(xml_file, parser=parser)
+
+    df_expected = DataFrame(
+        {
+            "問": [
+                "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
+                "問 既破有得申無得 亦應但破性執申假名以不",
+                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
+            ],
+            "答": [
+                "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
+                None,
+                "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
+            ],
+            "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
+        }
+    )
+
+    tm.assert_frame_equal(df_file, df_expected)
+
+
def test_file_handle_close(datapath, parser):
xml_file = datapath("io", "data", "xml", "books.xml")
@@ -1086,6 +1120,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
tm.assert_frame_equal(df_kml, df_style)
+@td.skip_if_no("lxml")
+def test_style_charset():
+    """Check that read_xml applies a stylesheet that uses non-ASCII tags."""
+    xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
+
+    # Identity transform that only renames the root element, so the rows
+    # parsed after the transform must equal the rows parsed without it.
+    xsl = """\
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output omit-xml-declaration="yes" indent="yes"/>
+ <xsl:strip-space elements="*"/>
+
+ <xsl:template match="node()|@*">
+     <xsl:copy>
+          <xsl:apply-templates select="node()|@*"/>
+     </xsl:copy>
+ </xsl:template>
+
+ <xsl:template match="中文標籤">
+     <根>
+         <xsl:apply-templates />
+     </根>
+ </xsl:template>
+
+</xsl:stylesheet>"""
+
+    df_orig = read_xml(xml)
+    df_style = read_xml(xml, stylesheet=xsl)
+
+    tm.assert_frame_equal(df_orig, df_style)
+
+
@td.skip_if_no("lxml")
def test_not_stylesheet(datapath):
from lxml.etree import XSLTParseError