Skip to content

Backport PR #47905: BUG: Fix read_xml raising syntax error when reading XML with Chinese tags #47925

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Bug fixes
~~~~~~~~~
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
- Bug in :meth:`DataFrame.to_sql` where a ``callable`` passed as ``method`` that did not return an ``int`` would raise a ``TypeError`` (:issue:`46891`)
- Bug in :func:`read_xml` where reading XML files with Chinese character tags would raise ``XMLSyntaxError`` (:issue:`47902`)

.. ---------------------------------------------------------------------------

Expand Down
43 changes: 25 additions & 18 deletions pandas/io/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Sequence

from pandas._typing import (
TYPE_CHECKING,
CompressionOptions,
FilePath,
ReadBuffer,
Expand Down Expand Up @@ -38,6 +39,14 @@
)
from pandas.io.parsers import TextParser

if TYPE_CHECKING:
from xml.etree.ElementTree import Element

from lxml.etree import (
_Element,
_XSLTResultTree,
)


@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
class _XMLFrameParser:
Expand Down Expand Up @@ -189,7 +198,7 @@ def _validate_names(self) -> None:
"""
raise AbstractMethodError(self)

def _parse_doc(self, raw_doc) -> bytes:
def _parse_doc(self, raw_doc) -> Element | _Element:
"""
Build tree from path_or_buffer.

Expand All @@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
"""

def parse_data(self) -> list[dict[str, str | None]]:
from xml.etree.ElementTree import XML

if self.stylesheet is not None:
raise ValueError(
"To use stylesheet, you need lxml installed and selected as parser."
)

self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
self.xml_doc = self._parse_doc(self.path_or_buffer)

self._validate_path()
self._validate_names()
Expand Down Expand Up @@ -348,11 +355,12 @@ def _validate_names(self) -> None:
f"{type(self.names).__name__} is not a valid type for names"
)

def _parse_doc(self, raw_doc) -> bytes:
def _parse_doc(
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
) -> Element:
from xml.etree.ElementTree import (
XMLParser,
parse,
tostring,
)

handle_data = get_data_from_filepath(
Expand All @@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:

with preprocess_data(handle_data) as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
r = parse(xml_data, parser=curr_parser)
doc = parse(xml_data, parser=curr_parser)

return tostring(r.getroot())
return doc.getroot()


class _LxmlFrameParser(_XMLFrameParser):
Expand All @@ -384,13 +392,11 @@ def parse_data(self) -> list[dict[str, str | None]]:
validate xpath, names, optionally parse and run XSLT,
and parse original or transformed XML and return specific nodes.
"""
from lxml.etree import XML

self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
self.xml_doc = self._parse_doc(self.path_or_buffer)

if self.stylesheet is not None:
self.xsl_doc = XML(self._parse_doc(self.stylesheet))
self.xml_doc = XML(self._transform_doc())
self.xsl_doc = self._parse_doc(self.stylesheet)
self.xml_doc = self._transform_doc()

self._validate_path()
self._validate_names()
Expand Down Expand Up @@ -527,12 +533,13 @@ def _validate_names(self) -> None:
f"{type(self.names).__name__} is not a valid type for names"
)

def _parse_doc(self, raw_doc) -> bytes:
def _parse_doc(
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
) -> _Element:
from lxml.etree import (
XMLParser,
fromstring,
parse,
tostring,
)

handle_data = get_data_from_filepath(
Expand All @@ -557,9 +564,9 @@ def _parse_doc(self, raw_doc) -> bytes:
else:
doc = parse(xml_data, parser=curr_parser)

return tostring(doc)
return doc

def _transform_doc(self) -> bytes:
def _transform_doc(self) -> _XSLTResultTree:
"""
Transform original tree using stylesheet.

Expand All @@ -572,7 +579,7 @@ def _transform_doc(self) -> bytes:
transformer = XSLT(self.xsl_doc)
new_doc = transformer(self.xml_doc)

return bytes(new_doc)
return new_doc


def get_data_from_filepath(
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/io/data/xml/doc_ch_utf.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE fragmentdoc [
<!ELEMENT qafragment (qa+)>
<!ELEMENT qa ( q, a )>
<!ELEMENT q ( #PCDATA | title | name)*>
<!ATTLIST q speaker CDATA #REQUIRED>
<!ELEMENT a ( #PCDATA | title | name)*>
<!ATTLIST a speaker CDATA #REQUIRED>
<!ELEMENT name (#PCDATA)>
<!ELEMENT title (#PCDATA)>
<!ENTITY C4-4F71 "Sorry, this is Big5 only">
]>

<qafragment>
<qa>
<問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正</問>
<答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
故<title>大品經</title> <name>善吉</name> 致問 何等是菩薩道 何等非菩薩道
<name>佛</name>答云 有所得非菩薩道 無所得是菩薩道</答>
</qa>
<qa>
<問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不</問>
<a speaker="吉藏">答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也</a>
</qa>
<qa>
<問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶</問>
<答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破</答>
</qa>
</qafragment>
63 changes: 63 additions & 0 deletions pandas/tests/io/xml/test_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
tm.assert_frame_equal(df_str, df_expected)


def test_string_charset(parser):
    """
    Parse an XML literal whose root tag consists of Chinese characters.

    ``parser`` is forwarded unchanged to ``read_xml``; the resulting frame
    must hold the single ``<row>`` with its two integer children.
    """
    # Non-ASCII root element wrapping one <row> with children c1 and c2.
    xml_text = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
    expected = DataFrame({"c1": 1, "c2": 2}, index=[0])

    result = read_xml(xml_text, parser=parser)

    tm.assert_frame_equal(result, expected)


def test_file_charset(datapath, parser):
    """
    Read the ``doc_ch_utf.xml`` data file, whose element tags are Chinese.

    ``datapath`` is used once to locate the file and ``parser`` is forwarded
    unchanged to ``read_xml``. The parsed frame is compared against the
    text of each ``<qa>`` entry's children.
    """
    # The location is resolved a single time; the resolved path is handed to
    # ``read_xml`` directly instead of being passed through ``datapath`` again.
    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")

    df_file = read_xml(xml_file, parser=parser)

    # The second <qa> entry in the data file uses an <a> tag instead of <答>,
    # which is why the "答" and "a" columns have complementary None values.
    # For the first <答>, only the text that precedes its first nested
    # element is expected.
    df_expected = DataFrame(
        {
            "問": [
                "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
                "問 既破有得申無得 亦應但破性執申假名以不",
                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
            ],
            "答": [
                "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
                None,
                "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
            ],
            "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
        }
    )

    tm.assert_frame_equal(df_file, df_expected)


def test_file_handle_close(datapath, parser):
xml_file = datapath("io", "data", "xml", "books.xml")

Expand Down Expand Up @@ -878,6 +912,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
tm.assert_frame_equal(df_kml, df_style)


@td.skip_if_no("lxml")
def test_style_charset():
    """
    Check that an XSLT renaming a Chinese root tag keeps the parsed data intact.

    The same XML literal is read once directly and once through a stylesheet;
    both resulting frames must be equal.
    """
    source_xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"

    # Copy-everything template plus one template that rewrites the root
    # element <中文標籤> as <根>.
    stylesheet = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<xsl:template match="中文標籤">
<根>
<xsl:apply-templates />
</根>
</xsl:template>

</xsl:stylesheet>"""

    untransformed = read_xml(source_xml)
    transformed = read_xml(source_xml, stylesheet=stylesheet)

    tm.assert_frame_equal(untransformed, transformed)


@td.skip_if_no("lxml")
def test_not_stylesheet(datapath):
from lxml.etree import XSLTParseError
Expand Down