Skip to content

Commit 4003bea

Browse files
committed
Backport PR pandas-dev#47905: BUG: Fix read_xml raising syntax error when reading XML with Chinese tags
1 parent 9fb91db commit 4003bea

File tree

4 files changed

+121
-18
lines changed

4 files changed

+121
-18
lines changed

doc/source/whatsnew/v1.4.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Bug fixes
2626
~~~~~~~~~
2727
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
2828
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
29+
- Bug in :func:`read_xml` where reading XML files with Chinese character tags would raise ``XMLSyntaxError`` (:issue:`47902`)
2930

3031
.. ---------------------------------------------------------------------------
3132

pandas/io/xml.py

+28-18
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Sequence
99

1010
from pandas._typing import (
11+
TYPE_CHECKING,
1112
CompressionOptions,
1213
FilePath,
1314
ReadBuffer,
@@ -38,6 +39,14 @@
3839
)
3940
from pandas.io.parsers import TextParser
4041

42+
if TYPE_CHECKING:
43+
from xml.etree.ElementTree import Element
44+
45+
from lxml.etree import (
46+
_Element,
47+
_XSLTResultTree,
48+
)
49+
4150

4251
@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer")
4352
class _XMLFrameParser:
@@ -189,7 +198,7 @@ def _validate_names(self) -> None:
189198
"""
190199
raise AbstractMethodError(self)
191200

192-
def _parse_doc(self, raw_doc) -> bytes:
201+
def _parse_doc(self, raw_doc) -> Element | _Element:
193202
"""
194203
Build tree from path_or_buffer.
195204
@@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
206215
"""
207216

208217
def parse_data(self) -> list[dict[str, str | None]]:
209-
from xml.etree.ElementTree import XML
210-
211218
if self.stylesheet is not None:
212219
raise ValueError(
213220
"To use stylesheet, you need lxml installed and selected as parser."
214221
)
215222

216-
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
223+
self.xml_doc = self._parse_doc(self.path_or_buffer)
217224

218225
self._validate_path()
219226
self._validate_names()
@@ -348,11 +355,12 @@ def _validate_names(self) -> None:
348355
f"{type(self.names).__name__} is not a valid type for names"
349356
)
350357

351-
def _parse_doc(self, raw_doc) -> bytes:
358+
def _parse_doc(
359+
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
360+
) -> Element:
352361
from xml.etree.ElementTree import (
353362
XMLParser,
354363
parse,
355-
tostring,
356364
)
357365

358366
handle_data = get_data_from_filepath(
@@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:
364372

365373
with preprocess_data(handle_data) as xml_data:
366374
curr_parser = XMLParser(encoding=self.encoding)
367-
r = parse(xml_data, parser=curr_parser)
375+
doc = parse(xml_data, parser=curr_parser)
368376

369-
return tostring(r.getroot())
377+
return doc.getroot()
370378

371379

372380
class _LxmlFrameParser(_XMLFrameParser):
@@ -384,13 +392,14 @@ def parse_data(self) -> list[dict[str, str | None]]:
384392
validate xpath, names, optionally parse and run XSLT,
385393
and parse original or transformed XML and return specific nodes.
386394
"""
387-
from lxml.etree import XML
395+
self.xml_doc = self._parse_doc(self.path_or_buffer)
388396

389-
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
397+
if self.stylesheet:
398+
self.xsl_doc = self._parse_doc(self.stylesheet)
399+
self.xml_doc = self._transform_doc()
390400

391-
if self.stylesheet is not None:
392-
self.xsl_doc = XML(self._parse_doc(self.stylesheet))
393-
self.xml_doc = XML(self._transform_doc())
401+
self._validate_path()
402+
self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
394403

395404
self._validate_path()
396405
self._validate_names()
@@ -527,12 +536,13 @@ def _validate_names(self) -> None:
527536
f"{type(self.names).__name__} is not a valid type for names"
528537
)
529538

530-
def _parse_doc(self, raw_doc) -> bytes:
539+
def _parse_doc(
540+
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
541+
) -> _Element:
531542
from lxml.etree import (
532543
XMLParser,
533544
fromstring,
534545
parse,
535-
tostring,
536546
)
537547

538548
handle_data = get_data_from_filepath(
@@ -557,9 +567,9 @@ def _parse_doc(self, raw_doc) -> bytes:
557567
else:
558568
doc = parse(xml_data, parser=curr_parser)
559569

560-
return tostring(doc)
570+
return doc
561571

562-
def _transform_doc(self) -> bytes:
572+
def _transform_doc(self) -> _XSLTResultTree:
563573
"""
564574
Transform original tree using stylesheet.
565575
@@ -572,7 +582,7 @@ def _transform_doc(self) -> bytes:
572582
transformer = XSLT(self.xsl_doc)
573583
new_doc = transformer(self.xml_doc)
574584

575-
return bytes(new_doc)
585+
return new_doc
576586

577587

578588
def get_data_from_filepath(
+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE fragmentdoc [
3+
<!ELEMENT qafragment (qa+)>
4+
<!ELEMENT qa ( q, a )>
5+
<!ELEMENT q ( #PCDATA | title | name)*>
6+
<!ATTLIST q speaker CDATA #REQUIRED>
7+
<!ELEMENT a ( #PCDATA | title | name)*>
8+
<!ATTLIST a speaker CDATA #REQUIRED>
9+
<!ELEMENT name (#PCDATA)>
10+
<!ELEMENT title (#PCDATA)>
11+
<!ENTITY C4-4F71 "Sorry, this is Big5 only">
12+
]>
13+
14+
<qafragment>
15+
<qa>
16+
<問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正</問>
17+
<答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
18+
故<title>大品經</title> <name>善吉</name> 致問 何等是菩薩道 何等非菩薩道
19+
<name>佛</name>答云 有所得非菩薩道 無所得是菩薩道</答>
20+
</qa>
21+
<qa>
22+
<問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不</問>
23+
<a speaker="吉藏">答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也</a>
24+
</qa>
25+
<qa>
26+
<問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶</問>
27+
<答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破</答>
28+
</qa>
29+
</qafragment>

pandas/tests/io/xml/test_xml.py

+63
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
358358
tm.assert_frame_equal(df_str, df_expected)
359359

360360

361+
def test_string_charset(parser):
    """Check that ``read_xml`` parses an XML string whose root tag is non-ASCII."""
    expected = DataFrame({"c1": 1, "c2": 2}, index=[0])

    result = read_xml(
        "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>", parser=parser
    )

    tm.assert_frame_equal(result, expected)
369+
370+
371+
def test_file_charset(datapath, parser):
    """Check that ``read_xml`` parses a UTF-8 file whose element tags are Chinese.

    The fixture mixes ``答`` and ``a`` answer tags across rows, so each of
    those columns is expected to hold ``None`` where the other tag was used.
    """
    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")

    # ``xml_file`` is already the value returned by ``datapath``; pass it
    # straight through instead of wrapping it in ``datapath`` a second time.
    df_file = read_xml(xml_file, parser=parser)

    df_expected = DataFrame(
        {
            "問": [
                "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
                "問 既破有得申無得 亦應但破性執申假名以不",
                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
            ],
            "答": [
                "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
                None,
                "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
            ],
            "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
        }
    )

    tm.assert_frame_equal(df_file, df_expected)
393+
394+
361395
def test_file_handle_close(datapath, parser):
362396
xml_file = datapath("io", "data", "xml", "books.xml")
363397

@@ -878,6 +912,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
878912
tm.assert_frame_equal(df_kml, df_style)
879913

880914

915+
@td.skip_if_no("lxml")
def test_style_charset():
    """Check that an XSLT renaming a non-ASCII root tag leaves the frame unchanged.

    The stylesheet copies every node and attribute as-is, except that the
    ``中文標籤`` element is re-emitted as ``根``.
    """
    xml_txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"

    stylesheet = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>

<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<xsl:template match="中文標籤">
<根>
<xsl:apply-templates />
</根>
</xsl:template>

</xsl:stylesheet>"""

    # Parse once without and once with the stylesheet, then compare.
    untransformed = read_xml(xml_txt)
    transformed = read_xml(xml_txt, stylesheet=stylesheet)

    tm.assert_frame_equal(untransformed, transformed)
942+
943+
881944
@td.skip_if_no("lxml")
882945
def test_not_stylesheet(datapath):
883946
from lxml.etree import XSLTParseError

0 commit comments

Comments
 (0)