diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 6bd7378e05404..89487bfde94a5 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -26,6 +26,7 @@ Bug fixes ~~~~~~~~~ - The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`) - Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`) +- Bug in :func:`read_xml` where reading XML files with Chinese character tags would raise ``XMLSyntaxError`` (:issue:`47902`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/xml.py b/pandas/io/xml.py index ad87b18bd1683..6cef494ab0aa3 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -8,6 +8,7 @@ from typing import Sequence from pandas._typing import ( + TYPE_CHECKING, CompressionOptions, FilePath, ReadBuffer, @@ -38,6 +39,14 @@ ) from pandas.io.parsers import TextParser +if TYPE_CHECKING: + from xml.etree.ElementTree import Element + + from lxml.etree import ( + _Element, + _XSLTResultTree, + ) + @doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer") class _XMLFrameParser: @@ -189,7 +198,7 @@ def _validate_names(self) -> None: """ raise AbstractMethodError(self) - def _parse_doc(self, raw_doc) -> bytes: + def _parse_doc(self, raw_doc) -> Element | _Element: """ Build tree from path_or_buffer. @@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser): """ def parse_data(self) -> list[dict[str, str | None]]: - from xml.etree.ElementTree import XML - if self.stylesheet is not None: raise ValueError( "To use stylesheet, you need lxml installed and selected as parser." 
) - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self.xml_doc = self._parse_doc(self.path_or_buffer) self._validate_path() self._validate_names() @@ -348,11 +355,12 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self, raw_doc) -> bytes: + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> Element: from xml.etree.ElementTree import ( XMLParser, parse, - tostring, ) handle_data = get_data_from_filepath( @@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes: with preprocess_data(handle_data) as xml_data: curr_parser = XMLParser(encoding=self.encoding) - r = parse(xml_data, parser=curr_parser) + doc = parse(xml_data, parser=curr_parser) - return tostring(r.getroot()) + return doc.getroot() class _LxmlFrameParser(_XMLFrameParser): @@ -384,13 +392,11 @@ def parse_data(self) -> list[dict[str, str | None]]: validate xpath, names, optionally parse and run XSLT, and parse original or transformed XML and return specific nodes. 
""" - from lxml.etree import XML - - self.xml_doc = XML(self._parse_doc(self.path_or_buffer)) + self.xml_doc = self._parse_doc(self.path_or_buffer) if self.stylesheet is not None: - self.xsl_doc = XML(self._parse_doc(self.stylesheet)) - self.xml_doc = XML(self._transform_doc()) + self.xsl_doc = self._parse_doc(self.stylesheet) + self.xml_doc = self._transform_doc() self._validate_path() self._validate_names() @@ -527,12 +533,13 @@ def _validate_names(self) -> None: f"{type(self.names).__name__} is not a valid type for names" ) - def _parse_doc(self, raw_doc) -> bytes: + def _parse_doc( + self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] + ) -> _Element: from lxml.etree import ( XMLParser, fromstring, parse, - tostring, ) handle_data = get_data_from_filepath( @@ -557,9 +564,9 @@ def _parse_doc(self, raw_doc) -> bytes: else: doc = parse(xml_data, parser=curr_parser) - return tostring(doc) + return doc - def _transform_doc(self) -> bytes: + def _transform_doc(self) -> _XSLTResultTree: """ Transform original tree using stylesheet. 
@@ -572,7 +579,7 @@ def _transform_doc(self) -> bytes: transformer = XSLT(self.xsl_doc) new_doc = transformer(self.xml_doc) - return bytes(new_doc) + return new_doc def get_data_from_filepath( diff --git a/pandas/tests/io/data/xml/doc_ch_utf.xml b/pandas/tests/io/data/xml/doc_ch_utf.xml new file mode 100644 index 0000000000000..fde215b89646b --- /dev/null +++ b/pandas/tests/io/data/xml/doc_ch_utf.xml @@ -0,0 +1,29 @@ + + + + + + + + + + +]> + + + + <問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正 + <答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申 + 故大品經 善吉 致問 何等是菩薩道 何等非菩薩道 + 答云 有所得非菩薩道 無所得是菩薩道 + + + <問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不 + 答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也 + + + <問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶 + <答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破 + + diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index c3d4d635a36be..8ce70f998f337 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -358,6 +358,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): tm.assert_frame_equal(df_str, df_expected) +def test_string_charset(parser): + txt = "<中文標籤>12" + + df_str = read_xml(txt, parser=parser) + + df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0]) + + tm.assert_frame_equal(df_str, df_expected) + + +def test_file_charset(datapath, parser): + xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml") + + df_file = read_xml(datapath(xml_file), parser=parser) + + df_expected = DataFrame( + { + "問": [ + "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正", + "問 既破有得申無得 亦應但破性執申假名以不", + "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶", + ], + "答": [ + "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故", + None, + "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破", + ], + "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None], + } + ) + + tm.assert_frame_equal(df_file, df_expected) + + def test_file_handle_close(datapath, parser): xml_file = datapath("io", "data", "xml", 
"books.xml") @@ -878,6 +912,35 @@ def test_stylesheet_buffered_reader(datapath, mode): tm.assert_frame_equal(df_kml, df_style) +@td.skip_if_no("lxml") +def test_style_charset(): + xml = "<中文標籤>12" + + xsl = """\ + + + + + + + + + + + + <根> + + + + +""" + + df_orig = read_xml(xml) + df_style = read_xml(xml, stylesheet=xsl) + + tm.assert_frame_equal(df_orig, df_style) + + @td.skip_if_no("lxml") def test_not_stylesheet(datapath): from lxml.etree import XSLTParseError