8
8
from typing import Sequence
9
9
10
10
from pandas ._typing import (
11
+ TYPE_CHECKING ,
11
12
CompressionOptions ,
12
13
FilePath ,
13
14
ReadBuffer ,
38
39
)
39
40
from pandas .io .parsers import TextParser
40
41
42
+ if TYPE_CHECKING :
43
+ from xml .etree .ElementTree import Element
44
+
45
+ from lxml .etree import (
46
+ _Element ,
47
+ _XSLTResultTree ,
48
+ )
49
+
41
50
42
51
@doc (decompression_options = _shared_docs ["decompression_options" ] % "path_or_buffer" )
43
52
class _XMLFrameParser :
@@ -189,7 +198,7 @@ def _validate_names(self) -> None:
189
198
"""
190
199
raise AbstractMethodError (self )
191
200
192
- def _parse_doc (self , raw_doc ) -> bytes :
201
+ def _parse_doc (self , raw_doc ) -> Element | _Element :
193
202
"""
194
203
Build tree from path_or_buffer.
195
204
@@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
206
215
"""
207
216
208
217
def parse_data (self ) -> list [dict [str , str | None ]]:
209
- from xml .etree .ElementTree import XML
210
-
211
218
if self .stylesheet is not None :
212
219
raise ValueError (
213
220
"To use stylesheet, you need lxml installed and selected as parser."
214
221
)
215
222
216
- self .xml_doc = XML ( self ._parse_doc (self .path_or_buffer ) )
223
+ self .xml_doc = self ._parse_doc (self .path_or_buffer )
217
224
218
225
self ._validate_path ()
219
226
self ._validate_names ()
@@ -348,11 +355,12 @@ def _validate_names(self) -> None:
348
355
f"{ type (self .names ).__name__ } is not a valid type for names"
349
356
)
350
357
351
- def _parse_doc (self , raw_doc ) -> bytes :
358
+ def _parse_doc (
359
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
360
+ ) -> Element :
352
361
from xml .etree .ElementTree import (
353
362
XMLParser ,
354
363
parse ,
355
- tostring ,
356
364
)
357
365
358
366
handle_data = get_data_from_filepath (
@@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:
364
372
365
373
with preprocess_data (handle_data ) as xml_data :
366
374
curr_parser = XMLParser (encoding = self .encoding )
367
- r = parse (xml_data , parser = curr_parser )
375
+ doc = parse (xml_data , parser = curr_parser )
368
376
369
- return tostring ( r .getroot () )
377
+ return doc .getroot ()
370
378
371
379
372
380
class _LxmlFrameParser (_XMLFrameParser ):
@@ -384,13 +392,11 @@ def parse_data(self) -> list[dict[str, str | None]]:
384
392
validate xpath, names, optionally parse and run XSLT,
385
393
and parse original or transformed XML and return specific nodes.
386
394
"""
387
- from lxml .etree import XML
388
-
389
- self .xml_doc = XML (self ._parse_doc (self .path_or_buffer ))
395
+ self .xml_doc = self ._parse_doc (self .path_or_buffer )
390
396
391
397
if self .stylesheet is not None :
392
- self .xsl_doc = XML ( self ._parse_doc (self .stylesheet ) )
393
- self .xml_doc = XML ( self ._transform_doc () )
398
+ self .xsl_doc = self ._parse_doc (self .stylesheet )
399
+ self .xml_doc = self ._transform_doc ()
394
400
395
401
self ._validate_path ()
396
402
self ._validate_names ()
@@ -527,12 +533,13 @@ def _validate_names(self) -> None:
527
533
f"{ type (self .names ).__name__ } is not a valid type for names"
528
534
)
529
535
530
- def _parse_doc (self , raw_doc ) -> bytes :
536
+ def _parse_doc (
537
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
538
+ ) -> _Element :
531
539
from lxml .etree import (
532
540
XMLParser ,
533
541
fromstring ,
534
542
parse ,
535
- tostring ,
536
543
)
537
544
538
545
handle_data = get_data_from_filepath (
@@ -557,9 +564,9 @@ def _parse_doc(self, raw_doc) -> bytes:
557
564
else :
558
565
doc = parse (xml_data , parser = curr_parser )
559
566
560
- return tostring ( doc )
567
+ return doc
561
568
562
- def _transform_doc (self ) -> bytes :
569
+ def _transform_doc (self ) -> _XSLTResultTree :
563
570
"""
564
571
Transform original tree using stylesheet.
565
572
@@ -572,7 +579,7 @@ def _transform_doc(self) -> bytes:
572
579
transformer = XSLT (self .xsl_doc )
573
580
new_doc = transformer (self .xml_doc )
574
581
575
- return bytes ( new_doc )
582
+ return new_doc
576
583
577
584
578
585
def get_data_from_filepath (
0 commit comments