8
8
from typing import Sequence
9
9
10
10
from pandas ._typing import (
11
+ TYPE_CHECKING ,
11
12
CompressionOptions ,
12
13
FilePath ,
13
14
ReadBuffer ,
38
39
)
39
40
from pandas .io .parsers import TextParser
40
41
42
+ if TYPE_CHECKING :
43
+ from xml .etree .ElementTree import Element
44
+
45
+ from lxml .etree import (
46
+ _Element ,
47
+ _XSLTResultTree ,
48
+ )
49
+
41
50
42
51
@doc (decompression_options = _shared_docs ["decompression_options" ] % "path_or_buffer" )
43
52
class _XMLFrameParser :
@@ -189,7 +198,7 @@ def _validate_names(self) -> None:
189
198
"""
190
199
raise AbstractMethodError (self )
191
200
192
- def _parse_doc (self , raw_doc ) -> bytes :
201
+ def _parse_doc (self , raw_doc ) -> Element | _Element :
193
202
"""
194
203
Build tree from path_or_buffer.
195
204
@@ -206,14 +215,12 @@ class _EtreeFrameParser(_XMLFrameParser):
206
215
"""
207
216
208
217
def parse_data (self ) -> list [dict [str , str | None ]]:
209
- from xml .etree .ElementTree import XML
210
-
211
218
if self .stylesheet is not None :
212
219
raise ValueError (
213
220
"To use stylesheet, you need lxml installed and selected as parser."
214
221
)
215
222
216
- self .xml_doc = XML ( self ._parse_doc (self .path_or_buffer ) )
223
+ self .xml_doc = self ._parse_doc (self .path_or_buffer )
217
224
218
225
self ._validate_path ()
219
226
self ._validate_names ()
@@ -348,11 +355,12 @@ def _validate_names(self) -> None:
348
355
f"{ type (self .names ).__name__ } is not a valid type for names"
349
356
)
350
357
351
- def _parse_doc (self , raw_doc ) -> bytes :
358
+ def _parse_doc (
359
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
360
+ ) -> Element :
352
361
from xml .etree .ElementTree import (
353
362
XMLParser ,
354
363
parse ,
355
- tostring ,
356
364
)
357
365
358
366
handle_data = get_data_from_filepath (
@@ -364,9 +372,9 @@ def _parse_doc(self, raw_doc) -> bytes:
364
372
365
373
with preprocess_data (handle_data ) as xml_data :
366
374
curr_parser = XMLParser (encoding = self .encoding )
367
- r = parse (xml_data , parser = curr_parser )
375
+ doc = parse (xml_data , parser = curr_parser )
368
376
369
- return tostring ( r .getroot () )
377
+ return doc .getroot ()
370
378
371
379
372
380
class _LxmlFrameParser (_XMLFrameParser ):
@@ -384,13 +392,14 @@ def parse_data(self) -> list[dict[str, str | None]]:
384
392
validate xpath, names, optionally parse and run XSLT,
385
393
and parse original or transformed XML and return specific nodes.
386
394
"""
387
- from lxml . etree import XML
395
+ self . xml_doc = self . _parse_doc ( self . path_or_buffer )
388
396
389
- self .xml_doc = XML (self ._parse_doc (self .path_or_buffer ))
397
+ if self .stylesheet :
398
+ self .xsl_doc = self ._parse_doc (self .stylesheet )
399
+ self .xml_doc = self ._transform_doc ()
390
400
391
- if self .stylesheet is not None :
392
- self .xsl_doc = XML (self ._parse_doc (self .stylesheet ))
393
- self .xml_doc = XML (self ._transform_doc ())
401
+ self ._validate_path ()
402
+ self .xml_doc .xpath (self .xpath , namespaces = self .namespaces )
394
403
395
404
self ._validate_path ()
396
405
self ._validate_names ()
@@ -527,12 +536,13 @@ def _validate_names(self) -> None:
527
536
f"{ type (self .names ).__name__ } is not a valid type for names"
528
537
)
529
538
530
- def _parse_doc (self , raw_doc ) -> bytes :
539
+ def _parse_doc (
540
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
541
+ ) -> _Element :
531
542
from lxml .etree import (
532
543
XMLParser ,
533
544
fromstring ,
534
545
parse ,
535
- tostring ,
536
546
)
537
547
538
548
handle_data = get_data_from_filepath (
@@ -557,9 +567,9 @@ def _parse_doc(self, raw_doc) -> bytes:
557
567
else :
558
568
doc = parse (xml_data , parser = curr_parser )
559
569
560
- return tostring ( doc )
570
+ return doc
561
571
562
- def _transform_doc (self ) -> bytes :
572
+ def _transform_doc (self ) -> _XSLTResultTree :
563
573
"""
564
574
Transform original tree using stylesheet.
565
575
@@ -572,7 +582,7 @@ def _transform_doc(self) -> bytes:
572
582
transformer = XSLT (self .xsl_doc )
573
583
new_doc = transformer (self .xml_doc )
574
584
575
- return bytes ( new_doc )
585
+ return new_doc
576
586
577
587
578
588
def get_data_from_filepath (
0 commit comments