@@ -111,7 +111,7 @@ def __init__(
111
111
stylesheet ,
112
112
compression ,
113
113
storage_options ,
114
- ):
114
+ ) -> None :
115
115
self .path_or_buffer = path_or_buffer
116
116
self .xpath = xpath
117
117
self .namespaces = namespaces
@@ -187,14 +187,13 @@ def _validate_names(self) -> None:
187
187
"""
188
188
raise AbstractMethodError (self )
189
189
190
- def _parse_doc (self ) :
190
+ def _parse_doc (self , raw_doc ) -> bytes :
191
191
"""
192
- Build tree from io .
192
+ Build tree from path_or_buffer .
193
193
194
- This method will parse io object into tree for parsing
195
- conditionally by its specific object type .
194
+ This method will parse XML object into tree
195
+ either from string/bytes or file location .
196
196
"""
197
-
198
197
raise AbstractMethodError (self )
199
198
200
199
@@ -204,22 +203,18 @@ class _EtreeFrameParser(_XMLFrameParser):
204
203
standard library XML module: `xml.etree.ElementTree`.
205
204
"""
206
205
207
- from xml .etree .ElementTree import (
208
- Element ,
209
- ElementTree ,
210
- )
211
-
212
- def __init__ (self , * args , ** kwargs ):
206
+ def __init__ (self , * args , ** kwargs ) -> None :
213
207
super ().__init__ (* args , ** kwargs )
214
208
215
209
def parse_data (self ) -> List [Dict [str , Optional [str ]]]:
210
+ from xml .etree .ElementTree import XML
216
211
217
212
if self .stylesheet is not None :
218
213
raise ValueError (
219
214
"To use stylesheet, you need lxml installed and selected as parser."
220
215
)
221
216
222
- self .xml_doc = self ._parse_doc ()
217
+ self .xml_doc = XML ( self ._parse_doc (self . path_or_buffer ) )
223
218
224
219
self ._validate_path ()
225
220
self ._validate_names ()
@@ -356,14 +351,15 @@ def _validate_names(self) -> None:
356
351
f"{ type (self .names ).__name__ } is not a valid type for names"
357
352
)
358
353
359
- def _parse_doc (self ) -> Union [ Element , ElementTree ] :
354
+ def _parse_doc (self , raw_doc ) -> bytes :
360
355
from xml .etree .ElementTree import (
361
356
XMLParser ,
362
357
parse ,
358
+ tostring ,
363
359
)
364
360
365
361
handle_data = get_data_from_filepath (
366
- filepath_or_buffer = self . path_or_buffer ,
362
+ filepath_or_buffer = raw_doc ,
367
363
encoding = self .encoding ,
368
364
compression = self .compression ,
369
365
storage_options = self .storage_options ,
@@ -373,7 +369,7 @@ def _parse_doc(self) -> Union[Element, ElementTree]:
373
369
curr_parser = XMLParser (encoding = self .encoding )
374
370
r = parse (xml_data , parser = curr_parser )
375
371
376
- return r
372
+ return tostring ( r . getroot ())
377
373
378
374
379
375
class _LxmlFrameParser (_XMLFrameParser ):
@@ -383,7 +379,7 @@ class _LxmlFrameParser(_XMLFrameParser):
383
379
XPath 1.0 and XSLT 1.0.
384
380
"""
385
381
386
- def __init__ (self , * args , ** kwargs ):
382
+ def __init__ (self , * args , ** kwargs ) -> None :
387
383
super ().__init__ (* args , ** kwargs )
388
384
389
385
def parse_data (self ) -> List [Dict [str , Optional [str ]]]:
@@ -394,12 +390,13 @@ def parse_data(self) -> List[Dict[str, Optional[str]]]:
394
390
validate xpath, names, optionally parse and run XSLT,
395
391
and parse original or transformed XML and return specific nodes.
396
392
"""
393
+ from lxml .etree import XML
397
394
398
- self .xml_doc = self ._parse_doc (self .path_or_buffer )
395
+ self .xml_doc = XML ( self ._parse_doc (self .path_or_buffer ) )
399
396
400
397
if self .stylesheet is not None :
401
- self .xsl_doc = self ._parse_doc (self .stylesheet )
402
- self .xml_doc = self ._transform_doc ()
398
+ self .xsl_doc = XML ( self ._parse_doc (self .stylesheet ) )
399
+ self .xml_doc = XML ( self ._transform_doc () )
403
400
404
401
self ._validate_path ()
405
402
self ._validate_names ()
@@ -491,21 +488,6 @@ def _parse_nodes(self) -> List[Dict[str, Optional[str]]]:
491
488
492
489
return dicts
493
490
494
- def _transform_doc (self ):
495
- """
496
- Transform original tree using stylesheet.
497
-
498
- This method will transform original xml using XSLT script into
499
- am ideally flatter xml document for easier parsing and migration
500
- to Data Frame.
501
- """
502
- from lxml .etree import XSLT
503
-
504
- transformer = XSLT (self .xsl_doc )
505
- new_doc = transformer (self .xml_doc )
506
-
507
- return new_doc
508
-
509
491
def _validate_path (self ) -> None :
510
492
511
493
msg = (
@@ -553,11 +535,12 @@ def _validate_names(self) -> None:
553
535
f"{ type (self .names ).__name__ } is not a valid type for names"
554
536
)
555
537
556
- def _parse_doc (self , raw_doc ):
538
+ def _parse_doc (self , raw_doc ) -> bytes :
557
539
from lxml .etree import (
558
540
XMLParser ,
559
541
fromstring ,
560
542
parse ,
543
+ tostring ,
561
544
)
562
545
563
546
handle_data = get_data_from_filepath (
@@ -577,7 +560,22 @@ def _parse_doc(self, raw_doc):
577
560
else :
578
561
doc = parse (xml_data , parser = curr_parser )
579
562
580
- return doc
563
+ return tostring (doc )
564
+
565
+ def _transform_doc (self ) -> bytes :
566
+ """
567
+ Transform original tree using stylesheet.
568
+
569
+ This method will transform original xml using XSLT script into
570
+ an ideally flatter xml document for easier parsing and migration
571
+ to Data Frame.
572
+ """
573
+ from lxml .etree import XSLT
574
+
575
+ transformer = XSLT (self .xsl_doc )
576
+ new_doc = transformer (self .xml_doc )
577
+
578
+ return bytes (new_doc )
581
579
582
580
583
581
def get_data_from_filepath (
@@ -695,6 +693,7 @@ def _parse(
695
693
"""
696
694
697
695
lxml = import_optional_dependency ("lxml.etree" , errors = "ignore" )
696
+
698
697
p : Union [_EtreeFrameParser , _LxmlFrameParser ]
699
698
700
699
if parser == "lxml" :
0 commit comments