7
7
import io
8
8
from typing import (
9
9
Any ,
10
+ Callable ,
10
11
Sequence ,
11
12
)
12
13
@@ -177,7 +178,7 @@ def parse_data(self) -> list[dict[str, str | None]]:
177
178
178
179
raise AbstractMethodError (self )
179
180
180
- def _parse_nodes (self ) -> list [dict [str , str | None ]]:
181
+ def _parse_nodes (self , elems : list [ Any ] ) -> list [dict [str , str | None ]]:
181
182
"""
182
183
Parse xml nodes.
183
184
@@ -197,102 +198,6 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
197
198
will have optional keys filled with None values.
198
199
"""
199
200
200
- raise AbstractMethodError (self )
201
-
202
- def _iterparse_nodes (self ) -> list [dict [str , str | None ]]:
203
- """
204
- Iterparse xml nodes.
205
-
206
- This method will read in local disk, decompressed XML files for elements
207
- and underlying descendants using iterparse, a method to iterate through
208
- an XML tree without holding entire XML tree in memory.
209
-
210
- Raises
211
- ------
212
- TypeError
213
- * If `iterparse` is not a dict or its dict value is not list-like.
214
- ParserError
215
- * If `path_or_buffer` is not a physical, decompressed file on disk.
216
- * If no data is returned from selected items in `iterparse`.
217
-
218
- Notes
219
- -----
220
- Namespace URIs will be removed from return node values. Also,
221
- elements with missing children or attributes in submitted list
222
- will have optional keys filled with None values.
223
- """
224
-
225
- raise AbstractMethodError (self )
226
-
227
- def _validate_path (self ) -> None :
228
- """
229
- Validate xpath.
230
-
231
- This method checks for syntax, evaluation, or empty nodes return.
232
-
233
- Raises
234
- ------
235
- SyntaxError
236
- * If xpah is not supported or issues with namespaces.
237
-
238
- ValueError
239
- * If xpah does not return any nodes.
240
- """
241
-
242
- raise AbstractMethodError (self )
243
-
244
- def _validate_names (self ) -> None :
245
- """
246
- Validate names.
247
-
248
- This method will check if names is a list-like and aligns
249
- with length of parse nodes.
250
-
251
- Raises
252
- ------
253
- ValueError
254
- * If value is not a list and less then length of nodes.
255
- """
256
- raise AbstractMethodError (self )
257
-
258
- def _parse_doc (self , raw_doc ) -> bytes :
259
- """
260
- Build tree from path_or_buffer.
261
-
262
- This method will parse XML object into tree
263
- either from string/bytes or file location.
264
- """
265
- raise AbstractMethodError (self )
266
-
267
-
268
- class _EtreeFrameParser (_XMLFrameParser ):
269
- """
270
- Internal class to parse XML into DataFrames with the Python
271
- standard library XML module: `xml.etree.ElementTree`.
272
- """
273
-
274
- def parse_data (self ) -> list [dict [str , str | None ]]:
275
- from xml .etree .ElementTree import XML
276
-
277
- if self .stylesheet is not None :
278
- raise ValueError (
279
- "To use stylesheet, you need lxml installed and selected as parser."
280
- )
281
-
282
- if self .iterparse is None :
283
- self .xml_doc = XML (self ._parse_doc (self .path_or_buffer ))
284
- self ._validate_path ()
285
-
286
- self ._validate_names ()
287
-
288
- xml_dicts : list [dict [str , str | None ]] = (
289
- self ._parse_nodes () if self .iterparse is None else self ._iterparse_nodes ()
290
- )
291
-
292
- return xml_dicts
293
-
294
- def _parse_nodes (self ) -> list [dict [str , str | None ]]:
295
- elems = self .xml_doc .findall (self .xpath , namespaces = self .namespaces )
296
201
dicts : list [dict [str , str | None ]]
297
202
298
203
if self .elems_only and self .attrs_only :
@@ -375,8 +280,28 @@ def _parse_nodes(self) -> list[dict[str, str | None]]:
375
280
376
281
return dicts
377
282
378
- def _iterparse_nodes (self ) -> list [dict [str , str | None ]]:
379
- from xml .etree .ElementTree import iterparse
283
+ def _iterparse_nodes (self , iterparse : Callable ) -> list [dict [str , str | None ]]:
284
+ """
285
+ Iterparse xml nodes.
286
+
287
+ This method will read in local disk, decompressed XML files for elements
288
+ and underlying descendants using iterparse, a method to iterate through
289
+ an XML tree without holding entire XML tree in memory.
290
+
291
+ Raises
292
+ ------
293
+ TypeError
294
+ * If `iterparse` is not a dict or its dict value is not list-like.
295
+ ParserError
296
+ * If `path_or_buffer` is not a physical, decompressed file on disk.
297
+ * If no data is returned from selected items in `iterparse`.
298
+
299
+ Notes
300
+ -----
301
+ Namespace URIs will be removed from return node values. Also,
302
+ elements with missing children or attributes in submitted list
303
+ will have optional keys filled with None values.
304
+ """
380
305
381
306
dicts : list [dict [str , str | None ]] = []
382
307
row : dict [str , str | None ] | None = None
@@ -433,7 +358,13 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
433
358
if curr_elem == row_node and row is not None :
434
359
dicts .append (row )
435
360
row = None
361
+
436
362
elem .clear ()
363
+ if hasattr (elem , "getprevious" ):
364
+ while (
365
+ elem .getprevious () is not None and elem .getparent () is not None
366
+ ):
367
+ del elem .getparent ()[0 ]
437
368
438
369
if dicts == []:
439
370
raise ParserError ("No result from selected items in iterparse." )
@@ -446,6 +377,81 @@ def _iterparse_nodes(self) -> list[dict[str, str | None]]:
446
377
447
378
return dicts
448
379
380
+ def _validate_path (self ) -> None :
381
+ """
382
+ Validate xpath.
383
+
384
+ This method checks for syntax, evaluation, or empty nodes return.
385
+
386
+ Raises
387
+ ------
388
+ SyntaxError
389
+ * If xpath is not supported or issues with namespaces.
390
+
391
+ ValueError
392
+ * If xpath does not return any nodes.
393
+ """
394
+
395
+ raise AbstractMethodError (self )
396
+
397
+ def _validate_names (self ) -> None :
398
+ """
399
+ Validate names.
400
+
401
+ This method will check if names is a list-like and aligns
402
+ with length of parse nodes.
403
+
404
+ Raises
405
+ ------
406
+ ValueError
407
+ * If value is not a list and less than length of nodes.
408
+ """
409
+ raise AbstractMethodError (self )
410
+
411
+ def _parse_doc (
412
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
413
+ ) -> bytes :
414
+ """
415
+ Build tree from path_or_buffer.
416
+
417
+ This method will parse XML object into tree
418
+ either from string/bytes or file location.
419
+ """
420
+ raise AbstractMethodError (self )
421
+
422
+
423
+ class _EtreeFrameParser (_XMLFrameParser ):
424
+ """
425
+ Internal class to parse XML into DataFrames with the Python
426
+ standard library XML module: `xml.etree.ElementTree`.
427
+ """
428
+
429
+ def parse_data (self ) -> list [dict [str , str | None ]]:
430
+ from xml .etree .ElementTree import (
431
+ XML ,
432
+ iterparse ,
433
+ )
434
+
435
+ if self .stylesheet is not None :
436
+ raise ValueError (
437
+ "To use stylesheet, you need lxml installed and selected as parser."
438
+ )
439
+
440
+ if self .iterparse is None :
441
+ self .xml_doc = XML (self ._parse_doc (self .path_or_buffer ))
442
+ self ._validate_path ()
443
+ elems = self .xml_doc .findall (self .xpath , namespaces = self .namespaces )
444
+
445
+ self ._validate_names ()
446
+
447
+ xml_dicts : list [dict [str , str | None ]] = (
448
+ self ._parse_nodes (elems )
449
+ if self .iterparse is None
450
+ else self ._iterparse_nodes (iterparse )
451
+ )
452
+
453
+ return xml_dicts
454
+
449
455
def _validate_path (self ) -> None :
450
456
"""
451
457
Notes
@@ -495,7 +501,9 @@ def _validate_names(self) -> None:
495
501
f"{ type (self .names ).__name__ } is not a valid type for names"
496
502
)
497
503
498
- def _parse_doc (self , raw_doc ) -> bytes :
504
+ def _parse_doc (
505
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
506
+ ) -> bytes :
499
507
from xml .etree .ElementTree import (
500
508
XMLParser ,
501
509
parse ,
@@ -531,7 +539,10 @@ def parse_data(self) -> list[dict[str, str | None]]:
531
539
validate xpath, names, optionally parse and run XSLT,
532
540
and parse original or transformed XML and return specific nodes.
533
541
"""
534
- from lxml .etree import XML
542
+ from lxml .etree import (
543
+ XML ,
544
+ iterparse ,
545
+ )
535
546
536
547
if self .iterparse is None :
537
548
self .xml_doc = XML (self ._parse_doc (self .path_or_buffer ))
@@ -541,172 +552,18 @@ def parse_data(self) -> list[dict[str, str | None]]:
541
552
self .xml_doc = XML (self ._transform_doc ())
542
553
543
554
self ._validate_path ()
555
+ elems = self .xml_doc .xpath (self .xpath , namespaces = self .namespaces )
544
556
545
557
self ._validate_names ()
546
558
547
559
xml_dicts : list [dict [str , str | None ]] = (
548
- self ._parse_nodes () if self .iterparse is None else self ._iterparse_nodes ()
560
+ self ._parse_nodes (elems )
561
+ if self .iterparse is None
562
+ else self ._iterparse_nodes (iterparse )
549
563
)
550
564
551
565
return xml_dicts
552
566
553
- def _parse_nodes (self ) -> list [dict [str , str | None ]]:
554
- elems = self .xml_doc .xpath (self .xpath , namespaces = self .namespaces )
555
- dicts : list [dict [str , str | None ]]
556
-
557
- if self .elems_only and self .attrs_only :
558
- raise ValueError ("Either element or attributes can be parsed not both." )
559
-
560
- elif self .elems_only :
561
- if self .names :
562
- dicts = [
563
- {
564
- ** (
565
- {el .tag : el .text .strip ()}
566
- if el .text and not el .text .isspace ()
567
- else {}
568
- ),
569
- ** {
570
- nm : ch .text .strip () if ch .text else None
571
- for nm , ch in zip (self .names , el .xpath ("*" ))
572
- },
573
- }
574
- for el in elems
575
- ]
576
- else :
577
- dicts = [
578
- {
579
- ch .tag : ch .text .strip () if ch .text else None
580
- for ch in el .xpath ("*" )
581
- }
582
- for el in elems
583
- ]
584
-
585
- elif self .attrs_only :
586
- dicts = [el .attrib for el in elems ]
587
-
588
- else :
589
- if self .names :
590
- dicts = [
591
- {
592
- ** el .attrib ,
593
- ** (
594
- {el .tag : el .text .strip ()}
595
- if el .text and not el .text .isspace ()
596
- else {}
597
- ),
598
- ** {
599
- nm : ch .text .strip () if ch .text else None
600
- for nm , ch in zip (self .names , el .xpath ("*" ))
601
- },
602
- }
603
- for el in elems
604
- ]
605
- else :
606
- dicts = [
607
- {
608
- ** el .attrib ,
609
- ** (
610
- {el .tag : el .text .strip ()}
611
- if el .text and not el .text .isspace ()
612
- else {}
613
- ),
614
- ** {
615
- ch .tag : ch .text .strip () if ch .text else None
616
- for ch in el .xpath ("*" )
617
- },
618
- }
619
- for el in elems
620
- ]
621
-
622
- if self .namespaces or "}" in list (dicts [0 ].keys ())[0 ]:
623
- dicts = [
624
- {k .split ("}" )[1 ] if "}" in k else k : v for k , v in d .items ()}
625
- for d in dicts
626
- ]
627
-
628
- keys = list (dict .fromkeys ([k for d in dicts for k in d .keys ()]))
629
- dicts = [{k : d [k ] if k in d .keys () else None for k in keys } for d in dicts ]
630
-
631
- if self .names :
632
- dicts = [{nm : v for nm , v in zip (self .names , d .values ())} for d in dicts ]
633
-
634
- return dicts
635
-
636
- def _iterparse_nodes (self ) -> list [dict [str , str | None ]]:
637
- from lxml .etree import iterparse
638
-
639
- dicts : list [dict [str , str | None ]] = []
640
- row : dict [str , str | None ] | None = None
641
-
642
- if not isinstance (self .iterparse , dict ):
643
- raise TypeError (
644
- f"{ type (self .iterparse ).__name__ } is not a valid type for iterparse"
645
- )
646
-
647
- row_node = next (iter (self .iterparse .keys ())) if self .iterparse else ""
648
- if not is_list_like (self .iterparse [row_node ]):
649
- raise TypeError (
650
- f"{ type (self .iterparse [row_node ])} is not a valid type "
651
- "for value in iterparse"
652
- )
653
-
654
- if (
655
- not isinstance (self .path_or_buffer , str )
656
- or is_url (self .path_or_buffer )
657
- or is_fsspec_url (self .path_or_buffer )
658
- or self .path_or_buffer .startswith (("<?xml" , "<" ))
659
- or infer_compression (self .path_or_buffer , "infer" ) is not None
660
- ):
661
- raise ParserError (
662
- "iterparse is designed for large XML files that are fully extracted on "
663
- "local disk and not as compressed files or online sources."
664
- )
665
-
666
- for event , elem in iterparse (self .path_or_buffer , events = ("start" , "end" )):
667
- curr_elem = elem .tag .split ("}" )[1 ] if "}" in elem .tag else elem .tag
668
-
669
- if event == "start" :
670
- if curr_elem == row_node :
671
- row = {}
672
-
673
- if row is not None :
674
- if self .names :
675
- for col , nm in zip (self .iterparse [row_node ], self .names ):
676
- if curr_elem == col :
677
- elem_val = elem .text .strip () if elem .text else None
678
- if elem_val not in row .values () and nm not in row :
679
- row [nm ] = elem_val
680
- if col in elem .attrib :
681
- if elem .attrib [col ] not in row .values () and nm not in row :
682
- row [nm ] = elem .attrib [col ]
683
- else :
684
- for col in self .iterparse [row_node ]:
685
- if curr_elem == col :
686
- row [col ] = elem .text .strip () if elem .text else None
687
- if col in elem .attrib :
688
- row [col ] = elem .attrib [col ]
689
-
690
- if event == "end" :
691
- if curr_elem == row_node and row is not None :
692
- dicts .append (row )
693
- row = None
694
-
695
- elem .clear ()
696
- while elem .getprevious () is not None and elem .getparent () is not None :
697
- del elem .getparent ()[0 ]
698
-
699
- if dicts == []:
700
- raise ParserError ("No result from selected items in iterparse." )
701
-
702
- keys = list (dict .fromkeys ([k for d in dicts for k in d .keys ()]))
703
- dicts = [{k : d [k ] if k in d .keys () else None for k in keys } for d in dicts ]
704
-
705
- if self .names :
706
- dicts = [{nm : v for nm , v in zip (self .names , d .values ())} for d in dicts ]
707
-
708
- return dicts
709
-
710
567
def _validate_path (self ) -> None :
711
568
712
569
msg = (
@@ -748,7 +605,9 @@ def _validate_names(self) -> None:
748
605
f"{ type (self .names ).__name__ } is not a valid type for names"
749
606
)
750
607
751
- def _parse_doc (self , raw_doc ) -> bytes :
608
+ def _parse_doc (
609
+ self , raw_doc : FilePath | ReadBuffer [bytes ] | ReadBuffer [str ]
610
+ ) -> bytes :
752
611
from lxml .etree import (
753
612
XMLParser ,
754
613
fromstring ,
0 commit comments