@@ -469,6 +469,8 @@ class _LxmlFrameParser(_HtmlFrameParser):
469
469
:class:`_HtmlFrameParser`.
470
470
"""
471
471
def __init__(self, *args, **kwargs):
    """Initialize the parser, consuming the optional ``strict`` keyword.

    Parameters
    ----------
    strict : bool, optional
        Stored on the instance as ``self.strict``; defaults to True when
        the keyword is not supplied.
    *args, **kwargs
        Everything else is handed on unchanged to the parent initializer.
    """
    # Take ``strict`` out of kwargs first so that it is not forwarded to
    # the parent initializer along with the remaining keyword arguments.
    strict = kwargs.pop('strict', True)
    self.strict = strict
    super(_LxmlFrameParser, self).__init__(*args, **kwargs)
473
475
474
476
def _text_getter (self , obj ):
@@ -519,7 +521,7 @@ def _build_doc(self):
519
521
from lxml .html import parse , fromstring , HTMLParser
520
522
from lxml .etree import XMLSyntaxError
521
523
522
- parser = HTMLParser (recover = False )
524
+ parser = HTMLParser (recover = not self . strict )
523
525
524
526
try :
525
527
# try to parse the input in the simplest way
@@ -572,8 +574,49 @@ def _parse_raw_tfoot(self, table):
572
574
expr = './/tfoot//th'
573
575
return [_remove_whitespace (x .text_content ()) for x in
574
576
table .xpath (expr )]
577
+
575
578
579
class _LiberalLxmlFrameParser(_LxmlFrameParser):
    """HTML to DataFrame parser that uses lxml under the hood.

    Tries hard to parse through broken HTML.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _LxmlFrameParser
    _HtmlFrameParser
    _BeautifulSoupLxmlFrameParser

    Notes
    -----
    This class always initializes its base class with ``strict=False``.

    It lets libxml2 try its best to return a valid HTML tree
    with all content it can manage to parse.
    It will not raise an exception on parser errors.
    You should use libxml2 version 2.6.21 or newer
    to take advantage of this feature.

    The support for parsing broken HTML depends entirely on libxml2's
    recovery algorithm.
    It is not the fault of lxml if you find documents that
    are so heavily broken that the parser cannot handle them.
    There is also no guarantee that the resulting tree will
    contain all data from the original document.
    The parser may have to drop seriously broken parts when
    struggling to keep parsing.
    Especially misplaced meta tags can suffer from this,
    which may lead to encoding problems.

    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the base parser with ``strict`` forced to False.

        Any ``strict`` value supplied by the caller is overridden; all
        other positional and keyword arguments are passed through.
        """
        # Assign into kwargs rather than writing ``strict=False`` next to
        # ``**kwargs`` in the call: the latter raises TypeError (multiple
        # values for keyword argument 'strict') if a caller passes
        # ``strict`` explicitly.
        kwargs['strict'] = False
        super(_LiberalLxmlFrameParser, self).__init__(*args, **kwargs)
619
+
577
620
def _expand_elements (body ):
578
621
lens = Series (lmap (len , body ))
579
622
lens_max = lens .max ()
@@ -611,7 +654,8 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,
611
654
612
655
# Registry mapping a parser name to the frame-parser class that handles it.
# ``None`` resolves to the same class as 'lxml', and 'html5lib' and 'bs4'
# share one class. Presumably keyed by the ``flavor`` argument of
# ``_parser_dispatch`` -- confirm against that function.
_valid_parsers = {
    'lxml': _LxmlFrameParser,
    None: _LxmlFrameParser,
    'html5lib': _BeautifulSoupHtml5LibFrameParser,
    'bs4': _BeautifulSoupHtml5LibFrameParser,
    'lxml-liberal': _LiberalLxmlFrameParser,
}
615
659
616
660
617
661
def _parser_dispatch (flavor ):
0 commit comments