Skip to content

Commit 91e7e5c

Browse files
committed
ENH: Added lxml-liberal html parsing flavor (pandas-dev#5130)
1 parent bea5051 commit 91e7e5c

File tree

3 files changed

+55
-3
lines changed

3 files changed

+55
-3
lines changed

doc/source/release.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ Improvements to existing features
172172
- :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
173173
from semi-structured JSON data. :ref:`See the docs<io.json_normalize>` (:issue:`1067`)
174174
- ``DataFrame.from_records()`` will now accept generators (:issue:`4910`)
175-
175+
- Added ``lxml-liberal`` HTML parsing flavor, which lets lxml recover from malformed markup instead of raising a syntax error (:issue:`5130`)
176+
176177
API Changes
177178
~~~~~~~~~~~
178179

pandas/io/html.py

+46-2
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,8 @@ class _LxmlFrameParser(_HtmlFrameParser):
469469
:class:`_HtmlFrameParser`.
470470
"""
471471
def __init__(self, *args, **kwargs):
    # ``strict`` is specific to this parser, so it is removed from kwargs
    # before the remaining arguments are forwarded to the base class.
    # When the caller does not supply it, it defaults to True.
    strict_mode = kwargs.pop('strict', True)

    # Stored before delegating; ``_build_doc`` reads it to decide whether
    # lxml's HTMLParser runs with ``recover`` enabled.
    self.strict = strict_mode

    parent = super(_LxmlFrameParser, self)
    parent.__init__(*args, **kwargs)
473475

474476
def _text_getter(self, obj):
@@ -519,7 +521,7 @@ def _build_doc(self):
519521
from lxml.html import parse, fromstring, HTMLParser
520522
from lxml.etree import XMLSyntaxError
521523

522-
parser = HTMLParser(recover=False)
524+
parser = HTMLParser(recover=not self.strict)
523525

524526
try:
525527
# try to parse the input in the simplest way
@@ -572,8 +574,49 @@ def _parse_raw_tfoot(self, table):
572574
expr = './/tfoot//th'
573575
return [_remove_whitespace(x.text_content()) for x in
574576
table.xpath(expr)]
577+
575578

579+
class _LiberalLxmlFrameParser(_LxmlFrameParser):
    """HTML to DataFrame parser that uses lxml in recovery mode.

    Tries hard to parse through broken HTML instead of raising on the
    first syntax error.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _LxmlFrameParser
    _HtmlFrameParser
    _BeautifulSoupLxmlFrameParser

    Notes
    -----
    It lets libxml2 try its best to return a valid HTML tree
    with all content it can manage to parse.
    It will not raise an exception on parser errors.
    You should use libxml2 version 2.6.21 or newer
    to take advantage of this feature.

    The support for parsing broken HTML depends entirely on libxml2's
    recovery algorithm.
    It is not the fault of lxml if you find documents that
    are so heavily broken that the parser cannot handle them.
    There is also no guarantee that the resulting tree will
    contain all data from the original document.
    The parser may have to drop seriously broken parts when
    struggling to keep parsing.
    Especially misplaced meta tags can suffer from this,
    which may lead to encoding problems.

    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        """Build the parser with strict mode permanently disabled.

        Positional and keyword arguments are forwarded unchanged to
        :class:`_LxmlFrameParser`, except for ``strict``, which is always
        forced to ``False``.
        """
        # Set the flag in kwargs rather than passing ``strict=False`` next to
        # ``**kwargs``: the latter raises "got multiple values for keyword
        # argument 'strict'" if a caller happens to supply ``strict`` too.
        kwargs['strict'] = False
        super(_LiberalLxmlFrameParser, self).__init__(*args, **kwargs)
619+
577620
def _expand_elements(body):
578621
lens = Series(lmap(len, body))
579622
lens_max = lens.max()
@@ -611,7 +654,8 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,
611654

612655
# Registry mapping each accepted ``flavor`` value to the parser class that
# implements it.  ``None`` maps to the same class as 'lxml', and 'bs4' is an
# alias for the html5lib-backed BeautifulSoup parser.
_valid_parsers = {
    'lxml': _LxmlFrameParser,
    None: _LxmlFrameParser,
    'html5lib': _BeautifulSoupHtml5LibFrameParser,
    'bs4': _BeautifulSoupHtml5LibFrameParser,
    'lxml-liberal': _LiberalLxmlFrameParser,
}
615659

616660

617661
def _parser_dispatch(flavor):

pandas/io/tests/test_html.py

+7
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,13 @@ def test_data_fail(self):
606606
with tm.assertRaises(XMLSyntaxError):
607607
self.read_html(banklist_data, flavor=['lxml'])
608608

609+
def test_lxml_liberal(self):
    # Same fixture file name that the strict-flavor failure test uses;
    # presumably its markup is malformed enough to trip strict lxml.
    path = os.path.join(DATA_PATH, 'banklist.html')

    # With the liberal flavor the read must succeed, and every table it
    # yields must come back as a DataFrame.
    frames = self.read_html(path, flavor=['lxml-liberal'])
    for frame in frames:
        tm.assert_isinstance(frame, DataFrame)
615+
609616
def test_works_on_valid_markup(self):
610617
filename = os.path.join(DATA_PATH, 'valid_markup.html')
611618
dfs = self.read_html(filename, index_col=0, flavor=['lxml'])

0 commit comments

Comments
 (0)