Skip to content

ENH: Added lxml-liberal html parsing flavor #5131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ Improvements to existing features
- :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
from semi-structured JSON data. :ref:`See the docs<io.json_normalize>` (:issue:`1067`)
- ``DataFrame.from_records()`` will now accept generators (:issue:`4910`)

- Added ``lxml-liberal`` html parsing flavor (:issue:`5130`)

API Changes
~~~~~~~~~~~

Expand Down
64 changes: 44 additions & 20 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numbers
import collections
import warnings
from functools import partial

from distutils.version import LooseVersion

Expand Down Expand Up @@ -165,13 +166,12 @@ class _HtmlFrameParser(object):
See each method's respective documentation for details on their
functionality.
"""
def __init__(self, io, match, attrs):
self.io = io
def __init__(self, match, attrs):
self.match = match
self.attrs = attrs

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
def parse_tables(self, io):
tables = self._parse_tables(self._build_doc(io), self.match, self.attrs)
return (self._build_table(table) for table in tables)

def _parse_raw_data(self, rows):
Expand Down Expand Up @@ -314,7 +314,7 @@ def _parse_tfoot(self, table):
"""
raise NotImplementedError

def _build_doc(self):
def _build_doc(self, io):
"""Return a tree-like object that can be used to iterate over the DOM.

Returns
Expand Down Expand Up @@ -414,15 +414,15 @@ def _parse_tables(self, doc, match, attrs):
match.pattern)
return result

def _setup_build_doc(self):
raw_text = _read(self.io)
def _setup_build_doc(self, io):
raw_text = _read(io)
if not raw_text:
raise ValueError('No text parsed from document: %s' % self.io)
raise ValueError('No text parsed from document: %s' % io)
return raw_text

def _build_doc(self):
def _build_doc(self, io):
from bs4 import BeautifulSoup
return BeautifulSoup(self._setup_build_doc(), features='html5lib')
return BeautifulSoup(self._setup_build_doc(io), features='html5lib')


def _build_xpath_expr(attrs):
Expand Down Expand Up @@ -469,6 +469,8 @@ class _LxmlFrameParser(_HtmlFrameParser):
:class:`_HtmlFrameParser`.
"""
def __init__(self, *args, **kwargs):
self.strict = kwargs.pop('strict', True)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud Are you averse to my leaving this in? If you are then I can probably factor out HTMLParser(recover=False) to a new method like get_parser to make subclassing easier.


super(_LxmlFrameParser, self).__init__(*args, **kwargs)

def _text_getter(self, obj):
Expand Down Expand Up @@ -500,7 +502,7 @@ def _parse_tables(self, doc, match, kwargs):
raise ValueError("No tables found matching regex %r" % pattern)
return tables

def _build_doc(self):
def _build_doc(self, io):
"""
Raises
------
Expand All @@ -519,28 +521,28 @@ def _build_doc(self):
from lxml.html import parse, fromstring, HTMLParser
from lxml.etree import XMLSyntaxError

parser = HTMLParser(recover=False)
parser = HTMLParser(recover=not self.strict)

try:
# try to parse the input in the simplest way
r = parse(self.io, parser=parser)
r = parse(io, parser=parser)

try:
r = r.getroot()
except AttributeError:
pass
except (UnicodeDecodeError, IOError):
# if the input is a blob of html goop
if not _is_url(self.io):
r = fromstring(self.io, parser=parser)
if not _is_url(io):
r = fromstring(io, parser=parser)

try:
r = r.getroot()
except AttributeError:
pass
else:
# not a url
scheme = parse_url(self.io).scheme
scheme = parse_url(io).scheme
if scheme not in _valid_schemes:
# lxml can't parse it
msg = ('%r is not a valid url scheme, valid schemes are '
Expand Down Expand Up @@ -572,7 +574,7 @@ def _parse_raw_tfoot(self, table):
expr = './/tfoot//th'
return [_remove_whitespace(x.text_content()) for x in
table.xpath(expr)]


def _expand_elements(body):
lens = Series(lmap(len, body))
Expand Down Expand Up @@ -611,7 +613,8 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,

_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
'html5lib': _BeautifulSoupHtml5LibFrameParser,
'bs4': _BeautifulSoupHtml5LibFrameParser}
'bs4': _BeautifulSoupHtml5LibFrameParser,
'lxml-liberal': partial(_LxmlFrameParser, strict=False),}


def _parser_dispatch(flavor):
Expand Down Expand Up @@ -696,10 +699,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs)
p = parser(compiled_match, attrs)

try:
tables = p.parse_tables()
tables = p.parse_tables(io)
except Exception as caught:
retained = caught
else:
Expand Down Expand Up @@ -737,6 +740,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
each other, they are both there for backwards compatibility. The
default of ``None`` tries to use ``lxml`` to parse and if that fails it
falls back on ``bs4`` + ``html5lib``.
``lxml-liberal`` - uses the lxml parser in recovery mode: parse
errors pass silently, and whatever tables lxml manages to
recover from the document are returned.

header : int or list-like or None, optional
The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
Expand Down Expand Up @@ -816,6 +822,24 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

This function will *always* return a list of :class:`DataFrame` *or*
it will fail, e.g., it will *not* return an empty list.

lxml-liberal tries hard to parse through broken HTML.
It lets libxml2 try its best to return a valid HTML tree
with all content it can manage to parse.
It will not raise an exception on parser errors.
You should use libxml2 version 2.6.21 or newer
to take advantage of this feature.

The support for parsing broken HTML depends entirely on libxml2's
recovery algorithm.
It is not the fault of lxml if you find documents that
are so heavily broken that the parser cannot handle them.
There is also no guarantee that the resulting tree will
contain all data from the original document.
The parser may have to drop seriously broken parts when
struggling to keep parsing.
Especially misplaced meta tags can suffer from this,
which may lead to encoding problems.

Examples
--------
Expand Down
27 changes: 27 additions & 0 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,33 @@ def test_data_fail(self):
with tm.assertRaises(XMLSyntaxError):
self.read_html(banklist_data, flavor=['lxml'])

def test_lxml_liberal(self):
    """Smoke test for the ``lxml-liberal`` flavor.

    banklist.html is known to fail a strict lxml parse (see
    ``test_data_fail``, which expects ``XMLSyntaxError``); the liberal
    flavor should instead parse it without raising and yield
    ``DataFrame`` objects.
    """
    banklist_data = os.path.join(DATA_PATH, 'banklist.html')

    # Must not raise, unlike flavor=['lxml'] on the same file.
    dfs = self.read_html(banklist_data, flavor=['lxml-liberal'])
    for df in dfs:
        tm.assert_isinstance(df, DataFrame)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a pretty minimal test case, don't you have expectations for results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jtratner Well, TBH, the only guarantees that lxml makes with recover=True are that:

"There is also no guarantee that the resulting tree will
contain all data from the original document. "

and

"It will not raise an exception on parser errors. "

so I am not sure that I want to check anything other than no errors.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem is that using non-strict lxml will be missing a row ... i don't remember exactly which one but @cancan101 you could check for that ... a little annoying to figure out which one it is but shouldn't be too hard with this new functionality

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud What exactly should I put in the test? That the parse from bs4 != parse from lxml-liberal?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud bump

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can take a look later tonight

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cancan101

That the parse from bs4 != parse from lxml-liberal?

That's a possibility. I actually think because we don't know the specific reasons why lxml will remove things without delving deep into its parser, that just testing that a non-empty DataFrame is returned is okay.

So instead of just tm.assert_isinstance() do that plus add something like self.assertFalse(df.empty) and then add the bs4 comparison test and this should be okay

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud let me know if that looks okay

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cpcloud Anything else to get this merged?

self.assertFalse(df.empty)

@slow
def test_lxml_liberal2(self):
    """Check that the liberal lxml parse diverges from bs4 + html5lib.

    lxml's recovery mode is expected to drop some content from the
    broken banklist.html document that html5lib manages to keep, so
    the two flavors should NOT produce identical results.  The test
    fails only if every parsed table matches exactly.
    """
    _skip_if_no('bs4')
    banklist_data = os.path.join(DATA_PATH, 'banklist.html')

    dfs_lxml = self.read_html(banklist_data, flavor=['lxml-liberal'])
    dfs_bs4 = self.read_html(banklist_data, flavor=['bs4'])

    # A differing table count already proves the parses diverge.
    if len(dfs_lxml) == len(dfs_bs4):
        for left, right in zip(dfs_lxml, dfs_bs4):
            try:
                tm.assert_frame_equal(left, right)
            except AssertionError:
                break  # found a divergent table -- the parses differ
        else:
            self.fail()  # every table matched exactly

def test_works_on_valid_markup(self):
filename = os.path.join(DATA_PATH, 'valid_markup.html')
dfs = self.read_html(filename, index_col=0, flavor=['lxml'])
Expand Down