From 91e7e5c89ce3306047d6c950ea804f8ad37c3cfd Mon Sep 17 00:00:00 2001 From: Alex Rothberg Date: Sun, 6 Oct 2013 16:45:10 -0400 Subject: [PATCH 1/3] ENH: Added lxml-liberal html parsing flavor (#5130) --- doc/source/release.rst | 3 ++- pandas/io/html.py | 48 ++++++++++++++++++++++++++++++++++-- pandas/io/tests/test_html.py | 7 ++++++ 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 8488d03f97cbd..c1a369ffa3ae4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -172,7 +172,8 @@ Improvements to existing features - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table from semi-structured JSON data. :ref:`See the docs` (:issue:`1067`) - ``DataFrame.from_records()`` will now accept generators (:issue:`4910`) - + - Added ``lxml-liberal`` html parsing flavor (:issue:`5130`) + API Changes ~~~~~~~~~~~ diff --git a/pandas/io/html.py b/pandas/io/html.py index 96bedbf390af6..b7607e37e26c0 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -469,6 +469,8 @@ class _LxmlFrameParser(_HtmlFrameParser): :class:`_HtmlFrameParser`. """ def __init__(self, *args, **kwargs): + self.strict = kwargs.pop('strict', True) + super(_LxmlFrameParser, self).__init__(*args, **kwargs) def _text_getter(self, obj): @@ -519,7 +521,7 @@ def _build_doc(self): from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError - parser = HTMLParser(recover=False) + parser = HTMLParser(recover=not self.strict) try: # try to parse the input in the simplest way @@ -572,8 +574,49 @@ def _parse_raw_tfoot(self, table): expr = './/tfoot//th' return [_remove_whitespace(x.text_content()) for x in table.xpath(expr)] + +class _LiberalLxmlFrameParser(_LxmlFrameParser): + """HTML to DataFrame parser that uses lxml under the hood. + + Tries hard to parse through broken XML. + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. 
+ + See Also + -------- + _LxmlFrameParser + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + It lets libxml2 try its best to return a valid HTML tree + with all content it can manage to parse. + It will not raise an exception on parser errors. + You should use libxml2 version 2.6.21 or newer + to take advantage of this feature. + + The support for parsing broken HTML depends entirely on libxml2's + recovery algorithm. + It is not the fault of lxml if you find documents that + are so heavily broken that the parser cannot handle them. + There is also no guarantee that the resulting tree will + contain all data from the original document. + The parser may have to drop seriously broken parts when + struggling to keep parsing. + Especially misplaced meta tags can suffer from this, + which may lead to encoding problems. + + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs): + super(_LiberalLxmlFrameParser, self).__init__(*args, strict=False, **kwargs) + def _expand_elements(body): lens = Series(lmap(len, body)) lens_max = lens.max() @@ -611,7 +654,8 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types, _valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser, 'html5lib': _BeautifulSoupHtml5LibFrameParser, - 'bs4': _BeautifulSoupHtml5LibFrameParser} + 'bs4': _BeautifulSoupHtml5LibFrameParser, + 'lxml-liberal': _LiberalLxmlFrameParser,} def _parser_dispatch(flavor): diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 9b0fb1cacfb65..eb34ff7ed3b1e 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -606,6 +606,13 @@ def test_data_fail(self): with tm.assertRaises(XMLSyntaxError): self.read_html(banklist_data, flavor=['lxml']) + def test_lxml_liberal(self): + banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + dfs = self.read_html(banklist_data, flavor=['lxml-liberal']) + for df in 
dfs: + tm.assert_isinstance(df, DataFrame) + def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') dfs = self.read_html(filename, index_col=0, flavor=['lxml']) From 46c3fe82c00bdede96f4c7539cbe9ff7a2004032 Mon Sep 17 00:00:00 2001 From: Alex Rothberg Date: Sun, 6 Oct 2013 22:01:27 -0400 Subject: [PATCH 2/3] - Changed API for HTML parsers - Use partial for lxml-liberal rather than a new class --- pandas/io/html.py | 98 +++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index b7607e37e26c0..f0cc5de7b2a32 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,6 +8,7 @@ import numbers import collections import warnings +from functools import partial from distutils.version import LooseVersion @@ -165,13 +166,12 @@ class _HtmlFrameParser(object): See each method's respective documentation for details on their functionality. """ - def __init__(self, io, match, attrs): - self.io = io + def __init__(self, match, attrs): self.match = match self.attrs = attrs - def parse_tables(self): - tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + def parse_tables(self, io): + tables = self._parse_tables(self._build_doc(io), self.match, self.attrs) return (self._build_table(table) for table in tables) def _parse_raw_data(self, rows): @@ -314,7 +314,7 @@ def _parse_tfoot(self, table): """ raise NotImplementedError - def _build_doc(self): + def _build_doc(self, io): """Return a tree-like object that can be used to iterate over the DOM. 
Returns @@ -414,15 +414,15 @@ def _parse_tables(self, doc, match, attrs): match.pattern) return result - def _setup_build_doc(self): - raw_text = _read(self.io) + def _setup_build_doc(self, io): + raw_text = _read(io) if not raw_text: - raise ValueError('No text parsed from document: %s' % self.io) + raise ValueError('No text parsed from document: %s' % io) return raw_text - def _build_doc(self): + def _build_doc(self, io): from bs4 import BeautifulSoup - return BeautifulSoup(self._setup_build_doc(), features='html5lib') + return BeautifulSoup(self._setup_build_doc(io), features='html5lib') def _build_xpath_expr(attrs): @@ -502,7 +502,7 @@ def _parse_tables(self, doc, match, kwargs): raise ValueError("No tables found matching regex %r" % pattern) return tables - def _build_doc(self): + def _build_doc(self, io): """ Raises ------ @@ -525,7 +525,7 @@ def _build_doc(self): try: # try to parse the input in the simplest way - r = parse(self.io, parser=parser) + r = parse(io, parser=parser) try: r = r.getroot() @@ -533,8 +533,8 @@ def _build_doc(self): pass except (UnicodeDecodeError, IOError): # if the input is a blob of html goop - if not _is_url(self.io): - r = fromstring(self.io, parser=parser) + if not _is_url(io): + r = fromstring(io, parser=parser) try: r = r.getroot() @@ -542,7 +542,7 @@ def _build_doc(self): pass else: # not a url - scheme = parse_url(self.io).scheme + scheme = parse_url(io).scheme if scheme not in _valid_schemes: # lxml can't parse it msg = ('%r is not a valid url scheme, valid schemes are ' @@ -576,47 +576,6 @@ def _parse_raw_tfoot(self, table): table.xpath(expr)] -class _LiberalLxmlFrameParser(_LxmlFrameParser): - """HTML to DataFrame parser that uses lxml under the hood. - - Tries hard to parse through broken XML. - - Warning - ------- - This parser can only handle HTTP, FTP, and FILE urls. 
- - See Also - -------- - _LxmlFrameParser - _HtmlFrameParser - _BeautifulSoupLxmlFrameParser - - Notes - ----- - It lets libxml2 try its best to return a valid HTML tree - with all content it can manage to parse. - It will not raise an exception on parser errors. - You should use libxml2 version 2.6.21 or newer - to take advantage of this feature. - - The support for parsing broken HTML depends entirely on libxml2's - recovery algorithm. - It is not the fault of lxml if you find documents that - are so heavily broken that the parser cannot handle them. - There is also no guarantee that the resulting tree will - contain all data from the original document. - The parser may have to drop seriously broken parts when - struggling to keep parsing. - Especially misplaced meta tags can suffer from this, - which may lead to encoding problems. - - Documentation strings for this class are in the base class - :class:`_HtmlFrameParser`. - """ - - def __init__(self, *args, **kwargs): - super(_LiberalLxmlFrameParser, self).__init__(*args, strict=False, **kwargs) - def _expand_elements(body): lens = Series(lmap(len, body)) lens_max = lens.max() @@ -655,7 +614,7 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types, _valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser, 'html5lib': _BeautifulSoupHtml5LibFrameParser, 'bs4': _BeautifulSoupHtml5LibFrameParser, - 'lxml-liberal': _LiberalLxmlFrameParser,} + 'lxml-liberal': partial(_LxmlFrameParser, strict=False),} def _parser_dispatch(flavor): @@ -740,10 +699,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs) + p = parser(compiled_match, attrs) try: - tables = p.parse_tables() + tables = p.parse_tables(io) except Exception as caught: retained = caught else: @@ -781,6 +740,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, each other, they are both 
there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. + ``lxml-liberal`` - uses lxml parser but allows errors + to pass silently and then returns what it can from the parsed tables + that lxml is able to find. header : int or list-like or None, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to @@ -860,6 +822,24 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This function will *always* return a list of :class:`DataFrame` *or* it will fail, e.g., it will *not* return an empty list. + + lxml-liberal tries hard to parse through broken HTML. + It lets libxml2 try its best to return a valid HTML tree + with all content it can manage to parse. + It will not raise an exception on parser errors. + You should use libxml2 version 2.6.21 or newer + to take advantage of this feature. + + The support for parsing broken HTML depends entirely on libxml2's + recovery algorithm. + It is not the fault of lxml if you find documents that + are so heavily broken that the parser cannot handle them. + There is also no guarantee that the resulting tree will + contain all data from the original document. + The parser may have to drop seriously broken parts when + struggling to keep parsing. + Especially misplaced meta tags can suffer from this, + which may lead to encoding problems. 
Examples -------- From daad56198af978a2b548efa992ab53e8c41fd903 Mon Sep 17 00:00:00 2001 From: Alex Rothberg Date: Wed, 9 Oct 2013 00:25:51 -0400 Subject: [PATCH 3/3] Additional tests --- pandas/io/tests/test_html.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index eb34ff7ed3b1e..7415e33c1ece0 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -612,6 +612,26 @@ def test_lxml_liberal(self): dfs = self.read_html(banklist_data, flavor=['lxml-liberal']) for df in dfs: tm.assert_isinstance(df, DataFrame) + self.assertFalse(df.empty) + + @slow + def test_lxml_liberal2(self): + _skip_if_no('bs4') + banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + dfs_lxml = self.read_html(banklist_data, flavor=['lxml-liberal']) + dfs_bs4 = self.read_html(banklist_data, flavor=['bs4']) + + if len(dfs_lxml) != len(dfs_bs4): + return + + for df_lxml,df_bs4 in zip(dfs_lxml, dfs_bs4): + try: + tm.assert_frame_equal(df_lxml,df_bs4) + except AssertionError: + return + + self.fail() def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html')