From 91e7e5c89ce3306047d6c950ea804f8ad37c3cfd Mon Sep 17 00:00:00 2001
From: Alex Rothberg <agrothberg@gmail.com>
Date: Sun, 6 Oct 2013 16:45:10 -0400
Subject: [PATCH 1/3] ENH: Added lxml-liberal html parsing flavor (#5130)

---
 doc/source/release.rst       |  3 ++-
 pandas/io/html.py            | 48 ++++++++++++++++++++++++++++++++++--
 pandas/io/tests/test_html.py |  7 ++++++
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 8488d03f97cbd..c1a369ffa3ae4 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -172,7 +172,8 @@ Improvements to existing features
   - :meth:`~pandas.io.json.json_normalize` is a new method to allow you to create a flat table
     from semi-structured JSON data. :ref:`See the docs<io.json_normalize>` (:issue:`1067`)
   - ``DataFrame.from_records()`` will now accept generators (:issue:`4910`)
-
+  - Added ``lxml-liberal`` HTML parsing flavor to ``read_html`` (:issue:`5130`)
+  
 API Changes
 ~~~~~~~~~~~
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 96bedbf390af6..b7607e37e26c0 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -469,6 +469,8 @@ class _LxmlFrameParser(_HtmlFrameParser):
     :class:`_HtmlFrameParser`.
     """
     def __init__(self, *args, **kwargs):
+        self.strict = kwargs.pop('strict', True)
+
         super(_LxmlFrameParser, self).__init__(*args, **kwargs)
 
     def _text_getter(self, obj):
@@ -519,7 +521,7 @@ def _build_doc(self):
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
 
-        parser = HTMLParser(recover=False)
+        parser = HTMLParser(recover=not self.strict)
 
         try:
             # try to parse the input in the simplest way
@@ -572,8 +574,49 @@ def _parse_raw_tfoot(self, table):
         expr = './/tfoot//th'
         return [_remove_whitespace(x.text_content()) for x in
                 table.xpath(expr)]
+        
 
+class _LiberalLxmlFrameParser(_LxmlFrameParser):
+    """HTML to DataFrame parser that uses lxml under the hood.
+    
+    Tries hard to parse through broken XML.
 
+    Warning
+    -------
+    This parser can only handle HTTP, FTP, and FILE urls.
+
+    See Also
+    --------
+    _LxmlFrameParser
+    _HtmlFrameParser
+    _BeautifulSoupLxmlFrameParser
+
+    Notes
+    -----
+    It lets libxml2 try its best to return a valid HTML tree 
+    with all content it can manage to parse. 
+    It will not raise an exception on parser errors. 
+    You should use libxml2 version 2.6.21 or newer 
+    to take advantage of this feature.
+    
+    The support for parsing broken HTML depends entirely on libxml2's 
+    recovery algorithm. 
+    It is not the fault of lxml if you find documents that 
+    are so heavily broken that the parser cannot handle them. 
+    There is also no guarantee that the resulting tree will 
+    contain all data from the original document. 
+    The parser may have to drop seriously broken parts when 
+    struggling to keep parsing. 
+    Especially misplaced meta tags can suffer from this, 
+    which may lead to encoding problems.
+    
+    Documentation strings for this class are in the base class
+    :class:`_HtmlFrameParser`.
+    """
+    
+    def __init__(self, *args, **kwargs):
+        super(_LiberalLxmlFrameParser, self).__init__(*args, strict=False, **kwargs)
+    
 def _expand_elements(body):
     lens = Series(lmap(len, body))
     lens_max = lens.max()
@@ -611,7 +654,8 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,
 
 _valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
                   'html5lib': _BeautifulSoupHtml5LibFrameParser,
-                  'bs4': _BeautifulSoupHtml5LibFrameParser}
+                  'bs4': _BeautifulSoupHtml5LibFrameParser,
+                  'lxml-liberal': _LiberalLxmlFrameParser,}
 
 
 def _parser_dispatch(flavor):
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 9b0fb1cacfb65..eb34ff7ed3b1e 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -606,6 +606,13 @@ def test_data_fail(self):
         with tm.assertRaises(XMLSyntaxError):
             self.read_html(banklist_data, flavor=['lxml'])
 
+    def test_lxml_liberal(self):
+        banklist_data = os.path.join(DATA_PATH, 'banklist.html')
+        
+        dfs = self.read_html(banklist_data, flavor=['lxml-liberal'])
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+            
     def test_works_on_valid_markup(self):
         filename = os.path.join(DATA_PATH, 'valid_markup.html')
         dfs = self.read_html(filename, index_col=0, flavor=['lxml'])

From 46c3fe82c00bdede96f4c7539cbe9ff7a2004032 Mon Sep 17 00:00:00 2001
From: Alex Rothberg <agrothberg@gmail.com>
Date: Sun, 6 Oct 2013 22:01:27 -0400
Subject: [PATCH 2/3] - Changed API for HTML parsers - Use partial for
 lxml-liberal rather than a new class

---
 pandas/io/html.py | 98 +++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 59 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index b7607e37e26c0..f0cc5de7b2a32 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -8,6 +8,7 @@
 import numbers
 import collections
 import warnings
+from functools import partial
 
 from distutils.version import LooseVersion
 
@@ -165,13 +166,12 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
-        self.io = io
+    def __init__(self, match, attrs):
         self.match = match
         self.attrs = attrs
 
-    def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+    def parse_tables(self, io):
+        tables = self._parse_tables(self._build_doc(io), self.match, self.attrs)
         return (self._build_table(table) for table in tables)
 
     def _parse_raw_data(self, rows):
@@ -314,7 +314,7 @@ def _parse_tfoot(self, table):
         """
         raise NotImplementedError
 
-    def _build_doc(self):
+    def _build_doc(self, io):
         """Return a tree-like object that can be used to iterate over the DOM.
 
         Returns
@@ -414,15 +414,15 @@ def _parse_tables(self, doc, match, attrs):
                              match.pattern)
         return result
 
-    def _setup_build_doc(self):
-        raw_text = _read(self.io)
+    def _setup_build_doc(self, io):
+        raw_text = _read(io)
         if not raw_text:
-            raise ValueError('No text parsed from document: %s' % self.io)
+            raise ValueError('No text parsed from document: %s' % io)
         return raw_text
 
-    def _build_doc(self):
+    def _build_doc(self, io):
         from bs4 import BeautifulSoup
-        return BeautifulSoup(self._setup_build_doc(), features='html5lib')
+        return BeautifulSoup(self._setup_build_doc(io), features='html5lib')
 
 
 def _build_xpath_expr(attrs):
@@ -502,7 +502,7 @@ def _parse_tables(self, doc, match, kwargs):
             raise ValueError("No tables found matching regex %r" % pattern)
         return tables
 
-    def _build_doc(self):
+    def _build_doc(self, io):
         """
         Raises
         ------
@@ -525,7 +525,7 @@ def _build_doc(self):
 
         try:
             # try to parse the input in the simplest way
-            r = parse(self.io, parser=parser)
+            r = parse(io, parser=parser)
 
             try:
                 r = r.getroot()
@@ -533,8 +533,8 @@ def _build_doc(self):
                 pass
         except (UnicodeDecodeError, IOError):
             # if the input is a blob of html goop
-            if not _is_url(self.io):
-                r = fromstring(self.io, parser=parser)
+            if not _is_url(io):
+                r = fromstring(io, parser=parser)
 
                 try:
                     r = r.getroot()
@@ -542,7 +542,7 @@ def _build_doc(self):
                     pass
             else:
                 # not a url
-                scheme = parse_url(self.io).scheme
+                scheme = parse_url(io).scheme
                 if scheme not in _valid_schemes:
                     # lxml can't parse it
                     msg = ('%r is not a valid url scheme, valid schemes are '
@@ -576,47 +576,6 @@ def _parse_raw_tfoot(self, table):
                 table.xpath(expr)]
         
 
-class _LiberalLxmlFrameParser(_LxmlFrameParser):
-    """HTML to DataFrame parser that uses lxml under the hood.
-    
-    Tries hard to parse through broken XML.
-
-    Warning
-    -------
-    This parser can only handle HTTP, FTP, and FILE urls.
-
-    See Also
-    --------
-    _LxmlFrameParser
-    _HtmlFrameParser
-    _BeautifulSoupLxmlFrameParser
-
-    Notes
-    -----
-    It lets libxml2 try its best to return a valid HTML tree 
-    with all content it can manage to parse. 
-    It will not raise an exception on parser errors. 
-    You should use libxml2 version 2.6.21 or newer 
-    to take advantage of this feature.
-    
-    The support for parsing broken HTML depends entirely on libxml2's 
-    recovery algorithm. 
-    It is not the fault of lxml if you find documents that 
-    are so heavily broken that the parser cannot handle them. 
-    There is also no guarantee that the resulting tree will 
-    contain all data from the original document. 
-    The parser may have to drop seriously broken parts when 
-    struggling to keep parsing. 
-    Especially misplaced meta tags can suffer from this, 
-    which may lead to encoding problems.
-    
-    Documentation strings for this class are in the base class
-    :class:`_HtmlFrameParser`.
-    """
-    
-    def __init__(self, *args, **kwargs):
-        super(_LiberalLxmlFrameParser, self).__init__(*args, strict=False, **kwargs)
-    
 def _expand_elements(body):
     lens = Series(lmap(len, body))
     lens_max = lens.max()
@@ -655,7 +614,7 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types,
 _valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
                   'html5lib': _BeautifulSoupHtml5LibFrameParser,
                   'bs4': _BeautifulSoupHtml5LibFrameParser,
-                  'lxml-liberal': _LiberalLxmlFrameParser,}
+                  'lxml-liberal': partial(_LxmlFrameParser, strict=False),}
 
 
 def _parser_dispatch(flavor):
@@ -740,10 +699,10 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        p = parser(compiled_match, attrs)
 
         try:
-            tables = p.parse_tables()
+            tables = p.parse_tables(io)
         except Exception as caught:
             retained = caught
         else:
@@ -781,6 +740,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         each other, they are both there for backwards compatibility. The
         default of ``None`` tries to use ``lxml`` to parse and if that fails it
         falls back on ``bs4`` + ``html5lib``.
+        ``'lxml-liberal'`` uses the lxml parser in recovery mode: markup
+        errors are ignored rather than raised, and whatever tables lxml
+        is still able to find in the parsed document are returned.
 
     header : int or list-like or None, optional
         The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
@@ -860,6 +822,24 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
     This function will *always* return a list of :class:`DataFrame` *or*
     it will fail, e.g., it will *not* return an empty list.
+    
+    The ``lxml-liberal`` flavor tries hard to parse through broken HTML.
+    It lets libxml2 try its best to return a valid HTML tree
+    with all of the content it can manage to parse.
+    It will not raise an exception on parser errors.
+    You should use libxml2 version 2.6.21 or newer
+    to take advantage of this feature.
+
+    The support for parsing broken HTML depends entirely on libxml2's
+    recovery algorithm.
+    It is not the fault of lxml if you find documents that
+    are so heavily broken that the parser cannot handle them.
+    There is also no guarantee that the resulting tree will
+    contain all data from the original document.
+    The parser may have to drop seriously broken parts when
+    struggling to keep parsing.
+    Misplaced meta tags in particular can suffer from this,
+    which may lead to encoding problems.
 
     Examples
     --------

From daad56198af978a2b548efa992ab53e8c41fd903 Mon Sep 17 00:00:00 2001
From: Alex Rothberg <agrothberg@gmail.com>
Date: Wed, 9 Oct 2013 00:25:51 -0400
Subject: [PATCH 3/3] Additional tests

---
 pandas/io/tests/test_html.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index eb34ff7ed3b1e..7415e33c1ece0 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -612,6 +612,26 @@ def test_lxml_liberal(self):
         dfs = self.read_html(banklist_data, flavor=['lxml-liberal'])
         for df in dfs:
             tm.assert_isinstance(df, DataFrame)
+            self.assertFalse(df.empty)
+    
+    @slow        
+    def test_lxml_liberal2(self):
+        _skip_if_no('bs4')
+        banklist_data = os.path.join(DATA_PATH, 'banklist.html')
+        
+        dfs_lxml = self.read_html(banklist_data, flavor=['lxml-liberal'])
+        dfs_bs4 = self.read_html(banklist_data, flavor=['bs4'])
+        
+        if len(dfs_lxml) != len(dfs_bs4):
+            return 
+        
+        for df_lxml,df_bs4 in zip(dfs_lxml, dfs_bs4):
+            try:
+                tm.assert_frame_equal(df_lxml,df_bs4)
+            except AssertionError:
+                return
+            
+        self.fail()
             
     def test_works_on_valid_markup(self):
         filename = os.path.join(DATA_PATH, 'valid_markup.html')