diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b24a6f067cee4..6214236b41e7c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -125,6 +125,7 @@ Other Enhancements - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) +- :func:`read_html` handles colspan and rowspan attributes and attempts to infer a header if the header is not explicitly specified (:issue:`17054`) - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`. (:issue:`15838`, :issue:`17438`) - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). 
diff --git a/pandas/io/html.py b/pandas/io/html.py index a4acb26af5259..d0021e9cc93d0 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,10 +17,10 @@ from pandas.io.common import (_is_url, urlopen, parse_url, _validate_header_arg) from pandas.io.parsers import TextParser -from pandas.compat import (lrange, lmap, u, string_types, iteritems, +from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems, raise_with_traceback, binary_type) from pandas import Series -from pandas.core.common import AbstractMethodError +from pandas.core.common import (AbstractMethodError, flatten) from pandas.io.formats.printing import pprint_thing _IMPORTS = False @@ -176,13 +176,15 @@ class _HtmlFrameParser(object): ----- To subclass this class effectively you must override the following methods: * :func:`_build_doc` - * :func:`_text_getter` - * :func:`_parse_td` * :func:`_parse_tables` - * :func:`_parse_tr` - * :func:`_parse_thead` - * :func:`_parse_tbody` - * :func:`_parse_tfoot` + * :func:`_text_getter` + * :func:`_equals_tag` + * :func:`_contains_tag` + * :func:`_extract_td` + * :func:`_extract_tr` + * :func:`_extract_thead` + * :func:`_extract_tbody` + * :func:`_extract_tfoot` See each method's respective documentation for details on their functionality. """ @@ -194,40 +196,45 @@ def __init__(self, io, match, attrs, encoding): self.encoding = encoding def parse_tables(self): + """Parse and return all tables from the DOM. + + Returns + ------- + tables : generator of parsed (header, body, footer) tuples from tables + """ tables = self._parse_tables(self._build_doc(), self.match, self.attrs) return (self._build_table(table) for table in tables) - def _parse_raw_data(self, rows): - """Parse the raw data into a list of lists. + def _parse_tables(self, doc, match, attrs): + """Return all tables from the parsed DOM. Parameters ---------- - rows : iterable of node-like - A list of row elements. + doc : the DOM from which to parse the table element. 
- text_getter : callable - A callable that gets the text from an individual node. This must be - defined by subclasses. + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. - column_finder : callable - A callable that takes a row node as input and returns a list of the - column node in that row. This must be defined by subclasses. + Raises + ------ + ValueError : `match` does not match any text in the document. Returns ------- - data : list of list of strings + tables : list of HTML elements to be parsed into raw data. """ - data = [[_remove_whitespace(self._text_getter(col)) for col in - self._parse_td(row)] for row in rows] - return data + raise AbstractMethodError(self) def _text_getter(self, obj): """Return the text of an individual DOM node. Parameters ---------- - obj : node-like - A DOM node. + obj : a DOM node. Returns ------- @@ -236,104 +243,104 @@ def _text_getter(self, obj): """ raise AbstractMethodError(self) - def _parse_td(self, obj): - """Return the td elements from a row element. + def _equals_tag(self, obj, tag): + """Returns whether an individual DOM node matches a tag Parameters ---------- - obj : node-like + obj : a DOM node. + + tag : str + Tag to be checked for equality Returns ------- - columns : list of node-like - These are the elements of each row, i.e., the columns. + is_tag_equal : boolean + boolean indicating if the object is equal to tag 'tag' """ raise AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): - """Return all tables from the parsed DOM. + def _contains_tag(self, obj, tag): + """Returns whether an individual DOM node has a particular tag + contained within it Parameters ---------- - doc : tree-like - The DOM from which to parse the table element. + obj : a DOM node. - match : str or regular expression - The text to search for in the DOM tree. 
+ tag : str + Tag to be found in this DOM - attrs : dict - A dictionary of table attributes that can be used to disambiguate - mutliple tables on a page. + Returns + ------- + does_tag_contain : boolean + boolean indicating if the object contains tag 'tag' + """ + raise AbstractMethodError(self) - Raises - ------ - ValueError - * If `match` does not match any text in the document. + def _extract_td(self, obj): + """Return the td elements from a row element. + + Parameters + ---------- + obj : an HTML row element Returns ------- - tables : list of node-like - A list of
elements to be parsed into raw data. + columns : list of HTML td elements (i.e., the columns in the row) """ raise AbstractMethodError(self) - def _parse_tr(self, table): + def _extract_tr(self, table): """Return the list of row elements from the parsed table element. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - rows : list of node-like - A list row elements of a table, usually or or ... element. + thead : an HTML ... element. """ raise AbstractMethodError(self) - def _parse_tbody(self, table): + def _extract_tbody(self, table): """Return the body of the table. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - tbody : node-like - A ... element. + tbody : an HTML ... element. """ raise AbstractMethodError(self) - def _parse_tfoot(self, table): + def _extract_tfoot(self, table): """Return the footer of the table if any. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - tfoot : node-like - A ... element. + tfoot : an HTML ... element. """ raise AbstractMethodError(self) @@ -342,44 +349,155 @@ def _build_doc(self): Returns ------- - obj : tree-like + obj : the DOM from which to parse the table element. """ raise AbstractMethodError(self) - def _build_table(self, table): - header = self._parse_raw_thead(table) - body = self._parse_raw_tbody(table) - footer = self._parse_raw_tfoot(table) + def _build_table(self, table_html): + header, body, footer = self._parse_raw_thead_tbody_tfoot(table_html) + # the above "footer" actually produces a footer. The below "footer" + # rarely does. The below "footer" is the legacy behavior and so I'm + # leaving it for the time being. 
+ footer = self._parse_raw_tfoot(table_html) return header, body, footer - def _parse_raw_thead(self, table): - thead = self._parse_thead(table) + def _parse_raw_thead_tbody_tfoot(self, table_html): + """Given a table, return parsed header, body, and foot. + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of parsed elements. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are
elements. + rows : list of row elements of a table, usually
+ elements. """ raise AbstractMethodError(self) - def _parse_thead(self, table): + def _extract_thead(self, table): """Return the header of a table. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - thead : node-like - A
+ - Move rows from bottom of body to footer only if + all elements inside row are + + Parameters + ---------- + table_html : a single HTML table element. + + Returns + ------- + tuple of (header, body, footer) + header : list of rows, each of which is a list of parsed + header elements + body : list of rows, each of which is a list of parsed body elements + footer : list of rows, each of which is a list of parsed + footer elements + """ + + header_rows = [] + body_rows = [] + footer_rows = [] + + # first, are there thead and tbody elements in the table? + if (self._contains_tag(table_html, 'thead') and + self._contains_tag(table_html, 'tbody')): + header_rows = self._extract_tr(self._extract_thead(table_html)[0]) + body_rows = self._extract_tr(self._extract_tbody(table_html)[0]) + + if self._contains_tag(table_html, 'tfoot'): + footer_rows = self._extract_tr( + self._extract_tfoot(table_html)[0]) + else: + # Otherwise we need to split the body into header/body/foot. + body_rows = self._extract_tr(table_html) + if body_rows == []: + # empty table, just return nothing + return [], [], [] + # splitting criterion: if all tags within a row are th, it's part + # of the header/footer + while all(self._equals_tag(t, 'th') for t in + self._extract_td(body_rows[0])): + # this row should be a header row, move it from body to header + header_rows.append(body_rows.pop(0)) + while all(self._equals_tag(t, 'th') for t in + self._extract_td(body_rows[-1])): + # this row should be a footer row, move it from body to footer + footer_rows.insert(0, body_rows.pop()) + + header = self._expand_colspan_rowspan(header_rows, fill_rowspan=False) + body = self._expand_colspan_rowspan(body_rows, fill_rowspan=True) + footer = self._expand_colspan_rowspan(footer_rows, fill_rowspan=False) + + # The below line is lifted from _parse_raw_tfoot. Not sure what + # it does. 
+ footer = np.atleast_1d(np.array(footer).squeeze( + )) if footer and len(footer) == 1 else footer + return header, body, footer + + def _expand_colspan_rowspan(self, rows, fill_rowspan=True): + """Given a list of rows, return a list of rows that properly handle + colspan/rowspan + + Discussion on behavior of fill_rowspan in #17073 + + Parameters + ---------- + rows : list of rows, each of which is a list of elements in that row + + fill_rowspan : boolean + Should a rowspan fill every item in the rowspan (True) or only the + bottommost element (False)? Default is True. + + Returns + ------- + res : list of rows, each of which is a list of elements in that row, + respecting colspan/rowspan + """ + res = [] - if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = lmap(self._text_getter, self._parse_td(tr)) - if any([col != '' for col in cols]): - res.append(cols) + saved_span = [] + for row in rows: + extracted_row = self._extract_td(row) + cols_text = [_remove_whitespace( + self._text_getter(col)) for col in extracted_row] + col_colspans = [int(col.get('colspan', 1)) + for col in extracted_row] + col_rowspans = [int(col.get('rowspan', 1)) + for col in extracted_row] + # expand cols using col_colspans + # maybe this can be done with a list comprehension, dunno + cols = list(zip( + list(flatten( + lmap(lambda text_nc: [text_nc[0]] * text_nc[1], + list(zip(cols_text, col_colspans))))), + list(flatten( + lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0], + list(zip(col_colspans, col_rowspans)))))) + ) + # cols is now a list of (text, number of rows) + # now insert any previous rowspans + for (col, (text, nr)) in saved_span: + cols.insert(col, (text, nr)) + + # save next saved_span + def advance_item_to_next_row(item): + (col, (text, nr)) = item + if nr == 1: + return None + else: + # only keep the text around if fill_rowspan is set + return (col, (text if fill_rowspan else '', nr - 1)) + saved_span = lfilter(lambda i: i is not None, + lmap(advance_item_to_next_row, 
+ list(enumerate(cols)))) + cols = [text for (text, nr) in cols] + # generate cols with text only + if any([col != '' for col in cols]): + res.append(cols) return res - def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) + def _parse_raw_tfoot(self, table_html): + tfoot = self._extract_tfoot(table_html) res = [] if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) + res = lmap(self._text_getter, self._extract_td(tfoot[0])) return np.atleast_1d( np.array(res).squeeze()) if res and len(res) == 1 else res - def _parse_raw_tbody(self, table): - tbody = self._parse_tbody(table) - - try: - res = self._parse_tr(tbody[0]) - except IndexError: - res = self._parse_tr(table) - return self._parse_raw_data(res) - class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): """HTML to DataFrame parser that uses BeautifulSoup under the hood. @@ -401,27 +519,6 @@ def __init__(self, *args, **kwargs): from bs4 import SoupStrainer self._strainer = SoupStrainer('table') - def _text_getter(self, obj): - return obj.text - - def _parse_td(self, row): - return row.find_all(('td', 'th')) - - def _parse_tr(self, element): - return element.find_all('tr') - - def _parse_th(self, element): - return element.find_all('th') - - def _parse_thead(self, table): - return table.find_all('thead') - - def _parse_tbody(self, table): - return table.find_all('tbody') - - def _parse_tfoot(self, table): - return table.find_all('tfoot') - def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) @@ -443,6 +540,33 @@ def _parse_tables(self, doc, match, attrs): match.pattern) return result + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag): + return obj.name == tag + + def _contains_tag(self, obj, tag): + return obj.find(tag) is not None + + def _extract_td(self, row): + return row.find_all(('td', 'th')) + + def _extract_tr(self, element): + return element.find_all('tr') + + def 
_extract_th(self, element): + return element.find_all('th') + + def _extract_thead(self, table): + return table.find_all('thead') + + def _extract_tbody(self, table): + return table.find_all('tbody') + + def _extract_tfoot(self, table): + return table.find_all('tfoot') + def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: @@ -502,16 +626,6 @@ class _LxmlFrameParser(_HtmlFrameParser): def __init__(self, *args, **kwargs): super(_LxmlFrameParser, self).__init__(*args, **kwargs) - def _text_getter(self, obj): - return obj.text_content() - - def _parse_td(self, row): - return row.xpath('.//td|.//th') - - def _parse_tr(self, table): - expr = './/tr[normalize-space()]' - return table.xpath(expr) - def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -531,6 +645,22 @@ def _parse_tables(self, doc, match, kwargs): raise ValueError("No tables found matching regex %r" % pattern) return tables + def _equals_tag(self, obj, tag): + return obj.tag == tag + + def _contains_tag(self, obj, tag): + return obj.find(tag) is not None + + def _text_getter(self, obj): + return obj.text_content() + + def _extract_td(self, row): + return row.xpath('.//td|.//th') + + def _extract_tr(self, table): + expr = './/tr[normalize-space()]' + return table.xpath(expr) + def _build_doc(self): """ Raises @@ -585,13 +715,13 @@ def _build_doc(self): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r - def _parse_tbody(self, table): + def _extract_tbody(self, table): return table.xpath('.//tbody') - def _parse_thead(self, table): + def _extract_thead(self, table): return table.xpath('.//thead') - def _parse_tfoot(self, table): + def _extract_tfoot(self, table): return table.xpath('.//tfoot') def _parse_raw_thead(self, table): @@ -599,10 +729,10 @@ def _parse_raw_thead(self, table): thead = table.xpath(expr) res = [] if thead: - trs = self._parse_tr(thead[0]) + trs = self._extract_tr(thead[0]) for tr in trs: cols = 
[_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + self._extract_td(tr)] if any([col != '' for col in cols]): res.append(cols) return res @@ -873,7 +1003,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This function searches for ``<table>`` elements and only for ``<tr>`` and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>`` - element in the table. ``<td>`` stands for "table data". + element in the table. ``<td>`` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the table has a ``<thead>`` element, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``<th>`` elements into the header). + + .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied **after** `skiprows` is applied.
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6fc080c8d9090..ce12df60c565a 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -28,7 +28,6 @@ from pandas.io.common import URLError, urlopen, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -385,7 +384,7 @@ def test_thousands_macau_stats(self): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self): @@ -394,7 +393,7 @@ def test_thousands_macau_index_col(self): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -640,6 +639,121 @@ def test_different_number_of_rows(self): res = self.read_html(out, index_col=0)[0] tm.assert_frame_equal(expected, res) + def test_colspan_rowspan_are_1(self): + # GH17054 + expected = """ + + + + + + + + + + +
XYZW
""" + out = """ + + + + + + + + + + +
XYZW
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_colspan_rowspan_are_more_than_1(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + + +
XXYZW
1223
""" + out = """ + + + + + + + + + + + + + + + +
XYZW
123
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_header_should_be_inferred_from_th_elements(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + +
XXYZW
12345
""" + out = """ + + + + + + + + + + + + + +
XXYZW
12345
""" + expected = self.read_html(expected)[0] # header is explicit + res = self.read_html(out)[0] # infer header + tm.assert_frame_equal(expected, res) + res2 = self.read_html(out, header=0)[0] # manually set header + tm.assert_frame_equal(expected, res2) + def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) expected = df.to_html() @@ -657,14 +771,6 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') - with tm.assert_raises_regex(ParserError, - r"Passed header=\[0,1\] are " - r"too many rows for this " - r"multi_index of columns"): - self.read_html(data, header=[0, 1]) - def test_wikipedia_states_table(self): data = os.path.join(DATA_PATH, 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data @@ -891,7 +997,7 @@ def test_computer_sales_page(self): def test_invalid_flavor(): url = 'google.com' with pytest.raises(ValueError): - read_html(url, 'google', flavor='not a* valid**++ flaver') + read_html(url, 'google', flavor='not a* valid**++ flavor') def get_elements_from_file(url, element='table'): @@ -939,6 +1045,7 @@ def test_same_ordering(): class ErrorThread(threading.Thread): + def run(self): try: super(ErrorThread, self).run()