elements.
+ rows : list of row elements of a table, usually <tr> or <th>
+ elements. """ raise AbstractMethodError(self) - def _parse_thead(self, table): + def _extract_thead(self, table): """Return the header of a table. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - thead : node-like - A ... element. + thead : an HTML ... element. """ raise AbstractMethodError(self) - def _parse_tbody(self, table): + def _extract_tbody(self, table): """Return the body of the table. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - tbody : node-like - A | + - Move rows from bottom of body to footer only if + all elements inside row are |
+
+ Parameters
+ ----------
+ table_html : a single HTML table element.
+
+ Returns
+ -------
+ tuple of (header, body, footer)
+ header : list of rows, each of which is a list of parsed
+ header elements
+ body : list of rows, each of which is a list of parsed body elements
+ footer : list of rows, each of which is a list of parsed
+ footer elements
+ """
+
+ header_rows = []
+ body_rows = []
+ footer_rows = []
+
+ # first, are there thead and tbody elements in the table?
+ if (self._contains_tag(table_html, 'thead') and
+ self._contains_tag(table_html, 'tbody')):
+ header_rows = self._extract_tr(self._extract_thead(table_html)[0])
+ body_rows = self._extract_tr(self._extract_tbody(table_html)[0])
+
+ if self._contains_tag(table_html, 'tfoot'):
+ footer_rows = self._extract_tr(
+ self._extract_tfoot(table_html)[0])
+ else:
+ # Otherwise we need to split the body into header/body/foot.
+ body_rows = self._extract_tr(table_html)
+ if body_rows == []:
+ # empty table, just return nothing
+ return [], [], []
+ # splitting criterion: if all tags within a row are th, it's part
+ # of the header/footer
+ while all(self._equals_tag(t, 'th') for t in
+ self._extract_td(body_rows[0])):
+ # this row should be a header row, move it from body to header
+ header_rows.append(body_rows.pop(0))
+ while all(self._equals_tag(t, 'th') for t in
+ self._extract_td(body_rows[-1])):
+ # this row should be a footer row, move it from body to footer
+ footer_rows.insert(0, body_rows.pop())
+
+ header = self._expand_colspan_rowspan(header_rows, fill_rowspan=False)
+ body = self._expand_colspan_rowspan(body_rows, fill_rowspan=True)
+ footer = self._expand_colspan_rowspan(footer_rows, fill_rowspan=False)
+
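+ # Carried over from _parse_raw_tfoot: when the footer consists of exactly
+ # one row, collapse it into a flat (at least 1-D) NumPy array; a footer
+ # with zero or several rows is returned unchanged.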
+ footer = np.atleast_1d(np.array(footer).squeeze(
+ )) if footer and len(footer) == 1 else footer
+ return header, body, footer
+
+ def _expand_colspan_rowspan(self, rows, fill_rowspan=True):
+ """Given a list of rows, return a list of rows that properly handle
+ colspan/rowspan
+
+ Discussion on behavior of fill_rowspan in #17073
+
+ Parameters
+ ----------
+ rows : list of rows, each of which is a list of elements in that row
+
+ fill_rowspan : boolean
+ Should a rowspan fill every item in the rowspan (True) or only the
+ topmost element, leaving the cells below it empty (False)? Default is
+ True.
+
+ Returns
+ -------
+ res : list of rows, each of which is a list of elements in that row,
+ respecting colspan/rowspan
+ """
+
res = []
- if thead:
- trs = self._parse_tr(thead[0])
- for tr in trs:
- cols = lmap(self._text_getter, self._parse_td(tr))
- if any([col != '' for col in cols]):
- res.append(cols)
+ saved_span = []
+ for row in rows:
+ extracted_row = self._extract_td(row)
+ cols_text = [_remove_whitespace(
+ self._text_getter(col)) for col in extracted_row]
+ col_colspans = [int(col.get('colspan', 1))
+ for col in extracted_row]
+ col_rowspans = [int(col.get('rowspan', 1))
+ for col in extracted_row]
+ # expand cols using col_colspans: repeat each cell's text `colspan`
+ # times and pair every copy with that cell's rowspan
+ cols = list(zip(
+ list(flatten(
+ lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
+ list(zip(cols_text, col_colspans))))),
+ list(flatten(
+ lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
+ list(zip(col_colspans, col_rowspans))))))
+ )
+ # cols is now a list of (text, number of rows)
+ # now insert any previous rowspans
+ for (col, (text, nr)) in saved_span:
+ cols.insert(col, (text, nr))
+
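+ # build the next saved_span: carry every cell whose rowspan is not yet
+ # exhausted over to the following row, remembering its column position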
+ def advance_item_to_next_row(item):
+ (col, (text, nr)) = item
+ if nr == 1:
+ return None
+ else:
+ # only keep the text around if fill_rowspan is set
+ return (col, (text if fill_rowspan else '', nr - 1))
+ saved_span = lfilter(lambda i: i is not None,
+ lmap(advance_item_to_next_row,
+ list(enumerate(cols))))
+ cols = [text for (text, nr) in cols]
+ # cols now holds text only; rows whose cells are all empty are dropped
+ if any([col != '' for col in cols]):
+ res.append(cols)
return res
- def _parse_raw_tfoot(self, table):
- tfoot = self._parse_tfoot(table)
+ def _parse_raw_tfoot(self, table_html):
+ tfoot = self._extract_tfoot(table_html)
res = []
if tfoot:
- res = lmap(self._text_getter, self._parse_td(tfoot[0]))
+ res = lmap(self._text_getter, self._extract_td(tfoot[0]))
return np.atleast_1d(
np.array(res).squeeze()) if res and len(res) == 1 else res
- def _parse_raw_tbody(self, table):
- tbody = self._parse_tbody(table)
-
- try:
- res = self._parse_tr(tbody[0])
- except IndexError:
- res = self._parse_tr(table)
- return self._parse_raw_data(res)
-
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -401,27 +519,6 @@ def __init__(self, *args, **kwargs):
from bs4 import SoupStrainer
self._strainer = SoupStrainer('table')
- def _text_getter(self, obj):
- return obj.text
-
- def _parse_td(self, row):
- return row.find_all(('td', 'th'))
-
- def _parse_tr(self, element):
- return element.find_all('tr')
-
- def _parse_th(self, element):
- return element.find_all('th')
-
- def _parse_thead(self, table):
- return table.find_all('thead')
-
- def _parse_tbody(self, table):
- return table.find_all('tbody')
-
- def _parse_tfoot(self, table):
- return table.find_all('tfoot')
-
def _parse_tables(self, doc, match, attrs):
element_name = self._strainer.name
tables = doc.find_all(element_name, attrs=attrs)
@@ -443,6 +540,33 @@ def _parse_tables(self, doc, match, attrs):
match.pattern)
return result
+ def _text_getter(self, obj):
+ return obj.text
+
+ def _equals_tag(self, obj, tag):
+ return obj.name == tag
+
+ def _contains_tag(self, obj, tag):
+ return obj.find(tag) is not None
+
+ def _extract_td(self, row):
+ return row.find_all(('td', 'th'))
+
+ def _extract_tr(self, element):
+ return element.find_all('tr')
+
+ def _extract_th(self, element):
+ return element.find_all('th')
+
+ def _extract_thead(self, table):
+ return table.find_all('thead')
+
+ def _extract_tbody(self, table):
+ return table.find_all('tbody')
+
+ def _extract_tfoot(self, table):
+ return table.find_all('tfoot')
+
def _setup_build_doc(self):
raw_text = _read(self.io)
if not raw_text:
@@ -502,16 +626,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
def __init__(self, *args, **kwargs):
super(_LxmlFrameParser, self).__init__(*args, **kwargs)
- def _text_getter(self, obj):
- return obj.text_content()
-
- def _parse_td(self, row):
- return row.xpath('.//td|.//th')
-
- def _parse_tr(self, table):
- expr = './/tr[normalize-space()]'
- return table.xpath(expr)
-
def _parse_tables(self, doc, match, kwargs):
pattern = match.pattern
@@ -531,6 +645,22 @@ def _parse_tables(self, doc, match, kwargs):
raise ValueError("No tables found matching regex %r" % pattern)
return tables
+ def _equals_tag(self, obj, tag):
+ return obj.tag == tag
+
+ def _contains_tag(self, obj, tag):
+ return obj.find(tag) is not None
+
+ def _text_getter(self, obj):
+ return obj.text_content()
+
+ def _extract_td(self, row):
+ return row.xpath('.//td|.//th')
+
+ def _extract_tr(self, table):
+ expr = './/tr[normalize-space()]'
+ return table.xpath(expr)
+
def _build_doc(self):
"""
Raises
@@ -585,13 +715,13 @@ def _build_doc(self):
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
return r
- def _parse_tbody(self, table):
+ def _extract_tbody(self, table):
return table.xpath('.//tbody')
- def _parse_thead(self, table):
+ def _extract_thead(self, table):
return table.xpath('.//thead')
- def _parse_tfoot(self, table):
+ def _extract_tfoot(self, table):
return table.xpath('.//tfoot')
def _parse_raw_thead(self, table):
@@ -599,10 +729,10 @@ def _parse_raw_thead(self, table):
thead = table.xpath(expr)
res = []
if thead:
- trs = self._parse_tr(thead[0])
+ trs = self._extract_tr(thead[0])
for tr in trs:
cols = [_remove_whitespace(x.text_content()) for x in
- self._parse_td(tr)]
+ self._extract_td(tr)]
if any([col != '' for col in cols]):
res.append(cols)
return res
@@ -873,7 +1003,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
This function searches for ``
|