diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 1b2033999d67d..d0b8f00150099 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -10,7 +10,7 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -.. _whatsnew_0240.enhancements.extension_array_operators +.. _whatsnew_0240.enhancements.extension_array_operators: ``ExtensionArray`` operator support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -26,6 +26,46 @@ See the :ref:`ExtensionArray Operator Support ` documentation section for details on both ways of adding operator support. +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
ABC
12
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -40,6 +80,7 @@ Other Enhancements `__. (:issue:`21627`) - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan``\ s and ``rowspan``\ s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) - @@ -329,7 +370,7 @@ MultiIndex I/O ^^^ -- +- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. 
(:issue:`21641`) - - diff --git a/pandas/io/html.py b/pandas/io/html.py index 8fd876e85889f..45fe3b017e4f6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -10,8 +10,6 @@ from distutils.version import LooseVersion -import numpy as np - from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError from pandas.io.common import _is_url, urlopen, _validate_header_arg @@ -191,13 +189,14 @@ class _HtmlFrameParser(object): ----- To subclass this class effectively you must override the following methods: * :func:`_build_doc` + * :func:`_attr_getter` * :func:`_text_getter` * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` * :func:`_parse_tables` - * :func:`_parse_tr` - * :func:`_parse_thead` - * :func:`_parse_tbody` - * :func:`_parse_tfoot` + * :func:`_equals_tag` See each method's respective documentation for details on their functionality. """ @@ -210,35 +209,39 @@ def __init__(self, io, match, attrs, encoding, displayed_only): self.displayed_only = displayed_only def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ tables = self._parse_tables(self._build_doc(), self.match, self.attrs) - return (self._build_table(table) for table in tables) + return (self._parse_thead_tbody_tfoot(table) for table in tables) - def _parse_raw_data(self, rows): - """Parse the raw data into a list of lists. + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. Parameters ---------- - rows : iterable of node-like - A list of row elements. - - text_getter : callable - A callable that gets the text from an individual node. This must be - defined by subclasses. + obj : node-like + A DOM node. - column_finder : callable - A callable that takes a row node as input and returns a list of the - column node in that row. This must be defined by subclasses. 
+ attr : str or unicode + The attribute, such as "colspan" Returns ------- - data : list of list of strings + str or unicode + The attribute value. """ - data = [[_remove_whitespace(self._text_getter(col)) for col in - self._parse_td(row)] for row in rows] - return data + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) def _text_getter(self, obj): - """Return the text of an individual DOM node. + """ + Return the text of an individual DOM node. Parameters ---------- @@ -258,161 +261,257 @@ def _parse_td(self, obj): Parameters ---------- obj : node-like + A DOM node. Returns ------- - columns : list of node-like + list of node-like These are the elements of each row, i.e., the columns. """ raise com.AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): - """Return all tables from the parsed DOM. + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. Parameters ---------- - doc : tree-like - The DOM from which to parse the table element. - - match : str or regular expression - The text to search for in the DOM tree. - - attrs : dict - A dictionary of table attributes that can be used to disambiguate - multiple tables on a page. - - Raises - ------ - ValueError - * If `match` does not match any text in the document. + table : a table element that contains zero or more thead elements. Returns ------- - tables : list of node-like - A list of <table> elements to be parsed into raw data. + list of node-like + These are the <tr> row elements of a table. """ raise com.AbstractMethodError(self) - def _parse_tr(self, table): - """Return the list of row elements from the parsed table element. + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more <tbody> elements (which + only contain <tr> elements) or 0 or more <tr> elements. This method + checks for both structures. 
Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - rows : list of node-like - A list row elements of a table, usually or row elements of a table. """ raise com.AbstractMethodError(self) - def _parse_thead(self, table): - """Return the header of a table. + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - thead : node-like - A ... element. + list of node-like + These are the row elements of a table. """ raise com.AbstractMethodError(self) - def _parse_tbody(self, table): - """Return the list of tbody elements from the parsed table element. + def _parse_tables(self, doc, match, attrs): + """ + Return all tables from the parsed DOM. Parameters ---------- - table : node-like - A table element that contains row elements. + doc : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. Returns ------- - tbodys : list of node-like - A list of ... elements + list of node-like + HTML
elements. + list of node-like + These are the
elements to be parsed into raw data. """ raise com.AbstractMethodError(self) - def _parse_tfoot(self, table): - """Return the footer of the table if any. + def _equals_tag(self, obj, tag): + """ + Return whether an individual DOM node matches a tag Parameters ---------- - table : node-like - A table element that contains row elements. + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. Returns ------- - tfoot : node-like - A ... element. + boolean + Whether `obj`'s tag name is `tag` """ raise com.AbstractMethodError(self) def _build_doc(self): - """Return a tree-like object that can be used to iterate over the DOM. + """ + Return a tree-like object that can be used to iterate over the DOM. Returns ------- - obj : tree-like + node-like + The DOM from which to parse the table element. """ raise com.AbstractMethodError(self) - def _build_table(self, table): - header = self._parse_raw_thead(table) - body = self._parse_raw_tbody(table) - footer = self._parse_raw_tfoot(table) + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . 
Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + return header, body, footer - def _parse_raw_thead(self, table): - thead = self._parse_thead(table) - res = [] - if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = lmap(self._text_getter, self._parse_td(tr)) - if any(col != '' for col in cols): - res.append(cols) - return res - - def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) - res = [] - if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res - - def _parse_raw_tbody(self, table): - tbodies = self._parse_tbody(table) - - raw_data = [] - - if tbodies: - for tbody in tbodies: - raw_data.extend(self._parse_tr(tbody)) - else: - raw_data.extend(self._parse_tr(table)) + def _expand_colspan_rowspan(self, rows): + """ + Given a list of s, return a list of text rows. - return self._parse_raw_data(raw_data) + Parameters + ---------- + rows : list of node-like + List of s + + Returns + ------- + list of list + Each returned row is a list of str text. + + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + + all_texts = [] # list of rows, each a list of str + remainder = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). 
+ return row.xpath('./td|./th') def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -590,6 +688,9 @@ def _parse_tables(self, doc, match, kwargs): .format(patt=pattern)) return tables + def _equals_tag(self, obj, tag): + return obj.tag == tag + def _build_doc(self): """ Raises @@ -637,41 +738,32 @@ def _build_doc(self): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r - def _parse_tbody(self, table): - return table.xpath('.//tbody') - - def _parse_thead(self, table): - return table.xpath('.//thead') - - def _parse_tfoot(self, table): - return table.xpath('.//tfoot') - - def _parse_raw_thead(self, table): - expr = './/thead' - thead = table.xpath(expr) - res = [] - if thead: - # Grab any directly descending table headers first - ths = thead[0].xpath('./th') - if ths: - cols = [_remove_whitespace(x.text_content()) for x in ths] - if any(col != '' for col in cols): - res.append(cols) - else: - trs = self._parse_tr(thead[0]) + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath('.//thead'): + rows.extend(thead.xpath('./tr')) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. 
+ elements_at_root = thead.xpath('./td|./th') + if elements_at_root: + rows.append(thead) - for tr in trs: - cols = [_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + return rows - if any(col != '' for col in cols): - res.append(cols) - return res + def _parse_tbody_tr(self, table): + from_tbody = table.xpath('.//tbody//tr') + from_root = table.xpath('./tr') + # HTML spec: at most one of these lists has content + return from_tbody + from_root - def _parse_raw_tfoot(self, table): - expr = './/tfoot//th|//tfoot//td' - return [_remove_whitespace(x.text_content()) for x in - table.xpath(expr)] + def _parse_tfoot_tr(self, table): + return table.xpath('.//tfoot//tr') def _expand_elements(body): @@ -689,13 +781,19 @@ def _data_to_frame(**kwargs): header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: - rows = lrange(len(head)) body = head + body - if header is None: # special case when a table has or top
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, 'th') for t in + self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + rowspan = int(self._attr_getter(td, 'rowspan') or 1) + colspan = int(self._attr_getter(td, 'colspan') or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts def _handle_hidden_tables(self, tbl_list, attr_name): - """Returns list of tables, potentially removing hidden elements + """ + Return list of tables, potentially removing hidden elements Parameters ---------- - tbl_list : list of Tag or list of Element + tbl_list : list of node-like Type of list elements will vary depending upon parser used attr_name : str Name of the accessor for retrieving HTML attributes Returns ------- - list of Tag or list of Element + list of node-like Return type matches `tbl_list` """ if not self.displayed_only: @@ -442,27 +541,6 @@ def __init__(self, *args, **kwargs): from bs4 import SoupStrainer self._strainer = SoupStrainer('table') - def _text_getter(self, obj): - return obj.text - - def _parse_td(self, row): - return 
row.find_all(('td', 'th')) - - def _parse_tr(self, element): - return element.find_all('tr') - - def _parse_th(self, element): - return element.find_all('th') - - def _parse_thead(self, table): - return table.find_all('thead') - - def _parse_tbody(self, table): - return table.find_all('tbody') - - def _parse_tfoot(self, table): - return table.find_all('tfoot') - def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) @@ -490,6 +568,27 @@ def _parse_tables(self, doc, match, attrs): .format(patt=match.pattern)) return result + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag): + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(('td', 'th'), recursive=False) + + def _parse_thead_tr(self, table): + return table.select('thead tr') + + def _parse_tbody_tr(self, table): + from_tbody = table.select('tbody tr') + from_root = table.find_all('tr', recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select('tfoot tr') + def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: @@ -554,10 +653,9 @@ def _text_getter(self, obj): return obj.text_content() def _parse_td(self, row): - return row.xpath('.//td|.//th') - - def _parse_tr(self, table): - return table.xpath('.//tr') + # Look for direct children only: the "row" element here may be a + #
foobar
elements - header = 0 if rows == [0] else rows + + # Infer header when there is a
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) + if any(text for text in row)] if foot: - body += [foot] + body += foot # fill out elements of body that are "ragged" _expand_elements(body) @@ -953,7 +1051,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This function searches for ```` elements and only for ```` and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``
`` rows and ```` elements within each ``
`` - element in the table. ```` stands for "table data". + element in the table. ```` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``
`` elements into the header). + + .. versionadded:: 0.24.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied **after** `skiprows` is applied. diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c6a8de7ed446..b78c4f27d8c3f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -15,10 +15,10 @@ date_range, Series) from pandas.compat import (map, zip, StringIO, BytesIO, is_platform_windows, PY3, reload) +from pandas.errors import ParserError from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -129,16 +129,7 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam_no_types(self): - - # infer_types removed in #10892 - df1 = self.read_html(self.spam_data, '.*Water.*') - df2 = self.read_html(self.spam_data, 'Unit') - assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' - - def test_spam_with_types(self): + def test_spam(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) @@ -157,7 +148,7 @@ def test_banklist_no_match(self): assert isinstance(df, DataFrame) def test_spam_header(self): - df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] + df = self.read_html(self.spam_data, '.*Water.*', header=2)[0] assert df.columns[0] == 'Proximates' assert not df.empty @@ -387,32 +378,33 @@ def test_empty_tables(self): """ Make sure that read_html ignores empty tables. """ - data1 = ''' - - - - - - - - - - - - -
AB
12
''' - data2 = data1 + ''' - - -
''' - res1 = self.read_html(StringIO(data1)) - res2 = self.read_html(StringIO(data2)) - assert_framelist_equal(res1, res2) + result = self.read_html(''' + + + + + + + + + + + + + +
AB
12
+ + + +
+ ''') + + assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. - data = ''' + result = self.read_html('''
@@ -431,9 +423,10 @@ def test_multiple_tbody(self): -
A4
''' - expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) - result = self.read_html(StringIO(data))[0] +
''')[0] + + expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B']) + tm.assert_frame_equal(result, expected) def test_header_and_one_column(self): @@ -441,9 +434,7 @@ def test_header_and_one_column(self): Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - data = StringIO(''' - - + result = self.read_html('''
@@ -454,11 +445,36 @@ def test_header_and_one_column(self): -
Headerfirst
- - ''') +
''')[0] + expected = DataFrame(data={'Header': 'first'}, index=[0]) - result = self.read_html(data)[0] + + tm.assert_frame_equal(result, expected) + + def test_thead_without_tr(self): + """ + Ensure parser adds within on malformed HTML. + """ + result = self.read_html(''' + + + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
''')[0] + + expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]], + columns=['Country', 'Municipality', 'Year']) + tm.assert_frame_equal(result, expected) def test_tfoot_read(self): @@ -484,63 +500,51 @@ def test_tfoot_read(self): ''' + expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B']) + + expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']], + columns=['A', 'B']) + data1 = data_template.format(footer="") data2 = data_template.format( footer="footAfootB") - d1 = {'A': ['bodyA'], 'B': ['bodyB']} - d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']} + result1 = self.read_html(data1)[0] + result2 = self.read_html(data2)[0] - tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1)) - tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2)) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) - def test_countries_municipalities(self): - # GH5048 - data1 = StringIO(''' - - - - - - - - - - - - - - -
CountryMunicipalityYear
UkraineOdessa1944
''') - data2 = StringIO(''' - - + def test_parse_header_of_non_string_column(self): + # GH5048: if header is specified explicitly, an int column should be + # parsed as int while its header is parsed as str + result = self.read_html(''' +
- - - + + - - + - -
CountryMunicipalityYearSI
UkraineOdessatext 1944
''') - res1 = self.read_html(data1) - res2 = self.read_html(data2, header=0) - assert_framelist_equal(res1, res2) + + ''', header=0)[0] + + expected = DataFrame([['text', 1944]], columns=('S', 'I')) + + tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] - columns = Index(['Issue(Roll over for charts and headlines)', - 'Volume', 'Price', 'Chg', '% Chg']) + expected = Index(['Issue(Roll over for charts and headlines)', + 'Volume', 'Price', 'Chg', '% Chg']) nrows = 100 assert df.shape[0] == nrows - tm.assert_index_equal(df.columns, columns) + tm.assert_index_equal(df.columns, expected) @pytest.mark.slow def test_banklist_header(self, datapath): @@ -592,8 +596,8 @@ def test_gold_canyon(self): attrs={'id': 'table'})[0] assert gc in df.to_string() - def test_different_number_of_rows(self): - expected = """ + def test_different_number_of_cols(self): + expected = self.read_html("""
@@ -622,8 +626,9 @@ def test_different_number_of_rows(self): -
0.222
""" - out = """ +
""", index_col=0)[0] + + result = self.read_html(""" @@ -649,10 +654,151 @@ def test_different_number_of_rows(self): -
0.222
""" - expected = self.read_html(expected, index_col=0)[0] - res = self.read_html(out, index_col=0)[0] - tm.assert_frame_equal(expected, res) + """, index_col=0)[0] + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_1(self): + # GH17054 + result = self.read_html(""" + + + + + + + + + + + +
ABC
abc
+ """)[0] + + expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C']) + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_copy_values(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # X x Y Z W + # A B b z C + + result = self.read_html(""" + + + + + + + + + + + + +
XYZW
ABC
+ """, header=0)[0] + + expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']], + columns=['X', 'X.1', 'Y', 'Z', 'W']) + + tm.assert_frame_equal(result, expected) + + def test_colspan_rowspan_both_not_1(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # A B b b C + # a b b b D + + result = self.read_html(""" + + + + + + + + + +
ABC
D
+ """, header=0)[0] + + expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']], + columns=['A', 'B', 'B.1', 'B.2', 'C']) + + tm.assert_frame_equal(result, expected) + + def test_rowspan_at_end_of_row(self): + # GH17054 + + # In ASCII, with lowercase letters being copies: + # + # A B + # C b + + result = self.read_html(""" + + + + + + + + +
AB
C
+ """, header=0)[0] + + expected = DataFrame(data=[['C', 'B']], columns=['A', 'B']) + + tm.assert_frame_equal(result, expected) + + def test_rowspan_only_rows(self): + # GH17054 + + result = self.read_html(""" + + + + + +
AB
+ """, header=0)[0] + + expected = DataFrame(data=[['A', 'B'], ['A', 'B']], + columns=['A', 'B']) + + tm.assert_frame_equal(result, expected) + + def test_header_inferred_from_rows_with_only_th(self): + # GH17054 + result = self.read_html(""" + + + + + + + + + + + + + +
AB
ab
12
+ """)[0] + + columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], + labels=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) + + tm.assert_frame_equal(result, expected) def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) @@ -689,10 +835,26 @@ def test_wikipedia_states_table(self, datapath): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') - def test_decimal_rows(self): + def test_parser_error_on_empty_header_row(self): + with tm.assert_raises_regex(ParserError, + r"Passed header=\[0,1\] are " + r"too many rows for this " + r"multi_index of columns"): + self.read_html(""" + + + + + + + + +
AB
ab
+ """, header=[0, 1]) + def test_decimal_rows(self): # GH 12907 - data = StringIO(''' + result = self.read_html(''' @@ -707,9 +869,10 @@ def test_decimal_rows(self):
- ''') + ''', decimal='#')[0] + expected = DataFrame(data={'Header': 1100.101}, index=[0]) - result = self.read_html(data, decimal='#')[0] + assert result['Header'].dtype == np.dtype('float64') tm.assert_frame_equal(result, expected) @@ -717,53 +880,61 @@ def test_bool_header_arg(self): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - read_html(self.spam_data, header=arg) + self.read_html(self.spam_data, header=arg) def test_converters(self): # GH 13461 - html_data = """ - - - - - - - - - - - - -
a
0.763
0.244
""" + result = self.read_html( + """ + + + + + + + + + + + + + +
a
0.763
0.244
""", + converters={'a': str} + )[0] + + expected = DataFrame({'a': ['0.763', '0.244']}) - expected_df = DataFrame({'a': ['0.763', '0.244']}) - html_df = read_html(html_data, converters={'a': str})[0] - tm.assert_frame_equal(expected_df, html_df) + tm.assert_frame_equal(result, expected) def test_na_values(self): # GH 13461 - html_data = """ - - - - - - - - - - - - -
a
0.763
0.244
""" + result = self.read_html( + """ + + + + + + + + + + + + + +
a
0.763
0.244
""", + na_values=[0.244])[0] + + expected = DataFrame({'a': [0.763, np.nan]}) - expected_df = DataFrame({'a': [0.763, np.nan]}) - html_df = read_html(html_data, na_values=[0.244])[0] - tm.assert_frame_equal(expected_df, html_df) + tm.assert_frame_equal(result, expected) def test_keep_default_na(self): html_data = """ + @@ -778,13 +949,56 @@ def test_keep_default_na(self):
a
""" expected_df = DataFrame({'a': ['N/A', 'NA']}) - html_df = read_html(html_data, keep_default_na=False)[0] + html_df = self.read_html(html_data, keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({'a': [np.nan, np.nan]}) - html_df = read_html(html_data, keep_default_na=True)[0] + html_df = self.read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) + def test_preserve_empty_rows(self): + result = self.read_html(""" + + + + + + + + + + + + + +
AB
ab
+ """)[0] + + expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]], + columns=['A', 'B']) + + tm.assert_frame_equal(result, expected) + + def test_ignore_empty_rows_when_inferring_header(self): + result = self.read_html(""" + + + + + + + + + +
AB
ab
12
+ """)[0] + + columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], + labels=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) + + tm.assert_frame_equal(result, expected) + def test_multiple_header_rows(self): # Issue #13434 expected_df = DataFrame(data=[("Hillary", 68, "D"), @@ -794,7 +1008,7 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"]] html = expected_df.to_html(index=False) - html_df = read_html(html, )[0] + html_df = self.read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath):