diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 1b2033999d67d..d0b8f00150099 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -10,7 +10,7 @@ New features
 
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
 
-.. _whatsnew_0240.enhancements.extension_array_operators
+.. _whatsnew_0240.enhancements.extension_array_operators:
 
 ``ExtensionArray`` operator support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -26,6 +26,46 @@ See the :ref:`ExtensionArray Operator Support
 <extending.extension.operator>` documentation section for details on both
 ways of adding operator support.
 
+.. _whatsnew_0240.enhancements.read_html:
+
+``read_html`` Enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes.
+Now it understands them, treating them as sequences of cells with the same
+value. (:issue:`17054`)
+
+.. ipython:: python
+
+    result = pd.read_html("""
+      <table>
+        <thead>
+          <tr>
+            <th>A</th><th>B</th><th>C</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td colspan="2">1</td><td>2</td>
+          </tr>
+        </tbody>
+      </table>""")
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [13]: result
+    Out[13]:
+    [   A  B   C
+     0  1  2 NaN]
+
+Current Behavior:
+
+.. ipython:: python
+
+    result
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
@@ -40,6 +80,7 @@ Other Enhancements
   <https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
   (:issue:`21627`)
 - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
+- :func:`read_html` copies cell data across the cells spanned by ``colspan`` and ``rowspan`` attributes, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
 -
@@ -329,7 +370,7 @@ MultiIndex
 I/O
 ^^^
 
--
+- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 -
 -
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 8fd876e85889f..45fe3b017e4f6 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -10,8 +10,6 @@
 
 from distutils.version import LooseVersion
 
-import numpy as np
-
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
 from pandas.io.common import _is_url, urlopen, _validate_header_arg
@@ -191,13 +189,14 @@ class _HtmlFrameParser(object):
     -----
     To subclass this class effectively you must override the following methods:
         * :func:`_build_doc`
+        * :func:`_attr_getter`
         * :func:`_text_getter`
         * :func:`_parse_td`
+        * :func:`_parse_thead_tr`
+        * :func:`_parse_tbody_tr`
+        * :func:`_parse_tfoot_tr`
         * :func:`_parse_tables`
-        * :func:`_parse_tr`
-        * :func:`_parse_thead`
-        * :func:`_parse_tbody`
-        * :func:`_parse_tfoot`
+        * :func:`_equals_tag`
     See each method's respective documentation for details on their
     functionality.
     """
@@ -210,35 +209,39 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
         self.displayed_only = displayed_only
 
     def parse_tables(self):
+        """
+        Parse and return all tables from the DOM.
+
+        Returns
+        -------
+        generator of parsed (header, body, footer) tuples, one per table.
+        """
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
-        return (self._build_table(table) for table in tables)
+        return (self._parse_thead_tbody_tfoot(table) for table in tables)
 
-    def _parse_raw_data(self, rows):
-        """Parse the raw data into a list of lists.
+    def _attr_getter(self, obj, attr):
+        """
+        Return the attribute value of an individual DOM node.
 
         Parameters
         ----------
-        rows : iterable of node-like
-            A list of row elements.
-
-        text_getter : callable
-            A callable that gets the text from an individual node. This must be
-            defined by subclasses.
+        obj : node-like
+            A DOM node.
 
-        column_finder : callable
-            A callable that takes a row node as input and returns a list of the
-            column node in that row. This must be defined by subclasses.
+        attr : str or unicode
+            The attribute, such as "colspan"
 
         Returns
         -------
-        data : list of list of strings
+        str or unicode
+            The attribute value.
         """
-        data = [[_remove_whitespace(self._text_getter(col)) for col in
-                 self._parse_td(row)] for row in rows]
-        return data
+        # Both lxml and BeautifulSoup have the same implementation:
+        return obj.get(attr)
 
     def _text_getter(self, obj):
-        """Return the text of an individual DOM node.
+        """
+        Return the text of an individual DOM node.
 
         Parameters
         ----------
@@ -258,161 +261,257 @@ def _parse_td(self, obj):
         Parameters
         ----------
         obj : node-like
+            A DOM <tr> node.
 
         Returns
         -------
-        columns : list of node-like
+        list of node-like
             These are the elements of each row, i.e., the columns.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tables(self, doc, match, attrs):
-        """Return all tables from the parsed DOM.
+    def _parse_thead_tr(self, table):
+        """
+        Return the list of thead row elements from the parsed table element.
 
         Parameters
         ----------
-        doc : tree-like
-            The DOM from which to parse the table element.
-
-        match : str or regular expression
-            The text to search for in the DOM tree.
-
-        attrs : dict
-            A dictionary of table attributes that can be used to disambiguate
-            multiple tables on a page.
-
-        Raises
-        ------
-        ValueError
-            * If `match` does not match any text in the document.
+        table : a table element that contains zero or more thead elements.
 
         Returns
         -------
-        tables : list of node-like
-            A list of <table> elements to be parsed into raw data.
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tr(self, table):
-        """Return the list of row elements from the parsed table element.
+    def _parse_tbody_tr(self, table):
+        """
+        Return the list of tbody row elements from the parsed table element.
+
+        HTML5 table bodies consist of either 0 or more <tbody> elements (which
+        only contain <tr> elements) or 0 or more <tr> elements. This method
+        checks for both structures.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        rows : list of node-like
-            A list row elements of a table, usually <tr> or <th> elements.
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_thead(self, table):
-        """Return the header of a table.
+    def _parse_tfoot_tr(self, table):
+        """
+        Return the list of tfoot row elements from the parsed table element.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        thead : node-like
-            A <thead>...</thead> element.
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tbody(self, table):
-        """Return the list of tbody elements from the parsed table element.
+    def _parse_tables(self, doc, match, attrs):
+        """
+        Return all tables from the parsed DOM.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        doc : the DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError : `match` does not match any text in the document.
 
         Returns
         -------
-        tbodys : list of node-like
-            A list of <tbody>...</tbody> elements
+        list of node-like
+            HTML <table> elements to be parsed into raw data.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tfoot(self, table):
-        """Return the footer of the table if any.
+    def _equals_tag(self, obj, tag):
+        """
+        Return whether an individual DOM node matches a tag
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        obj : node-like
+            A DOM node.
+
+        tag : str
+            Tag name to be checked for equality.
 
         Returns
         -------
-        tfoot : node-like
-            A <tfoot>...</tfoot> element.
+        boolean
+            Whether `obj`'s tag name is `tag`
         """
         raise com.AbstractMethodError(self)
 
     def _build_doc(self):
-        """Return a tree-like object that can be used to iterate over the DOM.
+        """
+        Return a tree-like object that can be used to iterate over the DOM.
 
         Returns
         -------
-        obj : tree-like
+        node-like
+            The DOM from which to parse the table element.
         """
         raise com.AbstractMethodError(self)
 
-    def _build_table(self, table):
-        header = self._parse_raw_thead(table)
-        body = self._parse_raw_tbody(table)
-        footer = self._parse_raw_tfoot(table)
+    def _parse_thead_tbody_tfoot(self, table_html):
+        """
+        Given a table, return parsed header, body, and foot.
+
+        Parameters
+        ----------
+        table_html : node-like
+
+        Returns
+        -------
+        tuple of (header, body, footer), each a list of list-of-text rows.
+
+        Notes
+        -----
+        Header and body are lists-of-lists. Top level list is a list of
+        rows. Each row is a list of str text.
+
+        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+               header, body, and footer, otherwise:
+               - Put all rows into body
+               - Move rows from top of body to header only if
+                 all elements inside row are <th>
+               - Rows are never moved from body to footer; footer rows
+                 come only from _parse_tfoot_tr (i.e. <tfoot>)
+        """
+
+        header_rows = self._parse_thead_tr(table_html)
+        body_rows = self._parse_tbody_tr(table_html)
+        footer_rows = self._parse_tfoot_tr(table_html)
+
+        def row_is_all_th(row):
+            return all(self._equals_tag(t, 'th') for t in
+                       self._parse_td(row))
+
+        if not header_rows:
+            # The table has no <thead>. Move the top all-<th> rows from
+            # body_rows to header_rows. (This is a common case because many
+            # tables in the wild have no <thead> or <tfoot>.)
+            while body_rows and row_is_all_th(body_rows[0]):
+                header_rows.append(body_rows.pop(0))
+
+        header = self._expand_colspan_rowspan(header_rows)
+        body = self._expand_colspan_rowspan(body_rows)
+        footer = self._expand_colspan_rowspan(footer_rows)
+
         return header, body, footer
 
-    def _parse_raw_thead(self, table):
-        thead = self._parse_thead(table)
-        res = []
-        if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = lmap(self._text_getter, self._parse_td(tr))
-                if any(col != '' for col in cols):
-                    res.append(cols)
-        return res
-
-    def _parse_raw_tfoot(self, table):
-        tfoot = self._parse_tfoot(table)
-        res = []
-        if tfoot:
-            res = lmap(self._text_getter, self._parse_td(tfoot[0]))
-        return np.atleast_1d(
-            np.array(res).squeeze()) if res and len(res) == 1 else res
-
-    def _parse_raw_tbody(self, table):
-        tbodies = self._parse_tbody(table)
-
-        raw_data = []
-
-        if tbodies:
-            for tbody in tbodies:
-                raw_data.extend(self._parse_tr(tbody))
-        else:
-            raw_data.extend(self._parse_tr(table))
+    def _expand_colspan_rowspan(self, rows):
+        """
+        Given a list of <tr>s, return a list of text rows.
 
-        return self._parse_raw_data(raw_data)
+        Parameters
+        ----------
+        rows : list of node-like
+            List of <tr>s
+
+        Returns
+        -------
+        list of list
+            Each returned row is a list of str text.
+
+        Notes
+        -----
+        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+        to subsequent cells.
+        """
+
+        all_texts = []  # list of rows, each a list of str
+        remainder = []  # list of (index, text, nrows)
+
+        for tr in rows:
+            texts = []  # the output for this row
+            next_remainder = []
+
+            index = 0
+            tds = self._parse_td(tr)
+            for td in tds:
+                # Append texts from previous rows with rowspan>1 that come
+                # before this <td>
+                while remainder and remainder[0][0] <= index:
+                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text,
+                                               prev_rowspan - 1))
+                    index += 1
+
+                # Append the text from this <td>, colspan times
+                text = _remove_whitespace(self._text_getter(td))
+                rowspan = int(self._attr_getter(td, 'rowspan') or 1)
+                colspan = int(self._attr_getter(td, 'colspan') or 1)
+
+                for _ in range(colspan):
+                    texts.append(text)
+                    if rowspan > 1:
+                        next_remainder.append((index, text, rowspan - 1))
+                    index += 1
+
+            # Append texts from previous rows at the final position
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text,
+                                           prev_rowspan - 1))
+
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        # Append rows that only appear because the previous row had non-1
+        # rowspan
+        while remainder:
+            next_remainder = []
+            texts = []
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text,
+                                           prev_rowspan - 1))
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        return all_texts
 
     def _handle_hidden_tables(self, tbl_list, attr_name):
-        """Returns list of tables, potentially removing hidden elements
+        """
+        Return list of tables, potentially removing hidden elements
 
         Parameters
         ----------
-        tbl_list : list of Tag or list of Element
+        tbl_list : list of node-like
             Type of list elements will vary depending upon parser used
         attr_name : str
             Name of the accessor for retrieving HTML attributes
 
         Returns
         -------
-        list of Tag or list of Element
+        list of node-like
             Return type matches `tbl_list`
         """
         if not self.displayed_only:
@@ -442,27 +541,6 @@ def __init__(self, *args, **kwargs):
         from bs4 import SoupStrainer
         self._strainer = SoupStrainer('table')
 
-    def _text_getter(self, obj):
-        return obj.text
-
-    def _parse_td(self, row):
-        return row.find_all(('td', 'th'))
-
-    def _parse_tr(self, element):
-        return element.find_all('tr')
-
-    def _parse_th(self, element):
-        return element.find_all('th')
-
-    def _parse_thead(self, table):
-        return table.find_all('thead')
-
-    def _parse_tbody(self, table):
-        return table.find_all('tbody')
-
-    def _parse_tfoot(self, table):
-        return table.find_all('tfoot')
-
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
@@ -490,6 +568,27 @@ def _parse_tables(self, doc, match, attrs):
                              .format(patt=match.pattern))
         return result
 
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _equals_tag(self, obj, tag):
+        return obj.name == tag
+
+    def _parse_td(self, row):
+        return row.find_all(('td', 'th'), recursive=False)
+
+    def _parse_thead_tr(self, table):
+        return table.select('thead tr')
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.select('tbody tr')
+        from_root = table.find_all('tr', recursive=False)
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.select('tfoot tr')
+
     def _setup_build_doc(self):
         raw_text = _read(self.io)
         if not raw_text:
@@ -554,10 +653,9 @@ def _text_getter(self, obj):
         return obj.text_content()
 
     def _parse_td(self, row):
-        return row.xpath('.//td|.//th')
-
-    def _parse_tr(self, table):
-        return table.xpath('.//tr')
+        # Look for direct children only: the "row" element here may be a
+        # <thead> standing in for a missing <tr> (see _parse_thead_tr).
+        return row.xpath('./td|./th')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -590,6 +688,9 @@ def _parse_tables(self, doc, match, kwargs):
                              .format(patt=pattern))
         return tables
 
+    def _equals_tag(self, obj, tag):
+        return obj.tag == tag
+
     def _build_doc(self):
         """
         Raises
@@ -637,41 +738,32 @@ def _build_doc(self):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
         return r
 
-    def _parse_tbody(self, table):
-        return table.xpath('.//tbody')
-
-    def _parse_thead(self, table):
-        return table.xpath('.//thead')
-
-    def _parse_tfoot(self, table):
-        return table.xpath('.//tfoot')
-
-    def _parse_raw_thead(self, table):
-        expr = './/thead'
-        thead = table.xpath(expr)
-        res = []
-        if thead:
-            # Grab any directly descending table headers first
-            ths = thead[0].xpath('./th')
-            if ths:
-                cols = [_remove_whitespace(x.text_content()) for x in ths]
-                if any(col != '' for col in cols):
-                    res.append(cols)
-            else:
-                trs = self._parse_tr(thead[0])
+    def _parse_thead_tr(self, table):
+        rows = []
+
+        for thead in table.xpath('.//thead'):
+            rows.extend(thead.xpath('./tr'))
+
+            # HACK: lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
+            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
+            # children as though it's a <tr>.
+            #
+            # Better solution would be to use html5lib.
+            elements_at_root = thead.xpath('./td|./th')
+            if elements_at_root:
+                rows.append(thead)
 
-                for tr in trs:
-                    cols = [_remove_whitespace(x.text_content()) for x in
-                            self._parse_td(tr)]
+        return rows
 
-                    if any(col != '' for col in cols):
-                        res.append(cols)
-        return res
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.xpath('.//tbody//tr')
+        from_root = table.xpath('./tr')
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
 
-    def _parse_raw_tfoot(self, table):
-        expr = './/tfoot//th|//tfoot//td'
-        return [_remove_whitespace(x.text_content()) for x in
-                table.xpath(expr)]
+    def _parse_tfoot_tr(self, table):
+        return table.xpath('.//tfoot//tr')
 
 
 def _expand_elements(body):
@@ -689,13 +781,19 @@ def _data_to_frame(**kwargs):
     header = kwargs.pop('header')
     kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
     if head:
-        rows = lrange(len(head))
         body = head + body
-        if header is None:  # special case when a table has <th> elements
-            header = 0 if rows == [0] else rows
+
+        # Infer header when there is a <thead> or top <th>-only rows
+        if header is None:
+            if len(head) == 1:
+                header = 0
+            else:
+                # ignore all-empty-text rows
+                header = [i for i, row in enumerate(head)
+                          if any(text for text in row)]
 
     if foot:
-        body += [foot]
+        body += foot
 
     # fill out elements of body that are "ragged"
     _expand_elements(body)
@@ -953,7 +1051,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
     This function searches for ``<table>`` elements and only for ``<tr>``
     and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
-    element in the table. ``<td>`` stands for "table data".
+    element in the table. ``<td>`` stands for "table data". This function
+    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+    If the table has a ``<thead>`` element, it is used to construct
+    the header, otherwise the function attempts to find the header within
+    the body (by putting rows with only ``<th>`` elements into the header).
+
+        .. versionadded:: 0.24.0
 
     Similar to :func:`~pandas.read_csv` the `header` argument is applied
     **after** `skiprows` is applied.
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 9c6a8de7ed446..b78c4f27d8c3f 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -15,10 +15,10 @@
                     date_range, Series)
 from pandas.compat import (map, zip, StringIO, BytesIO,
                            is_platform_windows, PY3, reload)
+from pandas.errors import ParserError
 from pandas.io.common import URLError, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
-from pandas._libs.parsers import ParserError
 
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
@@ -129,16 +129,7 @@ def test_banklist(self):
 
         assert_framelist_equal(df1, df2)
 
-    def test_spam_no_types(self):
-
-        # infer_types removed in #10892
-        df1 = self.read_html(self.spam_data, '.*Water.*')
-        df2 = self.read_html(self.spam_data, 'Unit')
-        assert_framelist_equal(df1, df2)
-        assert df1[0].iloc[0, 0] == 'Proximates'
-        assert df1[0].columns[0] == 'Nutrient'
-
-    def test_spam_with_types(self):
+    def test_spam(self):
         df1 = self.read_html(self.spam_data, '.*Water.*')
         df2 = self.read_html(self.spam_data, 'Unit')
         assert_framelist_equal(df1, df2)
@@ -157,7 +148,7 @@ def test_banklist_no_match(self):
             assert isinstance(df, DataFrame)
 
     def test_spam_header(self):
-        df = self.read_html(self.spam_data, '.*Water.*', header=1)[0]
+        df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
         assert df.columns[0] == 'Proximates'
         assert not df.empty
 
@@ -387,32 +378,33 @@ def test_empty_tables(self):
         """
         Make sure that read_html ignores empty tables.
         """
-        data1 = '''<table>
-            <thead>
-                <tr>
-                    <th>A</th>
-                    <th>B</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>1</td>
-                    <td>2</td>
-                </tr>
-            </tbody>
-        </table>'''
-        data2 = data1 + '''<table>
-            <tbody>
-            </tbody>
-        </table>'''
-        res1 = self.read_html(StringIO(data1))
-        res2 = self.read_html(StringIO(data2))
-        assert_framelist_equal(res1, res2)
+        result = self.read_html('''
+            <table>
+                <thead>
+                    <tr>
+                        <th>A</th>
+                        <th>B</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1</td>
+                        <td>2</td>
+                    </tr>
+                </tbody>
+            </table>
+            <table>
+                <tbody>
+                </tbody>
+            </table>
+        ''')
+
+        assert len(result) == 1
 
     def test_multiple_tbody(self):
         # GH-20690
         # Read all tbody tags within a single table.
-        data = '''<table>
+        result = self.read_html('''<table>
             <thead>
                 <tr>
                     <th>A</th>
@@ -431,9 +423,10 @@ def test_multiple_tbody(self):
                     <td>4</td>
                 </tr>
             </tbody>
-        </table>'''
-        expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
-        result = self.read_html(StringIO(data))[0]
+        </table>''')[0]
+
+        expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
+
         tm.assert_frame_equal(result, expected)
 
     def test_header_and_one_column(self):
@@ -441,9 +434,7 @@ def test_header_and_one_column(self):
         Don't fail with bs4 when there is a header and only one column
         as described in issue #9178
         """
-        data = StringIO('''<html>
-            <body>
-             <table>
+        result = self.read_html('''<table>
                 <thead>
                     <tr>
                         <th>Header</th>
@@ -454,11 +445,36 @@ def test_header_and_one_column(self):
                         <td>first</td>
                     </tr>
                 </tbody>
-            </table>
-            </body>
-        </html>''')
+            </table>''')[0]
+
         expected = DataFrame(data={'Header': 'first'}, index=[0])
-        result = self.read_html(data)[0]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_thead_without_tr(self):
+        """
+        Ensure parser adds <tr> within <thead> on malformed HTML.
+        """
+        result = self.read_html('''<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')[0]
+
+        expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
+                             columns=['Country', 'Municipality', 'Year'])
+
         tm.assert_frame_equal(result, expected)
 
     def test_tfoot_read(self):
@@ -484,63 +500,51 @@ def test_tfoot_read(self):
             </tfoot>
         </table>'''
 
+        expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])
+
+        expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
+                              columns=['A', 'B'])
+
         data1 = data_template.format(footer="")
         data2 = data_template.format(
             footer="<tr><td>footA</td><th>footB</th></tr>")
 
-        d1 = {'A': ['bodyA'], 'B': ['bodyB']}
-        d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']}
+        result1 = self.read_html(data1)[0]
+        result2 = self.read_html(data2)[0]
 
-        tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1))
-        tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2))
+        tm.assert_frame_equal(result1, expected1)
+        tm.assert_frame_equal(result2, expected2)
 
-    def test_countries_municipalities(self):
-        # GH5048
-        data1 = StringIO('''<table>
-            <thead>
-                <tr>
-                    <th>Country</th>
-                    <th>Municipality</th>
-                    <th>Year</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>Ukraine</td>
-                    <th>Odessa</th>
-                    <td>1944</td>
-                </tr>
-            </tbody>
-        </table>''')
-        data2 = StringIO('''
-        <table>
-            <tbody>
+    def test_parse_header_of_non_string_column(self):
+        # GH5048: if header is specified explicitly, an int column should be
+        # parsed as int while its header is parsed as str
+        result = self.read_html('''
+            <table>
                 <tr>
-                    <th>Country</th>
-                    <th>Municipality</th>
-                    <th>Year</th>
+                    <td>S</td>
+                    <td>I</td>
                 </tr>
                 <tr>
-                    <td>Ukraine</td>
-                    <th>Odessa</th>
+                    <td>text</td>
                     <td>1944</td>
                 </tr>
-            </tbody>
-        </table>''')
-        res1 = self.read_html(data1)
-        res2 = self.read_html(data2, header=0)
-        assert_framelist_equal(res1, res2)
+            </table>
+        ''', header=0)[0]
+
+        expected = DataFrame([['text', 1944]], columns=('S', 'I'))
+
+        tm.assert_frame_equal(result, expected)
 
     def test_nyse_wsj_commas_table(self, datapath):
         data = datapath('io', 'data', 'nyse_wsj.html')
         df = self.read_html(data, index_col=0, header=0,
                             attrs={'class': 'mdcTable'})[0]
 
-        columns = Index(['Issue(Roll over for charts and headlines)',
-                         'Volume', 'Price', 'Chg', '% Chg'])
+        expected = Index(['Issue(Roll over for charts and headlines)',
+                          'Volume', 'Price', 'Chg', '% Chg'])
         nrows = 100
         assert df.shape[0] == nrows
-        tm.assert_index_equal(df.columns, columns)
+        tm.assert_index_equal(df.columns, expected)
 
     @pytest.mark.slow
     def test_banklist_header(self, datapath):
@@ -592,8 +596,8 @@ def test_gold_canyon(self):
                             attrs={'id': 'table'})[0]
         assert gc in df.to_string()
 
-    def test_different_number_of_rows(self):
-        expected = """<table border="1" class="dataframe">
+    def test_different_number_of_cols(self):
+        expected = self.read_html("""<table>
                         <thead>
                             <tr style="text-align: right;">
                             <th></th>
@@ -622,8 +626,9 @@ def test_different_number_of_rows(self):
                             <td> 0.222</td>
                             </tr>
                         </tbody>
-                    </table>"""
-        out = """<table border="1" class="dataframe">
+                    </table>""", index_col=0)[0]
+
+        result = self.read_html("""<table>
                     <thead>
                         <tr style="text-align: right;">
                         <th></th>
@@ -649,10 +654,151 @@ def test_different_number_of_rows(self):
                         <td> 0.222</td>
                         </tr>
                     </tbody>
-                 </table>"""
-        expected = self.read_html(expected, index_col=0)[0]
-        res = self.read_html(out, index_col=0)[0]
-        tm.assert_frame_equal(expected, res)
+                 </table>""", index_col=0)[0]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_1(self):
+        # GH17054: colspan="1" and rowspan="1" must not duplicate any cell
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th colspan="1">B</th>
+                    <th rowspan="1">C</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                    <td>c</td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_copy_values(self):
+        # GH17054: cells covered by a colspan/rowspan get a copy of its text
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # X x Y Z W
+        # A B b z C
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td colspan="2">X</td>
+                    <td>Y</td>
+                    <td rowspan="2">Z</td>
+                    <td>W</td>
+                </tr>
+                <tr>
+                    <td>A</td>
+                    <td colspan="2">B</td>
+                    <td>C</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
+                             columns=['X', 'X.1', 'Y', 'Z', 'W'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_both_not_1(self):
+        # GH17054: one cell may carry both rowspan and colspan (a 2x3 block)
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B b b C
+        # a b b b D
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td rowspan="2">A</td>
+                    <td rowspan="2" colspan="3">B</td>
+                    <td>C</td>
+                </tr>
+                <tr>
+                    <td>D</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
+                             columns=['A', 'B', 'B.1', 'B.2', 'C'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_at_end_of_row(self):
+        # GH17054: a rowspan in the last column still fills the row below
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B
+        # C b
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td>A</td>
+                    <td rowspan="2">B</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_only_rows(self):
+        # GH17054: rowspan="3" on a lone <tr> still expands to three rows
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td rowspan="3">A</td>
+                    <td rowspan="3">B</td>
+                </tr>
+            </table>
+        """, header=0)[0]  # 1 header row + 2 copied data rows
+
+        expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
+                             columns=['A', 'B'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_header_inferred_from_rows_with_only_th(self):
+        # GH17054: with no <thead>, leading all-<th> rows become the header
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <th>a</th>
+                    <th>b</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """)[0]
+
+        columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+                             labels=[[0, 1], [0, 1]])
+        expected = DataFrame(data=[[1, 2]], columns=columns)
+
+        tm.assert_frame_equal(result, expected)
 
     def test_parse_dates_list(self):
         df = DataFrame({'date': date_range('1/1/2001', periods=10)})
@@ -689,10 +835,26 @@ def test_wikipedia_states_table(self, datapath):
         result = self.read_html(data, 'Arizona', header=1)[0]
         assert result['sq mi'].dtype == np.dtype('float64')
 
-    def test_decimal_rows(self):
+    def test_parser_error_on_empty_header_row(self):
+        with tm.assert_raises_regex(ParserError,
+                                    r"Passed header=\[0,1\] are "
+                                    r"too many rows for this "
+                                    r"multi_index of columns"):
+            self.read_html("""
+                <table>
+                    <thead>
+                        <tr><th></th><th></tr>
+                        <tr><th>A</th><th>B</th></tr>
+                    </thead>
+                    <tbody>
+                        <tr><td>a</td><td>b</td></tr>
+                    </tbody>
+                </table>
+            """, header=[0, 1])  # first <thead> row has no text
 
+    def test_decimal_rows(self):
         # GH 12907
-        data = StringIO('''<html>
+        result = self.read_html('''<html>
             <body>
              <table>
                 <thead>
@@ -707,9 +869,10 @@ def test_decimal_rows(self):
                 </tbody>
             </table>
             </body>
-        </html>''')
+        </html>''', decimal='#')[0]
+
         expected = DataFrame(data={'Header': 1100.101}, index=[0])
-        result = self.read_html(data, decimal='#')[0]
+
         assert result['Header'].dtype == np.dtype('float64')
         tm.assert_frame_equal(result, expected)
 
@@ -717,53 +880,61 @@ def test_bool_header_arg(self):
         # GH 6114
         for arg in [True, False]:
             with pytest.raises(TypeError):
-                read_html(self.spam_data, header=arg)
+                self.read_html(self.spam_data, header=arg)
 
     def test_converters(self):
         # GH 13461
-        html_data = """<table>
-                        <thead>
-                            <th>a</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            <tr>
-                            <td> 0.763</td>
-                            </tr>
-                            <tr>
-                            <td> 0.244</td>
-                            </tr>
-                        </tbody>
-                    </table>"""
+        result = self.read_html(
+            """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                    </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>""",
+            converters={'a': str}
+        )[0]
+
+        expected = DataFrame({'a': ['0.763', '0.244']})
 
-        expected_df = DataFrame({'a': ['0.763', '0.244']})
-        html_df = read_html(html_data, converters={'a': str})[0]
-        tm.assert_frame_equal(expected_df, html_df)
+        tm.assert_frame_equal(result, expected)
 
     def test_na_values(self):
         # GH 13461
-        html_data = """<table>
-                        <thead>
-                            <th>a</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            <tr>
-                            <td> 0.763</td>
-                            </tr>
-                            <tr>
-                            <td> 0.244</td>
-                            </tr>
-                        </tbody>
-                    </table>"""
+        result = self.read_html(
+            """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                   </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>""",
+            na_values=[0.244])[0]
+
+        expected = DataFrame({'a': [0.763, np.nan]})
 
-        expected_df = DataFrame({'a': [0.763, np.nan]})
-        html_df = read_html(html_data, na_values=[0.244])[0]
-        tm.assert_frame_equal(expected_df, html_df)
+        tm.assert_frame_equal(result, expected)
 
     def test_keep_default_na(self):
         html_data = """<table>
                         <thead>
+                            <tr>
                             <th>a</th>
                             </tr>
                         </thead>
@@ -778,13 +949,56 @@ def test_keep_default_na(self):
                     </table>"""
 
         expected_df = DataFrame({'a': ['N/A', 'NA']})
-        html_df = read_html(html_data, keep_default_na=False)[0]
+        html_df = self.read_html(html_data, keep_default_na=False)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
         expected_df = DataFrame({'a': [np.nan, np.nan]})
-        html_df = read_html(html_data, keep_default_na=True)[0]
+        html_df = self.read_html(html_data, keep_default_na=True)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
+    def test_preserve_empty_rows(self):
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                </tr>
+                <tr>
+                    <td></td>
+                    <td></td>
+                </tr>
+            </table>
+        """)[0]  # the last <tr> holds only empty <td> cells
+
+        expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]],
+                             columns=['A', 'B'])  # empty row -> NaN row
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_empty_rows_when_inferring_header(self):
+        result = self.read_html("""
+            <table>
+                <thead>
+                    <tr><th></th><th></tr>
+                    <tr><th>A</th><th>B</th></tr>
+                    <tr><th>a</th><th>b</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>1</td><td>2</td></tr>
+                </tbody>
+            </table>
+        """)[0]  # no header kwarg; the first <thead> row has no text
+
+        columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+                             labels=[[0, 1], [0, 1]])  # 2 levels, not 3
+        expected = DataFrame(data=[[1, 2]], columns=columns)
+
+        tm.assert_frame_equal(result, expected)
+
     def test_multiple_header_rows(self):
         # Issue #13434
         expected_df = DataFrame(data=[("Hillary", 68, "D"),
@@ -794,7 +1008,7 @@ def test_multiple_header_rows(self):
                                ["Name", "Unnamed: 1_level_1",
                                 "Unnamed: 2_level_1"]]
         html = expected_df.to_html(index=False)
-        html_df = read_html(html, )[0]
+        html_df = self.read_html(html, )[0]
         tm.assert_frame_equal(expected_df, html_df)
 
     def test_works_on_valid_markup(self, datapath):