diff --git a/doc/source/io.rst b/doc/source/io.rst
index 92747f9906da2..5bf3075f2688e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -938,8 +938,106 @@ Reading HTML Content
.. versionadded:: 0.11.1
-The toplevel :func:`~pandas.io.parsers.read_html` function can accept an HTML
+The toplevel :func:`~pandas.io.html.read_html` function can accept an HTML
string/file/url and will parse HTML tables into list of pandas DataFrames.
+Let's look at a few examples.
+
+Read a URL with no options
+
+.. ipython:: python
+
+ url = 'http://www.fdic.gov/bank/individual/failed/banklist.html'
+ dfs = read_html(url)
+ dfs
+
+.. note::
+
+ ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is
+ only a single table contained in the HTML content
+
+Read a URL and match a table that contains specific text
+
+.. ipython:: python
+
+ match = 'Metcalf Bank'
+ dfs = read_html(url, match=match)
+ len(dfs)
+ dfs[0]
+
+Specify a header row (by default ``<th>`` elements are used to form the column
+index); if specified, the header row is taken from the data minus the parsed
+header elements (``<th>`` elements).
+
+.. ipython:: python
+
+ dfs = read_html(url, header=0)
+ len(dfs)
+ dfs[0]
+
+Specify an index column
+
+.. ipython:: python
+
+ dfs = read_html(url, index_col=0)
+ len(dfs)
+ dfs[0]
+ dfs[0].index.name
+
+Specify a number of rows to skip
+
+.. ipython:: python
+
+ dfs = read_html(url, skiprows=0)
+ len(dfs)
+ dfs[0]
+
+Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works
+as well)
+
+.. ipython:: python
+
+ dfs = read_html(url, skiprows=range(2))
+ len(dfs)
+ dfs[0]
+
+Don't infer numeric and date types
+
+.. ipython:: python
+
+ dfs = read_html(url, infer_types=False)
+ len(dfs)
+ dfs[0]
+
+Specify an HTML attribute
+
+.. ipython:: python
+
+ dfs = read_html(url)
+ len(dfs)
+ dfs[0]
+
+Use some combination of the above
+
+.. ipython:: python
+
+ dfs = read_html(url, match='Metcalf Bank', index_col=0)
+ len(dfs)
+ dfs[0]
+
+Read in pandas ``to_html`` output (with some loss of floating point precision)
+
+.. ipython:: python
+
+ df = DataFrame(randn(2, 2))
+ s = df.to_html(float_format='{0:.40g}'.format)
+ dfin = read_html(s, index_col=0)
+ df
+ dfin[0]
+ df.index
+ df.columns
+ dfin[0].index
+ dfin[0].columns
+ np.allclose(df, dfin[0])
Writing to HTML files
@@ -947,9 +1045,134 @@ Writing to HTML files
.. _io.html:
-DataFrame object has an instance method ``to_html`` which renders the contents
-of the DataFrame as an html table. The function arguments are as in the method
-``to_string`` described above.
+``DataFrame`` objects have an instance method ``to_html`` which renders the
+contents of the ``DataFrame`` as an HTML table. The function arguments are as
+in the method ``to_string`` described above.
+
+.. note::
+
+ Not all of the possible options for ``DataFrame.to_html`` are shown here for
+ brevity's sake. See :func:`~pandas.DataFrame.to_html` for the full set of
+ options.
+
+.. ipython:: python
+ :suppress:
+
+ def write_html(df, filename, *args, **kwargs):
+ static = os.path.abspath(os.path.join('source', '_static'))
+ with open(os.path.join(static, filename + '.html'), 'w') as f:
+ df.to_html(f, *args, **kwargs)
+
+.. ipython:: python
+
+ df = DataFrame(randn(2, 2))
+ df
+ print df.to_html() # raw html
+
+.. ipython:: python
+ :suppress:
+
+ write_html(df, 'basic')
+
+HTML:
+
+.. raw:: html
+ :file: _static/basic.html
+
+The ``columns`` argument will limit the columns shown
+
+.. ipython:: python
+
+ print df.to_html(columns=[0])
+
+.. ipython:: python
+ :suppress:
+
+ write_html(df, 'columns', columns=[0])
+
+HTML:
+
+.. raw:: html
+ :file: _static/columns.html
+
+``float_format`` takes a Python callable to control the precision of floating
+point values
+
+.. ipython:: python
+
+ print df.to_html(float_format='{0:.10f}'.format)
+
+.. ipython:: python
+ :suppress:
+
+ write_html(df, 'float_format', float_format='{0:.10f}'.format)
+
+HTML:
+
+.. raw:: html
+ :file: _static/float_format.html
+
+``bold_rows`` will make the row labels bold by default, but you can turn that
+off
+
+.. ipython:: python
+
+ print df.to_html(bold_rows=False)
+
+.. ipython:: python
+ :suppress:
+
+ write_html(df, 'nobold', bold_rows=False)
+
+.. raw:: html
+ :file: _static/nobold.html
+
+The ``classes`` argument provides the ability to give the resulting HTML
+table CSS classes. Note that these classes are *appended* to the existing
+``'dataframe'`` class.
+
+.. ipython:: python
+
+ print df.to_html(classes=['awesome_table_class', 'even_more_awesome_class'])
+
+Finally, the ``escape`` argument allows you to control whether the
+"<", ">" and "&" characters are escaped in the resulting HTML (by default it is
+``True``). So to get the HTML without escaped characters pass ``escape=False``
+
+.. ipython:: python
+
+ df = DataFrame({'a': list('&<>'), 'b': randn(3)})
+
+
+.. ipython:: python
+ :suppress:
+
+ write_html(df, 'escape')
+ write_html(df, 'noescape', escape=False)
+
+Escaped:
+
+.. ipython:: python
+
+ print df.to_html()
+
+.. raw:: html
+ :file: _static/escape.html
+
+Not escaped:
+
+.. ipython:: python
+
+ print df.to_html(escape=False)
+
+.. raw:: html
+ :file: _static/noescape.html
+
+.. note::
+
+ Some browsers may not show a difference in the rendering of the previous two
+ HTML tables.
+
Clipboard
---------
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
index a9fc412a6b8e3..5ff436f6d0d50 100644
--- a/doc/source/missing_data.rst
+++ b/doc/source/missing_data.rst
@@ -357,7 +357,7 @@ Replace the '.' with ``nan`` (str -> str)
:suppress:
from numpy.random import rand, randn
- nan = np.nan
+ from numpy import nan
from pandas import DataFrame
.. ipython:: python
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ea8dee51565ac..3ad8de077f1ea 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1598,6 +1598,7 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None,
classes=None, escape=True):
"""
to_html-specific options
+
bold_rows : boolean, default True
Make the row labels bold in the output
classes : str or list or tuple, default None
@@ -1605,7 +1606,7 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None,
escape : boolean, default True
Convert the characters <, >, and & to HTML-safe sequences.
- Render a DataFrame to an html table.
+ Render a DataFrame as an HTML table.
"""
import warnings
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 915c30ecc3c40..9b2f292d30f47 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -18,7 +18,7 @@
import numpy as np
-from pandas import DataFrame, MultiIndex
+from pandas import DataFrame, MultiIndex, Index, Series, isnull
from pandas.io.parsers import _is_url
@@ -398,7 +398,6 @@ def _parse_tables(self, doc, match, attrs):
if not tables:
raise AssertionError("No tables found matching "
"'{0}'".format(match.pattern))
- #import ipdb; ipdb.set_trace()
return tables
def _setup_build_doc(self):
@@ -560,6 +559,17 @@ def _parse_raw_tfoot(self, table):
table.xpath(expr)]
+# Best-effort numeric conversion of an index's labels.
+#
+# First tries to cast the whole index with ``astype(int)``. If that raises
+# ``TypeError`` or ``ValueError`` and the index is not a ``MultiIndex``, the
+# labels are wrapped in a ``Series``, passed through
+# ``convert_objects(convert_numeric=True)``, and rebuilt as an ``Index`` that
+# keeps the original name. A ``MultiIndex`` that cannot be cast to int is
+# returned unchanged. The (possibly converted) index is always returned.
+def _maybe_convert_index_type(index):
+ try:
+ index = index.astype(int)
+ except (TypeError, ValueError):
+ if not isinstance(index, MultiIndex):
+ s = Series(index, name=index.name)
+ index = Index(s.convert_objects(convert_numeric=True),
+ name=index.name)
+ return index
+
+
def _data_to_frame(data, header, index_col, infer_types, skiprows):
"""Parse a BeautifulSoup table into a DataFrame.
@@ -620,6 +630,12 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows):
raise ValueError('Labels {0} not found when trying to skip'
' rows'.format(it))
+ # convert to numbers/dates where possible
+ # must be sequential since dates trump numbers if both args are given
+ if infer_types:
+ df = df.convert_objects(convert_numeric=True)
+ df = df.convert_objects(convert_dates='coerce')
+
if header is not None:
header_rows = df.iloc[header]
@@ -632,11 +648,6 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows):
df = df.drop(df.index[header])
- # convert to numbers/dates where possible
- # must be sequential since dates trump numbers if both args are given
- if infer_types:
- df = df.convert_objects(convert_numeric=True)
-
if index_col is not None:
cols = df.columns[index_col]
@@ -648,12 +659,16 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows):
# drop by default
df.set_index(cols, inplace=True)
if df.index.nlevels == 1:
- if not (df.index.name or df.index.name is None):
+ if isnull(df.index.name) or not df.index.name:
df.index.name = None
else:
names = [name or None for name in df.index.names]
df.index = MultiIndex.from_tuples(df.index.values, names=names)
+ if infer_types:
+ df.index = _maybe_convert_index_type(df.index)
+ df.columns = _maybe_convert_index_type(df.columns)
+
return df
|