diff --git a/doc/source/io.rst b/doc/source/io.rst index 92747f9906da2..5bf3075f2688e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -938,8 +938,106 @@ Reading HTML Content .. versionadded:: 0.11.1 -The toplevel :func:`~pandas.io.parsers.read_html` function can accept an HTML +The toplevel :func:`~pandas.io.html.read_html` function can accept an HTML string/file/url and will parse HTML tables into list of pandas DataFrames. +Let's look at a few examples. + +Read a URL with no options + +.. ipython:: python + + url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' + dfs = read_html(url) + dfs + +.. note:: + + ``read_html`` returns a ``list`` of ``DataFrame`` objects, even if there is + only a single table contained in the HTML content + +Read a URL and match a table that contains specific text + +.. ipython:: python + + match = 'Metcalf Bank' + df_list = read_html(url, match=match) + len(dfs) + dfs[0] + +Specify a header row (by default ``<th>`` elements are used to form the column +index); if specified, the header row is taken from the data minus the parsed +header elements (``<th>`` elements). + +.. ipython:: python + + dfs = read_html(url, header=0) + len(dfs) + dfs[0] + +Specify an index column + +.. ipython:: python + + dfs = read_html(url, index_col=0) + len(dfs) + dfs[0] + dfs[0].index.name + +Specify a number of rows to skip + +.. ipython:: python + + dfs = read_html(url, skiprows=0) + len(dfs) + dfs[0] + +Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works +as well) + +.. ipython:: python + + dfs = read_html(url, skiprows=range(2)) + len(dfs) + dfs[0] + +Don't infer numeric and date types + +.. ipython:: python + + dfs = read_html(url, infer_types=False) + len(dfs) + dfs[0] + +Specify an HTML attribute + +.. ipython:: python + + dfs = read_html(url) + len(dfs) + dfs[0] + +Use some combination of the above + +.. 
ipython:: python + + dfs = read_html(url, match='Metcalf Bank', index_col=0) + len(dfs) + dfs[0] + +Read in pandas ``to_html`` output (with some loss of floating point precision) + +.. ipython:: python + + df = DataFrame(randn(2, 2)) + s = df.to_html(float_format='{0:.40g}'.format) + dfin = read_html(s, index_col=0) + df + dfin[0] + df.index + df.columns + dfin[0].index + dfin[0].columns + np.allclose(df, dfin[0]) Writing to HTML files @@ -947,9 +1045,134 @@ Writing to HTML files .. _io.html: -DataFrame object has an instance method ``to_html`` which renders the contents -of the DataFrame as an html table. The function arguments are as in the method -``to_string`` described above. +``DataFrame`` objects have an instance method ``to_html`` which renders the +contents of the ``DataFrame`` as an HTML table. The function arguments are as +in the method ``to_string`` described above. + +.. note:: + + Not all of the possible options for ``DataFrame.to_html`` are shown here for + brevity's sake. See :func:`~pandas.DataFrame.to_html` for the full set of + options. + +.. ipython:: python + :suppress: + + def write_html(df, filename, *args, **kwargs): + static = os.path.abspath(os.path.join('source', '_static')) + with open(os.path.join(static, filename + '.html'), 'w') as f: + df.to_html(f, *args, **kwargs) + +.. ipython:: python + + df = DataFrame(randn(2, 2)) + df + print df.to_html() # raw html + +.. ipython:: python + :suppress: + + write_html(df, 'basic') + +HTML: + +.. raw:: html + :file: _static/basic.html + +The ``columns`` argument will limit the columns shown + +.. ipython:: python + + print df.to_html(columns=[0]) + +.. ipython:: python + :suppress: + + write_html(df, 'columns', columns=[0]) + +HTML: + +.. raw:: html + :file: _static/columns.html + +``float_format`` takes a Python callable to control the precision of floating +point values + +.. ipython:: python + + print df.to_html(float_format='{0:.10f}'.format) + +.. 
ipython:: python + :suppress: + + write_html(df, 'float_format', float_format='{0:.10f}'.format) + +HTML: + +.. raw:: html + :file: _static/float_format.html + +``bold_rows`` will make the row labels bold by default, but you can turn that +off + +.. ipython:: python + + print df.to_html(bold_rows=False) + +.. ipython:: python + :suppress: + + write_html(df, 'nobold', bold_rows=False) + +.. raw:: html + :file: _static/nobold.html + +The ``classes`` argument provides the ability to give the resulting HTML +table CSS classes. Note that these classes are *appended* to the existing +``'dataframe'`` class. + +.. ipython:: python + + print df.to_html(classes=['awesome_table_class', 'even_more_awesome_class']) + +Finally, the ``escape`` argument allows you to control whether the +"<", ">" and "&" characters are escaped in the resulting HTML (by default it is +``True``). So to get the HTML without escaped characters pass ``escape=False`` + +.. ipython:: python + + df = DataFrame({'a': list('&<>'), 'b': randn(3)}) + + +.. ipython:: python + :suppress: + + write_html(df, 'escape') + write_html(df, 'noescape', escape=False) + +Escaped: + +.. ipython:: python + + print df.to_html() + +.. raw:: html + :file: _static/escape.html + +Not escaped: + +.. ipython:: python + + print df.to_html(escape=False) + +.. raw:: html + :file: _static/noescape.html + +.. note:: + + Some browsers may not show a difference in the rendering of the previous two + HTML tables. + Clipboard --------- diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index a9fc412a6b8e3..5ff436f6d0d50 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -357,7 +357,7 @@ Replace the '.' with ``nan`` (str -> str) :suppress: from numpy.random import rand, randn - nan = np.nan + from numpy import nan from pandas import DataFrame .. 
ipython:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ea8dee51565ac..3ad8de077f1ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1598,6 +1598,7 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, classes=None, escape=True): """ to_html-specific options + bold_rows : boolean, default True Make the row labels bold in the output classes : str or list or tuple, default None @@ -1605,7 +1606,7 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, escape : boolean, default True Convert the characters <, >, and & to HTML-safe sequences. - Render a DataFrame to an html table. + Render a DataFrame as an HTML table. """ import warnings diff --git a/pandas/io/html.py b/pandas/io/html.py index 915c30ecc3c40..9b2f292d30f47 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -18,7 +18,7 @@ import numpy as np -from pandas import DataFrame, MultiIndex +from pandas import DataFrame, MultiIndex, Index, Series, isnull from pandas.io.parsers import _is_url @@ -398,7 +398,6 @@ def _parse_tables(self, doc, match, attrs): if not tables: raise AssertionError("No tables found matching " "'{0}'".format(match.pattern)) - #import ipdb; ipdb.set_trace() return tables def _setup_build_doc(self): @@ -560,6 +559,17 @@ def _parse_raw_tfoot(self, table): table.xpath(expr)] +def _maybe_convert_index_type(index): + try: + index = index.astype(int) + except (TypeError, ValueError): + if not isinstance(index, MultiIndex): + s = Series(index, name=index.name) + index = Index(s.convert_objects(convert_numeric=True), + name=index.name) + return index + + def _data_to_frame(data, header, index_col, infer_types, skiprows): """Parse a BeautifulSoup table into a DataFrame. 
@@ -620,6 +630,12 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows): raise ValueError('Labels {0} not found when trying to skip' ' rows'.format(it)) + # convert to numbers/dates where possible + # must be sequential since dates trump numbers if both args are given + if infer_types: + df = df.convert_objects(convert_numeric=True) + df = df.convert_objects(convert_dates='coerce') + if header is not None: header_rows = df.iloc[header] @@ -632,11 +648,6 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows): df = df.drop(df.index[header]) - # convert to numbers/dates where possible - # must be sequential since dates trump numbers if both args are given - if infer_types: - df = df.convert_objects(convert_numeric=True) - if index_col is not None: cols = df.columns[index_col] @@ -648,12 +659,16 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows): # drop by default df.set_index(cols, inplace=True) if df.index.nlevels == 1: - if not (df.index.name or df.index.name is None): + if isnull(df.index.name) or not df.index.name: df.index.name = None else: names = [name or None for name in df.index.names] df.index = MultiIndex.from_tuples(df.index.values, names=names) + if infer_types: + df.index = _maybe_convert_index_type(df.index) + df.columns = _maybe_convert_index_type(df.columns) + return df