
ENH: Adding additional keywords to read_html for #13461 #13575


Merged
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -207,6 +207,12 @@ Other enhancements
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)

- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``keep_default_na`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``squeeze`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``date_parser`` option (:issue:`13461`)
Member:
Can you combine this in one entry?


Contributor:
do we need an update in the docs themselves? e.g. an example

Contributor Author:
Done: added to io.rst
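
For reference, a minimal sketch of how the new keywords can be used together. The HTML snippet and values are invented for illustration, and it assumes an HTML parser (lxml, or BeautifulSoup4 plus html5lib) is installed:

import pandas as pd

# A made-up single-column table, used only for illustration.
html = """<table>
<thead>
<tr><th>a</th></tr>
</thead>
<tbody>
<tr><td>0.763</td></tr>
<tr><td>0.244</td></tr>
</tbody>
</table>"""

# converters: keep column 'a' as strings instead of parsing it as floats.
df_str = pd.read_html(html, converters={'a': str})[0]

# na_values: additionally treat 0.244 as missing.
df_na = pd.read_html(html, na_values=['0.244'])[0]

# keep_default_na=False: built-in markers such as 'NA' or 'N/A' would be kept
# as literal strings instead of becoming NaN (not exercised by this table).
df_raw = pd.read_html(html, keep_default_na=False)[0]

# squeeze: a single-column table is returned as a Series instead of a DataFrame.
s = pd.read_html(html, squeeze=True)[0]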

- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

70 changes: 46 additions & 24 deletions pandas/io/html.py
@@ -611,10 +611,10 @@ def _expand_elements(body):
body[ind] += empty * (lens_max - length)


def _data_to_frame(data, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands,
decimal):
head, body, foot = data
def _data_to_frame(**kwargs):
head, body, foot = kwargs.pop('data')
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])

if head:
body = [head] + body
@@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows,
# fill out elements of body that are "ragged"
_expand_elements(body)

tp = TextParser(body, header=header, index_col=index_col,
skiprows=_get_skiprows(skiprows),
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, decimal=decimal)
tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df

@@ -716,9 +713,9 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal):
def _parse(flavor, io, match,
attrs, encoding,
**kwargs):
Member:
Can you put those on one line? (not needed for PEP8 to put on multiple lines here)

flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

@@ -740,15 +737,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
ret = []
for table in tables:
try:
ret.append(_data_to_frame(data=table,
header=header,
index_col=index_col,
skiprows=skiprows,
parse_dates=parse_dates,
tupleize_cols=tupleize_cols,
thousands=thousands,
decimal=decimal
))
ret.append(_data_to_frame(data=table, **kwargs))
except EmptyDataError: # empty table
continue
return ret
@@ -757,7 +746,8 @@
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
decimal='.'):
decimal='.', converters=None, na_values=None,
keep_default_na=True, squeeze=False, date_parser=None):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
@@ -839,6 +829,34 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

.. versionadded:: 0.19.0

converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.

.. versionadded:: 0.19.0

na_values : iterable, default None
Custom NA values

.. versionadded:: 0.19.0

keep_default_na : bool, default True
If na_values are specified and keep_default_na is False, the default NaN
values are overridden; otherwise the passed na_values are appended to them.

.. versionadded:: 0.19.0

squeeze : boolean, default False
If the parsed data only contains one column then return a Series

.. versionadded:: 0.19.0

date_parser : function, default None
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
conversion.

.. versionadded:: 0.19.0

Returns
-------
dfs : list of DataFrames
@@ -881,6 +899,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
_validate_header_arg(header)
return _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal)
return _parse(flavor=flavor, io=io, match=match, header=header,
index_col=index_col, skiprows=skiprows,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na, squeeze=squeeze,
date_parser=date_parser)
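
Because the keyword arguments are handed straight to ``TextParser``, ``date_parser`` works together with ``parse_dates`` the same way it does for the CSV reader. A short sketch mirroring the new ``test_date_parser`` test below (table contents invented for illustration):

from datetime import datetime

import pandas as pd
from pandas.io.date_converters import parse_date_time

# Invented table with separate date and time columns.
html = """<table>
<thead>
<tr><th>date</th><th>time</th></tr>
</thead>
<tbody>
<tr><td>2001-01-05</td><td>10:00:00</td></tr>
<tr><td>2001-01-05</td><td>00:00:00</td></tr>
</tbody>
</table>"""

# parse_dates names the combined column and lists the source columns;
# date_parser is then called with those columns to build the datetime values.
df = pd.read_html(html, header=0,
                  parse_dates={'date_time': [0, 1]},
                  date_parser=parse_date_time)[0]

assert df['date_time'][0] == datetime(2001, 1, 5, 10, 0, 0)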
111 changes: 111 additions & 0 deletions pandas/io/tests/test_html.py
@@ -4,6 +4,7 @@
import os
import re
import warnings
from datetime import datetime

try:
from importlib import import_module
@@ -24,6 +25,7 @@
from pandas.io.common import URLError, urlopen, file_path_to_url
from pandas.io.html import read_html
from pandas.parser import CParserError
from pandas.io.date_converters import parse_date_time

import pandas.util.testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf, network
@@ -694,6 +696,115 @@ def test_bool_header_arg(self):
with tm.assertRaises(TypeError):
read_html(self.spam_data, header=arg)

def test_converters(self):
# GH 13461
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': ['0.763', '0.244']})
html_df = read_html(html_data, converters={'a': str})[0]
tm.assert_frame_equal(expected_df, html_df)

def test_na_values(self):
# GH 13461
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': [0.763, np.nan]})
html_df = read_html(html_data, na_values=[0.244])[0]
tm.assert_frame_equal(expected_df, html_df)

def test_keep_default_na(self):
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> N/A</td>
</tr>
<tr>
<td> NA</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': ['N/A', 'NA']})
html_df = read_html(html_data, keep_default_na=False)[0]
tm.assert_frame_equal(expected_df, html_df)

expected_df = DataFrame({'a': [np.nan, np.nan]})
html_df = read_html(html_data, keep_default_na=True)[0]
tm.assert_frame_equal(expected_df, html_df)

def test_squeeze(self):
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
</tbody>
</table>"""

expected_s = Series({0: 0.763}, name='a')
html_s = read_html(html_data, squeeze=True)[0]
tm.assert_series_equal(expected_s, html_s)

def test_date_parser(self):
html_data = """<table>
<thead>
<tr>
<th>date</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td> 2001-01-05</td>
<td> 10:00:00</td>
</tr>
<tr>
<td> 2001-01-05</td>
<td> 00:00:00</td>
</tr>
</tbody>
</table>"""

datecols = {'date_time': [0, 1]}
df = read_html(html_data, header=0,
parse_dates=datecols,
date_parser=parse_date_time)[0]
self.assertIn('date_time', df)
self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0))


def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')