diff --git a/doc/source/io.rst b/doc/source/io.rst
index da0444a8b8df9..113afa32d182e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1959,6 +1959,35 @@ Specify an HTML attribute
dfs2 = read_html(url, attrs={'class': 'sortable'})
print(np.array_equal(dfs1[0], dfs2[0])) # Should be True
+Specify values that should be converted to NaN
+
+.. code-block:: python
+
+ dfs = read_html(url, na_values=['No Acquirer'])
+
+.. versionadded:: 0.19
+
+Specify whether to keep the default set of NaN values
+
+.. code-block:: python
+
+ dfs = read_html(url, keep_default_na=False)
+
+.. versionadded:: 0.19
+
+Specify converters for columns. This is useful for numerical text data that has
+leading zeros. By default, columns that are numerical are cast to numeric
+types and the leading zeros are lost. To avoid this, we can convert these
+columns to strings.
+
+.. code-block:: python
+
+ url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code'
+ dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC':
+ str})
+
+.. versionadded:: 0.19
+
Use some combination of the above
.. code-block:: python
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
index 657de7ec26efc..351b0ba9b2906 100644
--- a/doc/source/whatsnew/v0.19.0.txt
+++ b/doc/source/whatsnew/v0.19.0.txt
@@ -207,6 +207,8 @@ Other enhancements
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)
+- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, and ``keep_default_na`` options (:issue:`13461`)
+
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 609642e248eda..79f0f326c4dd7 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -611,10 +611,10 @@ def _expand_elements(body):
body[ind] += empty * (lens_max - length)
-def _data_to_frame(data, header, index_col, skiprows,
- parse_dates, tupleize_cols, thousands,
- decimal):
- head, body, foot = data
+def _data_to_frame(**kwargs):
+ head, body, foot = kwargs.pop('data')
+ header = kwargs.pop('header')
+ kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
if head:
body = [head] + body
@@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows,
# fill out elements of body that are "ragged"
_expand_elements(body)
- tp = TextParser(body, header=header, index_col=index_col,
- skiprows=_get_skiprows(skiprows),
- parse_dates=parse_dates, tupleize_cols=tupleize_cols,
- thousands=thousands, decimal=decimal)
+ tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df
@@ -716,9 +713,7 @@ def _validate_flavor(flavor):
return flavor
-def _parse(flavor, io, match, header, index_col, skiprows,
- parse_dates, tupleize_cols, thousands, attrs, encoding,
- decimal):
+def _parse(flavor, io, match, attrs, encoding, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here
@@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
ret = []
for table in tables:
try:
- ret.append(_data_to_frame(data=table,
- header=header,
- index_col=index_col,
- skiprows=skiprows,
- parse_dates=parse_dates,
- tupleize_cols=tupleize_cols,
- thousands=thousands,
- decimal=decimal
- ))
+ ret.append(_data_to_frame(data=table, **kwargs))
except EmptyDataError: # empty table
continue
return ret
@@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows,
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
- decimal='.'):
+ decimal='.', converters=None, na_values=None,
+ keep_default_na=True):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
Parameters
@@ -839,6 +827,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
.. versionadded:: 0.19.0
+ converters : dict, default None
+ Dict of functions for converting values in certain columns. Keys can
+ either be integers or column labels; values are functions that take one
+ input argument, the cell (not column) content, and return the
+ transformed content.
+
+ .. versionadded:: 0.19.0
+
+ na_values : iterable, default None
+ Custom values to be converted to NaN.
+
+ .. versionadded:: 0.19.0
+
+ keep_default_na : bool, default True
+ If na_values are specified and keep_default_na is False, the default NaN
+ values are overridden; otherwise they are appended to.
+
+ .. versionadded:: 0.19.0
+
Returns
-------
dfs : list of DataFrames
@@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
_validate_header_arg(header)
- return _parse(flavor, io, match, header, index_col, skiprows,
- parse_dates, tupleize_cols, thousands, attrs, encoding,
- decimal)
+ return _parse(flavor=flavor, io=io, match=match, header=header,
+ index_col=index_col, skiprows=skiprows,
+ parse_dates=parse_dates, tupleize_cols=tupleize_cols,
+ thousands=thousands, attrs=attrs, encoding=encoding,
+ decimal=decimal, converters=converters, na_values=na_values,
+ keep_default_na=keep_default_na)
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 5a95fe7727df0..7b4e775db9476 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -694,6 +694,72 @@ def test_bool_header_arg(self):
with tm.assertRaises(TypeError):
read_html(self.spam_data, header=arg)
+ def test_converters(self):
+ # GH 13461
+ html_data = """
+
+ a |
+
+
+
+
+ 0.763 |
+
+
+ 0.244 |
+
+
+
"""
+
+ expected_df = DataFrame({'a': ['0.763', '0.244']})
+ html_df = read_html(html_data, converters={'a': str})[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ def test_na_values(self):
+ # GH 13461
+ html_data = """
+
+ a |
+
+
+
+
+ 0.763 |
+
+
+ 0.244 |
+
+
+
"""
+
+ expected_df = DataFrame({'a': [0.763, np.nan]})
+ html_df = read_html(html_data, na_values=[0.244])[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ def test_keep_default_na(self):
+ html_data = """
+
+ a |
+
+
+
+
+ N/A |
+
+
+ NA |
+
+
+
"""
+
+ expected_df = DataFrame({'a': ['N/A', 'NA']})
+ html_df = read_html(html_data, keep_default_na=False)[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
+ expected_df = DataFrame({'a': [np.nan, np.nan]})
+ html_df = read_html(html_data, keep_default_na=True)[0]
+ tm.assert_frame_equal(expected_df, html_df)
+
def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')