
ENH: Adding additional keywords to read_html for #13461 #13575


Merged
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -207,6 +207,12 @@ Other enhancements
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)

- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``keep_default_na`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``squeeze`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``date_parser`` option (:issue:`13461`)
Member:
Can you combine this in one entry?


Contributor:
do we need an update in the docs themselves? e.g. an example

Contributor Author:
Done: added to io.rst
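
For reference, a minimal sketch of how the new keywords can be used together. The HTML snippet and values are invented for illustration, and it assumes an HTML parser (lxml, or BeautifulSoup4 plus html5lib) is installed:

import pandas as pd

# A made-up single-column table, used only for illustration.
html = """<table>
<thead>
<tr><th>a</th></tr>
</thead>
<tbody>
<tr><td>0.763</td></tr>
<tr><td>0.244</td></tr>
</tbody>
</table>"""

# converters: keep column 'a' as strings instead of parsing it as floats.
df_str = pd.read_html(html, converters={'a': str})[0]

# na_values: additionally treat 0.244 as missing.
df_na = pd.read_html(html, na_values=['0.244'])[0]

# keep_default_na=False: built-in markers such as 'NA' or 'N/A' would be kept
# as literal strings instead of becoming NaN (not exercised by this table).
df_raw = pd.read_html(html, keep_default_na=False)[0]

# squeeze: a single-column table is returned as a Series instead of a DataFrame.
s = pd.read_html(html, squeeze=True)[0]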

- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

70 changes: 46 additions & 24 deletions pandas/io/html.py
@@ -611,10 +611,10 @@ def _expand_elements(body):
body[ind] += empty * (lens_max - length)


def _data_to_frame(data, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands,
decimal):
head, body, foot = data
def _data_to_frame(**kwargs):
head, body, foot = kwargs.pop('data')
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])

if head:
body = [head] + body
@@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows,
# fill out elements of body that are "ragged"
_expand_elements(body)

tp = TextParser(body, header=header, index_col=index_col,
skiprows=_get_skiprows(skiprows),
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, decimal=decimal)
tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df

@@ -716,9 +713,9 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal):
def _parse(flavor, io, match,
attrs, encoding,
**kwargs):
Member:
Can you put those on one line? (not needed for PEP8 to put on multiple lines here)

flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

@@ -740,15 +737,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
ret = []
for table in tables:
try:
ret.append(_data_to_frame(data=table,
header=header,
index_col=index_col,
skiprows=skiprows,
parse_dates=parse_dates,
tupleize_cols=tupleize_cols,
thousands=thousands,
decimal=decimal
))
ret.append(_data_to_frame(data=table, **kwargs))
except EmptyDataError: # empty table
continue
return ret
@@ -757,7 +746,8 @@
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
decimal='.'):
decimal='.', converters=None, na_values=None,
keep_default_na=True, squeeze=False, date_parser=None):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
@@ -839,6 +829,34 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

.. versionadded:: 0.19.0

converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.

.. versionadded:: 0.19.0

na_values : iterable, default None
Custom NA values

.. versionadded:: 0.19.0

keep_default_na : bool, default True
If na_values are specified and keep_default_na is False, the default NaN
values are overridden; otherwise the passed na_values are appended to them.

.. versionadded:: 0.19.0

squeeze : boolean, default False
If the parsed data only contains one column then return a Series

.. versionadded:: 0.19.0

date_parser : function, default None
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
conversion.

.. versionadded:: 0.19.0

Returns
-------
dfs : list of DataFrames
@@ -881,6 +899,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
_validate_header_arg(header)
return _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal)
return _parse(flavor=flavor, io=io, match=match, header=header,
index_col=index_col, skiprows=skiprows,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na, squeeze=squeeze,
date_parser=date_parser)
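
Because the keyword arguments are handed straight to ``TextParser``, ``date_parser`` works together with ``parse_dates`` the same way it does for the CSV reader. A short sketch mirroring the new ``test_date_parser`` test below (table contents invented for illustration):

from datetime import datetime

import pandas as pd
from pandas.io.date_converters import parse_date_time

# Invented table with separate date and time columns.
html = """<table>
<thead>
<tr><th>date</th><th>time</th></tr>
</thead>
<tbody>
<tr><td>2001-01-05</td><td>10:00:00</td></tr>
<tr><td>2001-01-05</td><td>00:00:00</td></tr>
</tbody>
</table>"""

# parse_dates names the combined column and lists the source columns;
# date_parser is then called with those columns to build the datetime values.
df = pd.read_html(html, header=0,
                  parse_dates={'date_time': [0, 1]},
                  date_parser=parse_date_time)[0]

assert df['date_time'][0] == datetime(2001, 1, 5, 10, 0, 0)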
111 changes: 111 additions & 0 deletions pandas/io/tests/test_html.py
@@ -4,6 +4,7 @@
import os
import re
import warnings
from datetime import datetime

try:
from importlib import import_module
@@ -24,6 +25,7 @@
from pandas.io.common import URLError, urlopen, file_path_to_url
from pandas.io.html import read_html
from pandas.parser import CParserError
from pandas.io.date_converters import parse_date_time

import pandas.util.testing as tm
from pandas.util.testing import makeCustomDataframe as mkdf, network
@@ -694,6 +696,115 @@ def test_bool_header_arg(self):
with tm.assertRaises(TypeError):
read_html(self.spam_data, header=arg)

def test_converters(self):
# GH 13461
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': ['0.763', '0.244']})
html_df = read_html(html_data, converters={'a': str})[0]
tm.assert_frame_equal(expected_df, html_df)

def test_na_values(self):
# GH 13461
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': [0.763, np.nan]})
html_df = read_html(html_data, na_values=[0.244])[0]
tm.assert_frame_equal(expected_df, html_df)

def test_keep_default_na(self):
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> N/A</td>
</tr>
<tr>
<td> NA</td>
</tr>
</tbody>
</table>"""

expected_df = DataFrame({'a': ['N/A', 'NA']})
html_df = read_html(html_data, keep_default_na=False)[0]
tm.assert_frame_equal(expected_df, html_df)

expected_df = DataFrame({'a': [np.nan, np.nan]})
html_df = read_html(html_data, keep_default_na=True)[0]
tm.assert_frame_equal(expected_df, html_df)

def test_squeeze(self):
html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
</tbody>
</table>"""

expected_s = Series({0: 0.763}, name='a')
html_s = read_html(html_data, squeeze=True)[0]
tm.assert_series_equal(expected_s, html_s)

def test_date_parser(self):
html_data = """<table>
<thead>
<tr>
<th>date</th>
<th>time</th>
</tr>
</thead>
<tbody>
<tr>
<td> 2001-01-05</td>
<td> 10:00:00</td>
</tr>
<tr>
<td> 2001-01-05</td>
<td> 00:00:00</td>
</tr>
</tbody>
</table>"""

datecols = {'date_time': [0, 1]}
df = read_html(html_data, header=0,
parse_dates=datecols,
date_parser=parse_date_time)[0]
self.assertIn('date_time', df)
self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0))


def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')