Skip to content

ENH: Adding additional keywords to read_html for #13461 #13575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
29 changes: 29 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1959,6 +1959,35 @@ Specify an HTML attribute
dfs2 = read_html(url, attrs={'class': 'sortable'})
print(np.array_equal(dfs1[0], dfs2[0])) # Should be True

Specify values that should be converted to NaN

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a ``versionadded`` tag for each of these?

.. code-block:: python

dfs = read_html(url, na_values=['No Acquirer'])

.. versionadded:: 0.19

Specify whether to keep the default set of NaN values

.. code-block:: python

dfs = read_html(url, keep_default_na=False)

.. versionadded:: 0.19

Specify converters for columns. This is useful for numerical text data that has
leading zeros. By default columns that are numerical are cast to numeric
types and the leading zeros are lost. To avoid this, we can convert these
columns to strings.

.. code-block:: python

url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code'
dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC':
str})

.. versionadded:: 0.19

Use some combination of the above

.. code-block:: python
Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ Other enhancements
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)

- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, and ``keep_default_na`` options (:issue:`13461`)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need an update in the docs themselves, e.g. an example?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done: added to io.rst

- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

Expand Down
58 changes: 34 additions & 24 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,10 +611,10 @@ def _expand_elements(body):
body[ind] += empty * (lens_max - length)


def _data_to_frame(data, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands,
decimal):
head, body, foot = data
def _data_to_frame(**kwargs):
head, body, foot = kwargs.pop('data')
header = kwargs.pop('header')
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])

if head:
body = [head] + body
Expand All @@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows,
# fill out elements of body that are "ragged"
_expand_elements(body)

tp = TextParser(body, header=header, index_col=index_col,
skiprows=_get_skiprows(skiprows),
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, decimal=decimal)
tp = TextParser(body, header=header, **kwargs)
df = tp.read()
return df

Expand Down Expand Up @@ -716,9 +713,7 @@ def _validate_flavor(flavor):
return flavor


def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal):
def _parse(flavor, io, match, attrs, encoding, **kwargs):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

Expand All @@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
ret = []
for table in tables:
try:
ret.append(_data_to_frame(data=table,
header=header,
index_col=index_col,
skiprows=skiprows,
parse_dates=parse_dates,
tupleize_cols=tupleize_cols,
thousands=thousands,
decimal=decimal
))
ret.append(_data_to_frame(data=table, **kwargs))
except EmptyDataError: # empty table
continue
return ret
Expand All @@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows,
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
decimal='.'):
decimal='.', converters=None, na_values=None,
keep_default_na=True):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
Expand Down Expand Up @@ -839,6 +827,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

.. versionadded:: 0.19.0

converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.

.. versionadded:: 0.19.0

na_values : iterable, default None
Custom NA values

.. versionadded:: 0.19.0

keep_default_na : bool, default True
If na_values are specified and keep_default_na is False, the default NaN
values are overridden; otherwise the na_values are appended to the defaults.

.. versionadded:: 0.19.0

Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
raise ValueError('cannot skip rows starting from the end of the '
'data (you passed a negative value)')
_validate_header_arg(header)
return _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal)
return _parse(flavor=flavor, io=io, match=match, header=header,
index_col=index_col, skiprows=skiprows,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na)
66 changes: 66 additions & 0 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,72 @@ def test_bool_header_arg(self):
with tm.assertRaises(TypeError):
read_html(self.spam_data, header=arg)

def test_converters(self):
    """Check that ``read_html`` honours the ``converters`` keyword."""
    # GH 13461
    # The header cell sits inside a complete <tr>...</tr> row; the earlier
    # fixture closed a row in <thead> that had never been opened.
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

    # With ``str`` as the converter for column 'a', the cells are expected
    # to come back as text rather than as floats.
    expected_df = DataFrame({'a': ['0.763', '0.244']})
    html_df = read_html(html_data, converters={'a': str})[0]
    tm.assert_frame_equal(expected_df, html_df)

def test_na_values(self):
    """Check that ``read_html`` honours the ``na_values`` keyword."""
    # GH 13461
    # The header cell sits inside a complete <tr>...</tr> row; the earlier
    # fixture closed a row in <thead> that had never been opened.
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> 0.763</td>
</tr>
<tr>
<td> 0.244</td>
</tr>
</tbody>
</table>"""

    # 0.244 is declared as a missing-value marker, so the second row is
    # expected to be NaN while the first keeps its numeric value.
    expected_df = DataFrame({'a': [0.763, np.nan]})
    html_df = read_html(html_data, na_values=[0.244])[0]
    tm.assert_frame_equal(expected_df, html_df)

def test_keep_default_na(self):
    """Check that ``read_html`` honours the ``keep_default_na`` keyword."""
    # GH 13461
    # The header cell sits inside a complete <tr>...</tr> row; the earlier
    # fixture closed a row in <thead> that had never been opened.
    html_data = """<table>
<thead>
<tr>
<th>a</th>
</tr>
</thead>
<tbody>
<tr>
<td> N/A</td>
</tr>
<tr>
<td> NA</td>
</tr>
</tbody>
</table>"""

    # Default NaN markers switched off: the cells are expected to stay
    # as the literal strings.
    expected_df = DataFrame({'a': ['N/A', 'NA']})
    html_df = read_html(html_data, keep_default_na=False)[0]
    tm.assert_frame_equal(expected_df, html_df)

    # Default NaN markers kept: both cells are expected to become NaN.
    expected_df = DataFrame({'a': [np.nan, np.nan]})
    html_df = read_html(html_data, keep_default_na=True)[0]
    tm.assert_frame_equal(expected_df, html_df)


def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')
Expand Down