Skip to content

ENH: Adding additional keywords to read_html for #13461 #13575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ Other enhancements
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)

- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`)
- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`)


- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

Expand Down
28 changes: 22 additions & 6 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ def _expand_elements(body):

def _data_to_frame(data, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands,
decimal):
decimal, converters, na_values):
head, body, foot = data

if head:
Expand All @@ -631,7 +631,8 @@ def _data_to_frame(data, header, index_col, skiprows,
tp = TextParser(body, header=header, index_col=index_col,
skiprows=_get_skiprows(skiprows),
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, decimal=decimal)
thousands=thousands, decimal=decimal,
converters=converters, na_values=na_values)
df = tp.read()
return df

Expand Down Expand Up @@ -718,7 +719,7 @@ def _validate_flavor(flavor):

def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal):
decimal, converters, na_values):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

Expand Down Expand Up @@ -747,7 +748,9 @@ def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates=parse_dates,
tupleize_cols=tupleize_cols,
thousands=thousands,
decimal=decimal
decimal=decimal,
converters=converters,
na_values=na_values
))
except EmptyDataError: # empty table
continue
Expand All @@ -757,7 +760,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
decimal='.'):
decimal='.', converters=None, na_values=None):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
Expand Down Expand Up @@ -839,6 +842,19 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

.. versionadded:: 0.18.2

converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.

.. versionadded:: 0.18.2

na_values : iterable, default None
Custom NA values

.. versionadded:: 0.18.2

Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -883,4 +899,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
_validate_header_arg(header)
return _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal)
decimal, converters, na_values)
54 changes: 54 additions & 0 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,60 @@ def test_bool_header_arg(self):
with tm.assertRaises(TypeError):
read_html(self.spam_data, header=arg)

def test_converters(self):
# GH 13461
html_data = """<table>
<thead>
<th>Names</th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
</tr>
</tbody>
</table>"""
raw_data = np.array([[u'R_l0_g0', '0.763', 0.233],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please compare with an expected DataFrame.

[u'R_l0_g1', '0.244', 0.285]], dtype=object)
html_df = read_html(html_data, converters={'C_l0_g0': str})[0]
tm.assert_numpy_array_equal(raw_data, html_df.values)

def test_na_values(self):
    """Check that ``read_html`` honours the ``na_values`` keyword.

    The fixture table contains the value ``0.285`` exactly once; with
    ``na_values=[0.285]`` that cell must come back as ``NaN`` while every
    other cell is parsed unchanged.
    """
    # GH 13461
    # Function-scope import so the test does not depend on which names the
    # module header happens to import.
    from pandas import DataFrame

    # The header row is wrapped in a proper <tr>...</tr> pair; the fixture
    # previously had a closing </tr> with no matching opening tag.
    html_data = """<table>
<thead>
<tr>
<th>Names</th>
<th>C_l0_g0</th>
<th>C_l0_g1</th>
</tr>
</thead>
<tbody>
<tr>
<th>R_l0_g0</th>
<td> 0.763</td>
<td> 0.233</td>
</tr>
<tr>
<th>R_l0_g1</th>
<td> 0.244</td>
<td> 0.285</td>
</tr>
</tbody>
</table>"""

    # Compare against an expected DataFrame (labels and dtypes included)
    # rather than against the raw ``.values`` object array.
    expected = DataFrame({'Names': [u'R_l0_g0', u'R_l0_g1'],
                          'C_l0_g0': [0.763, 0.244],
                          'C_l0_g1': [0.233, np.nan]},
                         columns=['Names', 'C_l0_g0', 'C_l0_g1'])
    result = read_html(html_data, na_values=[0.285])[0]
    tm.assert_frame_equal(expected, result)


def _lang_enc(filename):
return os.path.splitext(os.path.basename(filename))[0].split('_')
Expand Down