Skip to content

Commit 4d3b6c1

Browse files
gte620vjorisvandenbossche
authored and committed
ENH: Adding additional keywords to read_html for #13461 (#13575)
1 parent e357ea1 commit 4d3b6c1

File tree

4 files changed

+131
-24
lines changed

4 files changed

+131
-24
lines changed

doc/source/io.rst

+29
Original file line numberDiff line numberDiff line change
@@ -1959,6 +1959,35 @@ Specify an HTML attribute
19591959
dfs2 = read_html(url, attrs={'class': 'sortable'})
19601960
print(np.array_equal(dfs1[0], dfs2[0])) # Should be True
19611961
1962+
Specify values that should be converted to NaN
1963+
1964+
.. code-block:: python
1965+
1966+
dfs = read_html(url, na_values=['No Acquirer'])
1967+
1968+
.. versionadded:: 0.19
1969+
1970+
Specify whether to keep the default set of NaN values
1971+
1972+
.. code-block:: python
1973+
1974+
dfs = read_html(url, keep_default_na=False)
1975+
1976+
.. versionadded:: 0.19
1977+
1978+
Specify converters for columns. This is useful for numerical text data that has
1979+
leading zeros. By default, columns that are numerical are cast to numeric
1980+
types and the leading zeros are lost. To avoid this, we can convert these
1981+
columns to strings.
1982+
1983+
.. code-block:: python
1984+
1985+
url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code'
1986+
dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC':
1987+
str})
1988+
1989+
.. versionadded:: 0.19
1990+
19621991
Use some combination of the above
19631992

19641993
.. code-block:: python

doc/source/whatsnew/v0.19.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ Other enhancements
293293
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
294294
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)
295295

296+
- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters`` and ``keep_default_na`` options (:issue:`13461`)
297+
296298
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
297299
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
298300

pandas/io/html.py

+34-24
Original file line numberDiff line numberDiff line change
@@ -611,10 +611,10 @@ def _expand_elements(body):
611611
body[ind] += empty * (lens_max - length)
612612

613613

614-
def _data_to_frame(data, header, index_col, skiprows,
615-
parse_dates, tupleize_cols, thousands,
616-
decimal):
617-
head, body, foot = data
614+
def _data_to_frame(**kwargs):
615+
head, body, foot = kwargs.pop('data')
616+
header = kwargs.pop('header')
617+
kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
618618

619619
if head:
620620
body = [head] + body
@@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows,
628628
# fill out elements of body that are "ragged"
629629
_expand_elements(body)
630630

631-
tp = TextParser(body, header=header, index_col=index_col,
632-
skiprows=_get_skiprows(skiprows),
633-
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
634-
thousands=thousands, decimal=decimal)
631+
tp = TextParser(body, header=header, **kwargs)
635632
df = tp.read()
636633
return df
637634

@@ -716,9 +713,7 @@ def _validate_flavor(flavor):
716713
return flavor
717714

718715

719-
def _parse(flavor, io, match, header, index_col, skiprows,
720-
parse_dates, tupleize_cols, thousands, attrs, encoding,
721-
decimal):
716+
def _parse(flavor, io, match, attrs, encoding, **kwargs):
722717
flavor = _validate_flavor(flavor)
723718
compiled_match = re.compile(match) # you can pass a compiled regex here
724719

@@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
740735
ret = []
741736
for table in tables:
742737
try:
743-
ret.append(_data_to_frame(data=table,
744-
header=header,
745-
index_col=index_col,
746-
skiprows=skiprows,
747-
parse_dates=parse_dates,
748-
tupleize_cols=tupleize_cols,
749-
thousands=thousands,
750-
decimal=decimal
751-
))
738+
ret.append(_data_to_frame(data=table, **kwargs))
752739
except EmptyDataError: # empty table
753740
continue
754741
return ret
@@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows,
757744
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
758745
skiprows=None, attrs=None, parse_dates=False,
759746
tupleize_cols=False, thousands=',', encoding=None,
760-
decimal='.'):
747+
decimal='.', converters=None, na_values=None,
748+
keep_default_na=True):
761749
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
762750
763751
Parameters
@@ -839,6 +827,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
839827
840828
.. versionadded:: 0.19.0
841829
830+
converters : dict, default None
831+
Dict of functions for converting values in certain columns. Keys can
832+
either be integers or column labels, values are functions that take one
833+
input argument, the cell (not column) content, and return the
834+
transformed content.
835+
836+
.. versionadded:: 0.19.0
837+
838+
na_values : iterable, default None
839+
Custom NA values: additional values that should be converted to NaN
840+
841+
.. versionadded:: 0.19.0
842+
843+
keep_default_na : bool, default True
844+
If na_values are specified and keep_default_na is False the default NaN
845+
values are overridden, otherwise na_values are appended to them
846+
847+
.. versionadded:: 0.19.0
848+
842849
Returns
843850
-------
844851
dfs : list of DataFrames
@@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
881888
raise ValueError('cannot skip rows starting from the end of the '
882889
'data (you passed a negative value)')
883890
_validate_header_arg(header)
884-
return _parse(flavor, io, match, header, index_col, skiprows,
885-
parse_dates, tupleize_cols, thousands, attrs, encoding,
886-
decimal)
891+
return _parse(flavor=flavor, io=io, match=match, header=header,
892+
index_col=index_col, skiprows=skiprows,
893+
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
894+
thousands=thousands, attrs=attrs, encoding=encoding,
895+
decimal=decimal, converters=converters, na_values=na_values,
896+
keep_default_na=keep_default_na)

pandas/io/tests/test_html.py

+66
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,72 @@ def test_bool_header_arg(self):
694694
with tm.assertRaises(TypeError):
695695
read_html(self.spam_data, header=arg)
696696

697+
def test_converters(self):
698+
# GH 13461
699+
html_data = """<table>
700+
<thead>
701+
<th>a</th>
702+
</tr>
703+
</thead>
704+
<tbody>
705+
<tr>
706+
<td> 0.763</td>
707+
</tr>
708+
<tr>
709+
<td> 0.244</td>
710+
</tr>
711+
</tbody>
712+
</table>"""
713+
714+
expected_df = DataFrame({'a': ['0.763', '0.244']})
715+
html_df = read_html(html_data, converters={'a': str})[0]
716+
tm.assert_frame_equal(expected_df, html_df)
717+
718+
def test_na_values(self):
719+
# GH 13461
720+
html_data = """<table>
721+
<thead>
722+
<th>a</th>
723+
</tr>
724+
</thead>
725+
<tbody>
726+
<tr>
727+
<td> 0.763</td>
728+
</tr>
729+
<tr>
730+
<td> 0.244</td>
731+
</tr>
732+
</tbody>
733+
</table>"""
734+
735+
expected_df = DataFrame({'a': [0.763, np.nan]})
736+
html_df = read_html(html_data, na_values=[0.244])[0]
737+
tm.assert_frame_equal(expected_df, html_df)
738+
739+
def test_keep_default_na(self):
740+
html_data = """<table>
741+
<thead>
742+
<th>a</th>
743+
</tr>
744+
</thead>
745+
<tbody>
746+
<tr>
747+
<td> N/A</td>
748+
</tr>
749+
<tr>
750+
<td> NA</td>
751+
</tr>
752+
</tbody>
753+
</table>"""
754+
755+
expected_df = DataFrame({'a': ['N/A', 'NA']})
756+
html_df = read_html(html_data, keep_default_na=False)[0]
757+
tm.assert_frame_equal(expected_df, html_df)
758+
759+
expected_df = DataFrame({'a': [np.nan, np.nan]})
760+
html_df = read_html(html_data, keep_default_na=True)[0]
761+
tm.assert_frame_equal(expected_df, html_df)
762+
697763

698764
def _lang_enc(filename):
699765
return os.path.splitext(os.path.basename(filename))[0].split('_')

0 commit comments

Comments
 (0)