Skip to content

Commit afc7b2e

Browse files
committed
Adding `converters` and `na_values` arguments to `read_html` for pandas-dev#13461 and pandas-dev#10534
1 parent cc0a188 commit afc7b2e

File tree

2 files changed

+75
-6
lines changed

2 files changed

+75
-6
lines changed

pandas/io/html.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ def _expand_elements(body):
613613

614614
def _data_to_frame(data, header, index_col, skiprows,
615615
parse_dates, tupleize_cols, thousands,
616-
decimal):
616+
decimal, converters, na_values):
617617
head, body, foot = data
618618

619619
if head:
@@ -631,7 +631,8 @@ def _data_to_frame(data, header, index_col, skiprows,
631631
tp = TextParser(body, header=header, index_col=index_col,
632632
skiprows=_get_skiprows(skiprows),
633633
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
634-
thousands=thousands, decimal=decimal)
634+
thousands=thousands, decimal=decimal,
635+
converters=converters, na_values=na_values)
635636
df = tp.read()
636637
return df
637638

@@ -718,7 +719,7 @@ def _validate_flavor(flavor):
718719

719720
def _parse(flavor, io, match, header, index_col, skiprows,
720721
parse_dates, tupleize_cols, thousands, attrs, encoding,
721-
decimal):
722+
decimal, converters, na_values):
722723
flavor = _validate_flavor(flavor)
723724
compiled_match = re.compile(match) # you can pass a compiled regex here
724725

@@ -747,7 +748,9 @@ def _parse(flavor, io, match, header, index_col, skiprows,
747748
parse_dates=parse_dates,
748749
tupleize_cols=tupleize_cols,
749750
thousands=thousands,
750-
decimal=decimal
751+
decimal=decimal,
752+
converters=converters,
753+
na_values=na_values
751754
))
752755
except EmptyDataError: # empty table
753756
continue
@@ -757,7 +760,7 @@ def _parse(flavor, io, match, header, index_col, skiprows,
757760
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
758761
skiprows=None, attrs=None, parse_dates=False,
759762
tupleize_cols=False, thousands=',', encoding=None,
760-
decimal='.'):
763+
decimal='.', converters=None, na_values=None):
761764
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
762765
763766
Parameters
@@ -839,6 +842,19 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
839842
840843
.. versionadded:: 0.18.2
841844
845+
converters : dict, default None
846+
Dict of functions for converting values in certain columns. Keys can
847+
either be integers or column labels, values are functions that take one
848+
input argument, the cell (not column) content, and return the
849+
transformed content.
850+
851+
.. versionadded:: 0.19.0
852+
853+
na_values : iterable, default None
854+
Custom NA values: additional values to be treated as NA/NaN when parsing.
855+
856+
.. versionadded:: 0.19.0
857+
842858
Returns
843859
-------
844860
dfs : list of DataFrames
@@ -883,4 +899,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
883899
_validate_header_arg(header)
884900
return _parse(flavor, io, match, header, index_col, skiprows,
885901
parse_dates, tupleize_cols, thousands, attrs, encoding,
886-
decimal)
902+
decimal, converters, na_values)

pandas/io/tests/test_html.py

+53
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,59 @@ def test_bool_header_arg(self):
694694
with tm.assertRaises(TypeError):
695695
read_html(self.spam_data, header=arg)
696696

697+
def test_converters(self):
698+
# GH 13461
699+
html_data = """<table>
700+
<thead>
701+
<th>Names</th>
702+
<th>C_l0_g0</th>
703+
<th>C_l0_g1</th>
704+
</tr>
705+
</thead>
706+
<tbody>
707+
<tr>
708+
<th>R_l0_g0</th>
709+
<td> 0.763</td>
710+
<td> 0.233</td>
711+
</tr>
712+
<tr>
713+
<th>R_l0_g1</th>
714+
<td> 0.244</td>
715+
<td> 0.285</td>
716+
</tr>
717+
</tbody>
718+
</table>"""
719+
raw_data = np.array([[u'R_l0_g0', '0.763', 0.233],
720+
[u'R_l0_g1', '0.244', 0.285]], dtype=object)
721+
html_df = read_html(html_data, converters={'C_l0_g0': str})[0]
722+
tm.assert_numpy_array_equal(raw_data, html_df.values)
723+
724+
def test_na_values(self):
725+
# GH 13461
726+
html_data = """<table>
727+
<thead>
728+
<th>Names</th>
729+
<th>C_l0_g0</th>
730+
<th>C_l0_g1</th>
731+
</tr>
732+
</thead>
733+
<tbody>
734+
<tr>
735+
<th>R_l0_g0</th>
736+
<td> 0.763</td>
737+
<td> 0.233</td>
738+
</tr>
739+
<tr>
740+
<th>R_l0_g1</th>
741+
<td> 0.244</td>
742+
<td> 0.285</td>
743+
</tr>
744+
</tbody>
745+
</table>"""
746+
raw_data = np.array([[u'R_l0_g0', 0.763, 0.233],
747+
[u'R_l0_g1', 0.244, np.nan]], dtype=object)
748+
html_df = read_html(html_data, na_values=[0.285])[0]
749+
tm.assert_numpy_array_equal(raw_data, html_df.values)
697750

698751
def _lang_enc(filename):
699752
return os.path.splitext(os.path.basename(filename))[0].split('_')

0 commit comments

Comments
 (0)