Skip to content

Commit 111625f

Browse files
Camilo Cota, ccronca
Camilo Cota
authored and committed
ENH: support decimal argument in read_html pandas-dev#12907
1 parent e0a2e3b commit 111625f

File tree

3 files changed

+39
-6
lines changed

3 files changed

+39
-6
lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,7 @@ Other enhancements
7979
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
8080
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
8181

82+
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
8283

8384
.. _whatsnew_0182.api:
8485

pandas/io/html.py

+16-6
Original file line number | Diff line number | Diff line change
@@ -612,7 +612,8 @@ def _expand_elements(body):
612612

613613

614614
def _data_to_frame(data, header, index_col, skiprows,
615-
parse_dates, tupleize_cols, thousands):
615+
parse_dates, tupleize_cols, thousands,
616+
decimal):
616617
head, body, foot = data
617618

618619
if head:
@@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows,
630631
tp = TextParser(body, header=header, index_col=index_col,
631632
skiprows=_get_skiprows(skiprows),
632633
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
633-
thousands=thousands)
634+
thousands=thousands, decimal=decimal)
634635
df = tp.read()
635636
return df
636637

@@ -716,7 +717,8 @@ def _validate_flavor(flavor):
716717

717718

718719
def _parse(flavor, io, match, header, index_col, skiprows,
719-
parse_dates, tupleize_cols, thousands, attrs, encoding):
720+
parse_dates, tupleize_cols, thousands, attrs, encoding,
721+
decimal):
720722
flavor = _validate_flavor(flavor)
721723
compiled_match = re.compile(match) # you can pass a compiled regex here
722724

@@ -744,15 +746,18 @@ def _parse(flavor, io, match, header, index_col, skiprows,
744746
skiprows=skiprows,
745747
parse_dates=parse_dates,
746748
tupleize_cols=tupleize_cols,
747-
thousands=thousands))
749+
thousands=thousands,
750+
decimal=decimal
751+
))
748752
except EmptyDataError: # empty table
749753
continue
750754
return ret
751755

752756

753757
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
754758
skiprows=None, attrs=None, parse_dates=False,
755-
tupleize_cols=False, thousands=',', encoding=None):
759+
tupleize_cols=False, thousands=',', encoding=None,
760+
decimal=b'.'):
756761
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
757762
758763
Parameters
@@ -828,6 +833,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
828833
underlying parser library (e.g., the parser library will try to use
829834
the encoding provided by the document).
830835
836+
decimal : str, default '.'
837+
Character to recognize as decimal point (e.g. use ',' for European
838+
data).
839+
831840
Returns
832841
-------
833842
dfs : list of DataFrames
@@ -871,4 +880,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
871880
'data (you passed a negative value)')
872881
_validate_header_arg(header)
873882
return _parse(flavor, io, match, header, index_col, skiprows,
874-
parse_dates, tupleize_cols, thousands, attrs, encoding)
883+
parse_dates, tupleize_cols, thousands, attrs, encoding,
884+
decimal)

pandas/io/tests/test_html.py

+22
Original file line number | Diff line number | Diff line change
@@ -665,6 +665,28 @@ def test_wikipedia_states_table(self):
665665
result = self.read_html(data, 'Arizona', header=1)[0]
666666
nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64'))
667667

668+
def test_decimal_rows(self):
669+
data = StringIO('''<html>
670+
<body>
671+
<table>
672+
<thead>
673+
<tr>
674+
<th>Header</th>
675+
</tr>
676+
</thead>
677+
<tbody>
678+
<tr>
679+
<td>1100#101</td>
680+
</tr>
681+
</tbody>
682+
</table>
683+
</body>
684+
</html>''')
685+
expected = DataFrame(data={'Header': 1100.101}, index=[0])
686+
result = self.read_html(data, decimal='#')[0]
687+
nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64'))
688+
tm.assert_frame_equal(result, expected)
689+
668690
def test_bool_header_arg(self):
669691
# GH 6114
670692
for arg in [True, False]:

0 commit comments

Comments
 (0)