Skip to content

ENH: support decimal argument in read_html #12907 #13272

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ Other enhancements
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)

- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)

.. _whatsnew_0182.api:

Expand Down
24 changes: 18 additions & 6 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,8 @@ def _expand_elements(body):


def _data_to_frame(data, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands):
parse_dates, tupleize_cols, thousands,
decimal):
head, body, foot = data

if head:
Expand All @@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows,
tp = TextParser(body, header=header, index_col=index_col,
skiprows=_get_skiprows(skiprows),
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands)
thousands=thousands, decimal=decimal)
df = tp.read()
return df

Expand Down Expand Up @@ -716,7 +717,8 @@ def _validate_flavor(flavor):


def _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding):
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal):
flavor = _validate_flavor(flavor)
compiled_match = re.compile(match) # you can pass a compiled regex here

Expand Down Expand Up @@ -744,15 +746,18 @@ def _parse(flavor, io, match, header, index_col, skiprows,
skiprows=skiprows,
parse_dates=parse_dates,
tupleize_cols=tupleize_cols,
thousands=thousands))
thousands=thousands,
decimal=decimal
))
except EmptyDataError: # empty table
continue
return ret


def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None):
tupleize_cols=False, thousands=',', encoding=None,
decimal='.'):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
Expand Down Expand Up @@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
underlying parser library (e.g., the parser library will try to use
the encoding provided by the document).

decimal : str, default '.'
Character to recognize as decimal point (e.g. use ',' for European
data).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a ``versionadded`` tag here:

.. versionadded:: 0.18.2

Returns
-------
dfs : list of DataFrames
Expand Down Expand Up @@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
'data (you passed a negative value)')
_validate_header_arg(header)
return _parse(flavor, io, match, header, index_col, skiprows,
parse_dates, tupleize_cols, thousands, attrs, encoding)
parse_dates, tupleize_cols, thousands, attrs, encoding,
decimal)
22 changes: 22 additions & 0 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,28 @@ def test_wikipedia_states_table(self):
result = self.read_html(data, 'Arizona', header=1)[0]
nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64'))

def test_decimal_rows(self):
# GH 12907
# A single-cell table whose value uses '#' as the decimal separator;
# passing decimal='#' should make the column parse as a float.
data = StringIO('''<html>
<body>
<table>
<thead>
<tr>
<th>Header</th>
</tr>
</thead>
<tbody>
<tr>
<td>1100#101</td>
</tr>
</tbody>
</table>
</body>
</html>''')
# '1100#101' read with decimal='#' is expected to become 1100.101
expected = DataFrame(data={'Header': 1100.101}, index=[0])
result = self.read_html(data, decimal='#')[0]
# Check the dtype explicitly, then compare the whole frame.
nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64'))
tm.assert_frame_equal(result, expected)

def test_bool_header_arg(self):
# GH 6114
for arg in [True, False]:
Expand Down