diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index ee2761b79b620..1438eb29eff40 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -79,6 +79,7 @@ Other enhancements - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) .. _whatsnew_0182.api: diff --git a/pandas/io/html.py b/pandas/io/html.py index e350a40bfa805..48caaa39dd711 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -612,7 +612,8 @@ def _expand_elements(body): def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands): + parse_dates, tupleize_cols, thousands, + decimal): head, body, foot = data if head: @@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows, tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands) + thousands=thousands, decimal=decimal) df = tp.read() return df @@ -716,7 +717,8 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding): + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -744,7 +746,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, skiprows=skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands)) + thousands=thousands, + decimal=decimal + )) except EmptyDataError: # empty table continue return ret @@ -752,7 +756,8 @@ def _parse(flavor, io, match, header, 
index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, - tupleize_cols=False, thousands=',', encoding=None): + tupleize_cols=False, thousands=',', encoding=None, + decimal='.'): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, underlying parser library (e.g., the parser library will try to use the encoding provided by the document). + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + .. versionadded:: 0.18.2 + Returns ------- dfs : list of DataFrames @@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, 'data (you passed a negative value)') _validate_header_arg(header) return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding) + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 21d0748fb6aba..edf1eeee7e622 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -665,6 +665,28 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64')) + def test_decimal_rows(self): + data = StringIO(''' + + + + + + + + + + + + +
<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>1100#101</td></tr></tbody></table>
+ + ''') + expected = DataFrame(data={'Header': 1100.101}, index=[0]) + result = self.read_html(data, decimal='#')[0] + nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64')) + tm.assert_frame_equal(result, expected) + def test_bool_header_arg(self): # GH 6114 for arg in [True, False]: