Skip to content

Commit 0f1666d

Browse files
ccronca and jreback
authored and committed
ENH: support decimal argument in read_html pandas-dev#12907
closes pandas-dev#12907

Author: Camilo Cota <[email protected]>
Author: Camilo Cota <[email protected]>

Closes pandas-dev#13272 from camilocot/issue-12907 and squashes the following commits:

0c15e37 [Camilo Cota] Remove bytes in decimal default value
111625f [Camilo Cota] ENH: support decimal argument in read_html pandas-dev#12907
1 parent 4b05055 commit 0f1666d

File tree

3 files changed

+49
-12
lines changed

3 files changed

+49
-12
lines changed

doc/source/whatsnew/v0.18.2.txt

+7-6
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Other enhancements
6666
idx = pd.Index(["a1a2", "b1", "c1"])
6767
idx.str.extractall("[ab](?P<digit>\d)")
6868

69-
- ``Timestamp``s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`)
69+
- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`)
7070

7171
.. ipython:: python
7272

@@ -80,6 +80,7 @@ Other enhancements
8080
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
8181
- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
8282

83+
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
8384

8485
.. _whatsnew_0182.api:
8586

@@ -121,10 +122,10 @@ New Behavior:
121122

122123
.. _whatsnew_0182.api.promote:
123124

124-
``Series`` type promotoion on assignment
125-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
125+
``Series`` type promotion on assignment
126+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
126127

127-
A ``Series`` will now correctly promote its dtype with assignment with incompat values to the current dtype (:issue:`13234`)
128+
A ``Series`` will now correctly promote its dtype on assignment of values that are incompatible with its current dtype (:issue:`13234`)
128129

129130

130131
.. ipython:: python
@@ -213,7 +214,7 @@ Bug Fixes
213214

214215

215216
- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
216-
- Bug in ``.to_records()`` when index name is a unicode string (:issue: `13172`)
217+
- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
217218

218219
- Bug in calling ``.memory_usage()`` on an object which doesn't implement it (:issue:`12924`)
219220

@@ -238,7 +239,7 @@ Bug Fixes
238239

239240

240241

241-
- Bug in ``pd.read_csv()`` with ``engine=='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
242+
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
242243

243244

244245

pandas/io/html.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -612,7 +612,8 @@ def _expand_elements(body):
612612

613613

614614
def _data_to_frame(data, header, index_col, skiprows,
615-
parse_dates, tupleize_cols, thousands):
615+
parse_dates, tupleize_cols, thousands,
616+
decimal):
616617
head, body, foot = data
617618

618619
if head:
@@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows,
630631
tp = TextParser(body, header=header, index_col=index_col,
631632
skiprows=_get_skiprows(skiprows),
632633
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
633-
thousands=thousands)
634+
thousands=thousands, decimal=decimal)
634635
df = tp.read()
635636
return df
636637

@@ -716,7 +717,8 @@ def _validate_flavor(flavor):
716717

717718

718719
def _parse(flavor, io, match, header, index_col, skiprows,
719-
parse_dates, tupleize_cols, thousands, attrs, encoding):
720+
parse_dates, tupleize_cols, thousands, attrs, encoding,
721+
decimal):
720722
flavor = _validate_flavor(flavor)
721723
compiled_match = re.compile(match) # you can pass a compiled regex here
722724

@@ -744,15 +746,18 @@ def _parse(flavor, io, match, header, index_col, skiprows,
744746
skiprows=skiprows,
745747
parse_dates=parse_dates,
746748
tupleize_cols=tupleize_cols,
747-
thousands=thousands))
749+
thousands=thousands,
750+
decimal=decimal
751+
))
748752
except EmptyDataError: # empty table
749753
continue
750754
return ret
751755

752756

753757
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
754758
skiprows=None, attrs=None, parse_dates=False,
755-
tupleize_cols=False, thousands=',', encoding=None):
759+
tupleize_cols=False, thousands=',', encoding=None,
760+
decimal='.'):
756761
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
757762
758763
Parameters
@@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
828833
underlying parser library (e.g., the parser library will try to use
829834
the encoding provided by the document).
830835
836+
decimal : str, default '.'
837+
Character to recognize as decimal point (e.g. use ',' for European
838+
data).
839+
840+
.. versionadded:: 0.18.2
841+
831842
Returns
832843
-------
833844
dfs : list of DataFrames
@@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
871882
'data (you passed a negative value)')
872883
_validate_header_arg(header)
873884
return _parse(flavor, io, match, header, index_col, skiprows,
874-
parse_dates, tupleize_cols, thousands, attrs, encoding)
885+
parse_dates, tupleize_cols, thousands, attrs, encoding,
886+
decimal)

pandas/io/tests/test_html.py

+24
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,30 @@ def test_wikipedia_states_table(self):
664664
result = self.read_html(data, 'Arizona', header=1)[0]
665665
self.assertEqual(result['sq mi'].dtype, np.dtype('float64'))
666666

667+
def test_decimal_rows(self):
668+
669+
# GH 12907
670+
data = StringIO('''<html>
671+
<body>
672+
<table>
673+
<thead>
674+
<tr>
675+
<th>Header</th>
676+
</tr>
677+
</thead>
678+
<tbody>
679+
<tr>
680+
<td>1100#101</td>
681+
</tr>
682+
</tbody>
683+
</table>
684+
</body>
685+
</html>''')
686+
expected = DataFrame(data={'Header': 1100.101}, index=[0])
687+
result = self.read_html(data, decimal='#')[0]
688+
nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64'))
689+
tm.assert_frame_equal(result, expected)
690+
667691
def test_bool_header_arg(self):
668692
# GH 6114
669693
for arg in [True, False]:

0 commit comments

Comments
 (0)