ENH: support decimal argument in read_html pandas-dev#12907

ccronca · jreback · commit 0f1666d8adfa · 2016-05-26T20:14:16.000-04:00
closes pandas-dev#12907

Author: Camilo Cota <camilocot@gmail.com>
Author: Camilo Cota <ccota@riplife.es>

Closes pandas-dev#13272 from camilocot/issue-12907 and squashes the following commits:

0c15e37 [Camilo Cota] Remove bytes in decimal default value
111625f [Camilo Cota] ENH: support decimal argument in read_html pandas-dev#12907
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -66,7 +66,7 @@ Other enhancements
      idx = pd.Index(["a1a2", "b1", "c1"])
      idx.str.extractall("[ab](?P<digit>\d)")
 
-- ``Timestamp``s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`)
+- ``Timestamp`` objects can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`)
 
   .. ipython:: python
 
@@ -80,6 +80,7 @@ Other enhancements
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
 - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 
+- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
 
 .. _whatsnew_0182.api:
 
@@ -121,10 +122,10 @@ New Behavior:
 
 .. _whatsnew_0182.api.promote:
 
-``Series`` type promotoion on assignment
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``Series`` type promotion on assignment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-A ``Series`` will now correctly promote its dtype with assignment with incompat values to the current dtype (:issue:`13234`)
+A ``Series`` will now correctly promote its dtype on assignment of values that are incompatible with the current dtype (:issue:`13234`)
 
 
 .. ipython:: python
@@ -213,7 +214,7 @@ Bug Fixes
 
 
 - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`)
-- Bug in ``.to_records()`` when index name is a unicode string (:issue: `13172`)
+- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`)
 
 - Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`)
 
@@ -238,7 +239,7 @@ Bug Fixes
 
 
 
-- Bug in ``pd.read_csv()`` with ``engine=='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
 
 
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -612,7 +612,8 @@ def _expand_elements(body):
 
 
 def _data_to_frame(data, header, index_col, skiprows,
-                   parse_dates, tupleize_cols, thousands):
+                   parse_dates, tupleize_cols, thousands,
+                   decimal):
     head, body, foot = data
 
     if head:
@@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows,
     tp = TextParser(body, header=header, index_col=index_col,
                     skiprows=_get_skiprows(skiprows),
                     parse_dates=parse_dates, tupleize_cols=tupleize_cols,
-                    thousands=thousands)
+                    thousands=thousands, decimal=decimal)
     df = tp.read()
     return df
 
@@ -716,7 +717,8 @@ def _validate_flavor(flavor):
 
 
 def _parse(flavor, io, match, header, index_col, skiprows,
-           parse_dates, tupleize_cols, thousands, attrs, encoding):
+           parse_dates, tupleize_cols, thousands, attrs, encoding,
+           decimal):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
@@ -744,15 +746,18 @@ def _parse(flavor, io, match, header, index_col, skiprows,
                                       skiprows=skiprows,
                                       parse_dates=parse_dates,
                                       tupleize_cols=tupleize_cols,
-                                      thousands=thousands))
+                                      thousands=thousands,
+                                      decimal=decimal
+                                      ))
         except EmptyDataError:  # empty table
             continue
     return ret
 
 
 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=',', encoding=None):
+              tupleize_cols=False, thousands=',', encoding=None,
+              decimal='.'):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         underlying parser library (e.g., the parser library will try to use
         the encoding provided by the document).
 
+    decimal : str, default '.'
+        Character to recognize as decimal point (e.g. use ',' for European
+        data).
+
+        .. versionadded:: 0.18.2
+
     Returns
     -------
     dfs : list of DataFrames
@@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
                          'data (you passed a negative value)')
     _validate_header_arg(header)
     return _parse(flavor, io, match, header, index_col, skiprows,
-                  parse_dates, tupleize_cols, thousands, attrs, encoding)
+                  parse_dates, tupleize_cols, thousands, attrs, encoding,
+                  decimal)
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
@@ -664,6 +664,30 @@ def test_wikipedia_states_table(self):
         result = self.read_html(data, 'Arizona', header=1)[0]
         self.assertEqual(result['sq mi'].dtype, np.dtype('float64'))
 
+    def test_decimal_rows(self):
+
+        # GH 12907
+        data = StringIO('''<html>
+            <body>
+             <table>
+                <thead>
+                    <tr>
+                        <th>Header</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1100#101</td>
+                    </tr>
+                </tbody>
+            </table>
+            </body>
+        </html>''')
+        expected = DataFrame(data={'Header': 1100.101}, index=[0])
+        result = self.read_html(data, decimal='#')[0]
+        nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64'))
+        tm.assert_frame_equal(result, expected)
+
     def test_bool_header_arg(self):
         # GH 6114
         for arg in [True, False]: