CLN: proper parse_dates support

cpcloud · cpcloud · commit bea34eb10fa5 · 2013-10-02T22:03:25.000-04:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -167,6 +167,8 @@ Improvements to existing features
   - Improve support for converting R datasets to pandas objects (more
     informative index for timeseries and numeric, support for factors, dist, and
     high-dimensional arrays).
+  - :func:`~pandas.read_html` now supports the ``parse_dates``,
+    ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`).
 
 API Changes
 ~~~~~~~~~~~
@@ -373,6 +375,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
    ``core/generic.py`` (:issue:`4435`).
  - Refactor cum objects to core/generic.py (:issue:`4435`), note that these have a more numpy-like
    function signature.
+ - :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from
+   bs4/lxml (:issue:`4770`).
 
 .. _release.bug_fixes-0.13.0:
 
@@ -538,6 +542,15 @@ Bug Fixes
   - Make sure series-series boolean comparions are label based (:issue:`4947`)
   - Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`)
   - Tests/fix for multi-index construction of an all-nan frame (:isue:`4078`)
+  - Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring
+    values of tables with commas (:issue:`5029`).
+  - Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable
+    ordering of returned tables (:issue:`4770`, :issue:`5029`).
+  - Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when
+    passed ``index_col=0`` (:issue:`5066`).
+  - Fixed a bug where :func:`~pandas.read_html` was incorrectly inferring the
+    type of headers (:issue:`5048`).
+
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -16,7 +16,7 @@
 from pandas.io.common import _is_url, urlopen, parse_url
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type,
-                           raise_with_traceback, OrderedDict)
+                           raise_with_traceback)
 from pandas.core import common as com
 from pandas import Series
 
@@ -485,8 +485,8 @@ def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
 
         # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table or if we are a table use that
-        query = '//table/*[re:test(text(), %r)]/ancestor-or-self::table'
+        # 2. go up the tree until we find a table
+        query = '//table//*[re:test(text(), %r)]/ancestor::table'
         xpath_expr = u(query) % pattern
 
         # if any table attributes were given build an xpath expression to
@@ -786,9 +786,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
     tupleize_cols : bool, optional
         If ``False`` try to parse multiple header rows into a
-        :class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
-        details. Defaults to ``False`` for backwards compatibility. This is in
-        contrast to other IO functions which default to ``True``.
+        :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
+        ``False``.
 
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
@@ -18,7 +18,8 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow
 
-from pandas import DataFrame, MultiIndex, read_csv, Timestamp, Index
+from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
+                    date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
 from pandas.io.common import URLError, urlopen
 from pandas.io.html import read_html
@@ -565,6 +566,21 @@ def test_different_number_of_rows(self):
         res = self.read_html(out, index_col=0)[0]
         tm.assert_frame_equal(expected, res)
 
+    def test_parse_dates_list(self):
+        df = DataFrame({'date': date_range('1/1/2001', periods=10)})
+        expected = df.to_html()
+        res = read_html(expected, parse_dates=[0], index_col=0)
+        tm.assert_frame_equal(df, res[0])
+
+    def test_parse_dates_combine(self):
+        raw_dates = Series(date_range('1/1/2001', periods=10))
+        df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
+                        'time': raw_dates.map(lambda x: str(x.time()))})
+        res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
+                        index_col=1)
+        newdf = DataFrame({'datetime': raw_dates})
+        tm.assert_frame_equal(newdf, res[0])
+
 
 class TestReadHtmlLxml(unittest.TestCase):
     def setUp(self):