Skip to content

Commit bea34eb

Browse files
committed
CLN: proper parse_dates support
1 parent e22fe1b commit bea34eb

File tree

3 files changed

+35
-7
lines changed

3 files changed

+35
-7
lines changed

doc/source/release.rst

+13
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ Improvements to existing features
167167
- Improve support for converting R datasets to pandas objects (more
168168
informative index for timeseries and numeric, support for factors, dist, and
169169
high-dimensional arrays).
170+
- :func:`~pandas.read_html` now supports the ``parse_dates``,
171+
``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`).
170172

171173
API Changes
172174
~~~~~~~~~~~
@@ -373,6 +375,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
373375
``core/generic.py`` (:issue:`4435`).
374376
- Refactor cum objects to core/generic.py (:issue:`4435`), note that these have a more numpy-like
375377
function signature.
378+
- :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from
379+
bs4/lxml (:issue:`4770`).
376380

377381
.. _release.bug_fixes-0.13.0:
378382

@@ -538,6 +542,15 @@ Bug Fixes
538542
- Make sure series-series boolean comparisons are label based (:issue:`4947`)
539543
- Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`)
540544
- Tests/fix for multi-index construction of an all-nan frame (:issue:`4078`)
545+
- Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring
546+
values of tables with commas (:issue:`5029`)
547+
- Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable
548+
ordering of returned tables (:issue:`4770`, :issue:`5029`).
549+
- Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when
550+
passed ``index_col=0`` (:issue:`5066`).
551+
- Fixed a bug where :func:`~pandas.read_html` was incorrectly inferring the
552+
type of headers (:issue:`5048`).
553+
541554

542555
pandas 0.12.0
543556
-------------

pandas/io/html.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pandas.io.common import _is_url, urlopen, parse_url
1717
from pandas.io.parsers import TextParser
1818
from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type,
19-
raise_with_traceback, OrderedDict)
19+
raise_with_traceback)
2020
from pandas.core import common as com
2121
from pandas import Series
2222

@@ -485,8 +485,8 @@ def _parse_tables(self, doc, match, kwargs):
485485
pattern = match.pattern
486486

487487
# 1. check all descendants for the given pattern and only search tables
488-
# 2. go up the tree until we find a table or if we are a table use that
489-
query = '//table/*[re:test(text(), %r)]/ancestor-or-self::table'
488+
# 2. go up the tree until we find a table
489+
query = '//table//*[re:test(text(), %r)]/ancestor::table'
490490
xpath_expr = u(query) % pattern
491491

492492
# if any table attributes were given build an xpath expression to
@@ -786,9 +786,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
786786
787787
tupleize_cols : bool, optional
788788
If ``False`` try to parse multiple header rows into a
789-
:class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
790-
details. Defaults to ``False`` for backwards compatibility. This is in
791-
contrast to other IO functions which default to ``True``.
789+
:class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
790+
``False``.
792791
793792
thousands : str, optional
794793
Separator to use to parse thousands. Defaults to ``','``.

pandas/io/tests/test_html.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from numpy.random import rand
1919
from numpy.testing.decorators import slow
2020

21-
from pandas import DataFrame, MultiIndex, read_csv, Timestamp, Index
21+
from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
22+
date_range, Series)
2223
from pandas.compat import map, zip, StringIO, string_types
2324
from pandas.io.common import URLError, urlopen
2425
from pandas.io.html import read_html
@@ -565,6 +566,21 @@ def test_different_number_of_rows(self):
565566
res = self.read_html(out, index_col=0)[0]
566567
tm.assert_frame_equal(expected, res)
567568

569+
def test_parse_dates_list(self):
570+
df = DataFrame({'date': date_range('1/1/2001', periods=10)})
571+
expected = df.to_html()
572+
res = read_html(expected, parse_dates=[0], index_col=0)
573+
tm.assert_frame_equal(df, res[0])
574+
575+
def test_parse_dates_combine(self):
576+
raw_dates = Series(date_range('1/1/2001', periods=10))
577+
df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
578+
'time': raw_dates.map(lambda x: str(x.time()))})
579+
res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
580+
index_col=1)
581+
newdf = DataFrame({'datetime': raw_dates})
582+
tm.assert_frame_equal(newdf, res[0])
583+
568584

569585
class TestReadHtmlLxml(unittest.TestCase):
570586
def setUp(self):

0 commit comments

Comments
 (0)