diff --git a/RELEASE.rst b/RELEASE.rst index 3940cd6d10b51..1ab2cab84a70a 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -66,6 +66,7 @@ pandas 0.11.1 - ``melt`` now accepts the optional parameters ``var_name`` and ``value_name`` to specify custom column names of the returned DataFrame (GH3649_), thanks @hoechenberger + - ``read_html`` no longer performs hard date conversion **API Changes** diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index f4f0546427ef9..6ff3afeb69581 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -68,6 +68,21 @@ Enhancements - ``pd.read_html()`` can now parse HTML strings, files or urls and return DataFrames, courtesy of @cpcloud. (GH3477_, GH3605_, GH3606_, GH3616_). It works with a *single* parser backend: BeautifulSoup4 + html5lib + - You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so + + .. ipython :: python + + df = DataFrame({'a': range(3), 'b': list('abc')}) + print df + html = df.to_html() + alist = pd.read_html(html, infer_types=True, index_col=0) + print df == alist[0] + + Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and + ``DataFrame.to_html()`` are not inverses. + + - ``pd.read_html()`` no longer performs hard conversion of date strings + (GH3656_). - ``HDFStore`` @@ -211,3 +226,4 @@ on GitHub for a complete list. .. _GH3616: https://github.com/pydata/pandas/issues/3616 .. _GH3605: https://github.com/pydata/pandas/issues/3605 .. _GH3606: https://github.com/pydata/pandas/issues/3606 +.. _GH3656: https://github.com/pydata/pandas/issues/3656 diff --git a/pandas/io/html.py b/pandas/io/html.py index 732bd57bec418..915c30ecc3c40 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -636,7 +636,6 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows): # must be sequential since dates trump numbers if both args are given if infer_types: df = df.convert_objects(convert_numeric=True) - df = df.convert_objects(convert_dates='coerce') if index_col is not None: cols = df.columns[index_col] @@ -722,7 +721,7 @@ def _parse(parser, io, match, flavor, header, index_col, skiprows, infer_types, def read_html(io, match='.+', flavor='html5lib', header=None, index_col=None, - skiprows=None, infer_types=False, attrs=None): + skiprows=None, infer_types=True, attrs=None): r"""Read an HTML table into a DataFrame. Parameters diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 6e2f6ec00d8ac..7ece8f8e07d81 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -2,7 +2,6 @@ import re from cStringIO import StringIO from unittest import TestCase -import collections import numbers from urllib2 import urlopen from contextlib import closing @@ -408,7 +407,7 @@ def try_remove_ws(x): return x df = self.run_read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'}, infer_types=True)[0] + attrs={'id': 'table'})[0] ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) @@ -431,7 +430,9 @@ def try_remove_ws(x): 'Hamilton Bank, NA', 'The Citizens Savings Bank'] dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) - assert_frame_equal(dfnew, gtnew) + converted = dfnew.convert_objects(convert_numeric=True) + assert_frame_equal(converted.convert_objects(convert_dates='coerce'), + gtnew) @slow def test_gold_canyon(self): @@ -487,6 +488,3 @@ def test_lxml_finds_tbody(): url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&' 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') assert get_lxml_elements(url, 'tbody') - - - diff --git a/pandas/util/testing.py b/pandas/util/testing.py index f38fe61d453c2..823d2c81bb72c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -126,13 +126,13 @@ def assert_almost_equal(a, b, check_less_precise = False): return assert_dict_equal(a, b) if isinstance(a, basestring): - assert a == b, "{0} != {1}".format(a, b) + assert a == b, "%s != %s" % (a, b) return True if isiterable(a): np.testing.assert_(isiterable(b)) na, nb = len(a), len(b) - assert na == nb, "{0} != {1}".format(na, nb) + assert na == nb, "%s != %s" % (na, nb) if np.array_equal(a, b): return True @@ -154,8 +154,6 @@ def assert_almost_equal(a, b, check_less_precise = False): if check_less_precise: dtype_a = np.dtype(type(a)) dtype_b = np.dtype(type(b)) - if dtype_a.kind == 'i' and dtype_b == 'i': - pass if dtype_a.kind == 'f' and dtype_b == 'f': if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4: decimal = 3