Skip to content

DOC: add doc for reading from DataFrame.to_html #3656

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 21, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ pandas 0.11.1
- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
to specify custom column names of the returned DataFrame (GH3649_),
thanks @hoechenberger
- ``read_html`` no longer performs hard date conversion

**API Changes**

Expand Down
16 changes: 16 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ Enhancements
- ``pd.read_html()`` can now parse HTML strings, files or urls and return
DataFrames, courtesy of @cpcloud. (GH3477_, GH3605_, GH3606_, GH3616_).
It works with a *single* parser backend: BeautifulSoup4 + html5lib
- You can use ``pd.read_html()`` to read back the output of ``DataFrame.to_html()``, like so:

.. ipython:: python

df = DataFrame({'a': range(3), 'b': list('abc')})
print df
html = df.to_html()
alist = pd.read_html(html, infer_types=True, index_col=0)
print df == alist[0]

Note that ``pd.read_html()`` returns a Python ``list`` of DataFrames (``alist``
above), so it is not an exact inverse of ``DataFrame.to_html()``.

- ``pd.read_html()`` no longer performs hard conversion of date strings
(GH3656_).

- ``HDFStore``

Expand Down Expand Up @@ -211,3 +226,4 @@ on GitHub for a complete list.
.. _GH3616: https://github.com/pydata/pandas/issues/3616
.. _GH3605: https://github.com/pydata/pandas/issues/3605
.. _GH3606: https://github.com/pydata/pandas/issues/3606
.. _GH3656: https://github.com/pydata/pandas/issues/3656
3 changes: 1 addition & 2 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,6 @@ def _data_to_frame(data, header, index_col, infer_types, skiprows):
# must be sequential since dates trump numbers if both args are given
if infer_types:
df = df.convert_objects(convert_numeric=True)
df = df.convert_objects(convert_dates='coerce')

if index_col is not None:
cols = df.columns[index_col]
Expand Down Expand Up @@ -722,7 +721,7 @@ def _parse(parser, io, match, flavor, header, index_col, skiprows, infer_types,


def read_html(io, match='.+', flavor='html5lib', header=None, index_col=None,
skiprows=None, infer_types=False, attrs=None):
skiprows=None, infer_types=True, attrs=None):
r"""Read an HTML table into a DataFrame.

Parameters
Expand Down
10 changes: 4 additions & 6 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import re
from cStringIO import StringIO
from unittest import TestCase
import collections
import numbers
from urllib2 import urlopen
from contextlib import closing
Expand Down Expand Up @@ -408,7 +407,7 @@ def try_remove_ws(x):
return x

df = self.run_read_html(self.banklist_data, 'Metcalf',
attrs={'id': 'table'}, infer_types=True)[0]
attrs={'id': 'table'})[0]
ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
converters={'Updated Date': Timestamp,
'Closing Date': Timestamp})
Expand All @@ -431,7 +430,9 @@ def try_remove_ws(x):
'Hamilton Bank, NA', 'The Citizens Savings Bank']
dfnew = df.applymap(try_remove_ws).replace(old, new)
gtnew = ground_truth.applymap(try_remove_ws)
assert_frame_equal(dfnew, gtnew)
converted = dfnew.convert_objects(convert_numeric=True)
assert_frame_equal(converted.convert_objects(convert_dates='coerce'),
gtnew)

@slow
def test_gold_canyon(self):
Expand Down Expand Up @@ -487,6 +488,3 @@ def test_lxml_finds_tbody():
url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&'
'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam')
assert get_lxml_elements(url, 'tbody')



6 changes: 2 additions & 4 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,13 @@ def assert_almost_equal(a, b, check_less_precise = False):
return assert_dict_equal(a, b)

if isinstance(a, basestring):
assert a == b, "{0} != {1}".format(a, b)
assert a == b, "%s != %s" % (a, b)
return True

if isiterable(a):
np.testing.assert_(isiterable(b))
na, nb = len(a), len(b)
assert na == nb, "{0} != {1}".format(na, nb)
assert na == nb, "%s != %s" % (na, nb)

if np.array_equal(a, b):
return True
Expand All @@ -154,8 +154,6 @@ def assert_almost_equal(a, b, check_less_precise = False):
if check_less_precise:
dtype_a = np.dtype(type(a))
dtype_b = np.dtype(type(b))
if dtype_a.kind == 'i' and dtype_b == 'i':
pass
if dtype_a.kind == 'f' and dtype_b == 'f':
if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
decimal = 3
Expand Down