From c10635fad6da9c4ff0fa7905b5f0107cb3616bb6 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Mon, 26 Aug 2013 18:02:48 +0100 Subject: [PATCH] Add examples using urllib2 and urllib.request for HTTP Content-Type I've seen many use html5lib completely ignoring any HTTP-layer given character encoding. Would be better to lead them in the right direction. --- README.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.rst b/README.rst index ddcfa672..9e0a0f74 100644 --- a/README.rst +++ b/README.rst @@ -41,6 +41,29 @@ a treebuilder: with open("mydocument.html", "rb") as f: lxml_etree_document = html5lib.parse(f, treebuilder="lxml") +When using with ``urllib2`` (Python 2), the charset from HTTP should be +passed into html5lib as follows: + +.. code-block:: python + + from contextlib import closing + from urllib2 import urlopen + import html5lib + + with closing(urlopen("http://example.com/")) as f: + document = html5lib.parse(f, encoding=f.info().getparam("charset")) + +When using with ``urllib.request`` (Python 3), the charset from HTTP +should be passed into html5lib as follows: + +.. code-block:: python + + from urllib.request import urlopen + import html5lib + + with urlopen("http://example.com/") as f: + document = html5lib.parse(f, encoding=f.info().get_content_charset()) + To have more control over the parser, create a parser object explicitly. For instance, to make the parser raise exceptions on parse errors, use: