Skip to content

Commit 89983c3

Browse files
committed
Merge pull request #7323 from cpcloud/html-encoding
UNI/HTML/WIP: add encoding argument to read_html
2 parents 87660ef + 341ace6 commit 89983c3

File tree

7 files changed

+144
-55
lines changed

7 files changed

+144
-55
lines changed

doc/source/v0.14.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ Known Issues
5858
Enhancements
5959
~~~~~~~~~~~~
6060
- Tests for basic reading of public S3 buckets now exist (:issue:`7281`).
61+
- ``read_html`` now sports an ``encoding`` argument that is passed to the
62+
underlying parser library. You can use this to read non-ASCII encoded web
63+
pages (:issue:`7323`).
6164

6265
- Support for dateutil timezones, which can now be used in the same way as
6366
pytz timezones across pandas. (:issue:`4688`)

pandas/io/html.py

+32-21
Original file line numberDiff line numberDiff line change
@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
9898
type(skiprows).__name__)
9999

100100

101-
def _read(io):
101+
def _read(obj):
102102
"""Try to read from a url, file or string.
103103
104104
Parameters
105105
----------
106-
io : str, unicode, or file-like
106+
obj : str, unicode, or file-like
107107
108108
Returns
109109
-------
110110
raw_text : str
111111
"""
112-
if _is_url(io):
113-
with urlopen(io) as url:
114-
raw_text = url.read()
115-
elif hasattr(io, 'read'):
116-
raw_text = io.read()
117-
elif os.path.isfile(io):
118-
with open(io) as f:
119-
raw_text = f.read()
120-
elif isinstance(io, string_types):
121-
raw_text = io
112+
if _is_url(obj):
113+
with urlopen(obj) as url:
114+
text = url.read()
115+
elif hasattr(obj, 'read'):
116+
text = obj.read()
117+
elif isinstance(obj, string_types):
118+
text = obj
119+
try:
120+
if os.path.isfile(text):
121+
with open(text, 'rb') as f:
122+
return f.read()
123+
except TypeError:
124+
pass
122125
else:
123-
raise TypeError("Cannot read object of type %r" % type(io).__name__)
124-
return raw_text
126+
raise TypeError("Cannot read object of type %r" % type(obj).__name__)
127+
return text
125128

126129

127130
class _HtmlFrameParser(object):
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
165168
See each method's respective documentation for details on their
166169
functionality.
167170
"""
168-
def __init__(self, io, match, attrs):
171+
def __init__(self, io, match, attrs, encoding):
169172
self.io = io
170173
self.match = match
171174
self.attrs = attrs
175+
self.encoding = encoding
172176

173177
def parse_tables(self):
174178
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -422,7 +426,8 @@ def _setup_build_doc(self):
422426

423427
def _build_doc(self):
424428
from bs4 import BeautifulSoup
425-
return BeautifulSoup(self._setup_build_doc(), features='html5lib')
429+
return BeautifulSoup(self._setup_build_doc(), features='html5lib',
430+
from_encoding=self.encoding)
426431

427432

428433
def _build_xpath_expr(attrs):
@@ -519,7 +524,7 @@ def _build_doc(self):
519524
from lxml.html import parse, fromstring, HTMLParser
520525
from lxml.etree import XMLSyntaxError
521526

522-
parser = HTMLParser(recover=False)
527+
parser = HTMLParser(recover=False, encoding=self.encoding)
523528

524529
try:
525530
# try to parse the input in the simplest way
@@ -689,15 +694,15 @@ def _validate_flavor(flavor):
689694

690695

691696
def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
692-
parse_dates, tupleize_cols, thousands, attrs):
697+
parse_dates, tupleize_cols, thousands, attrs, encoding):
693698
flavor = _validate_flavor(flavor)
694699
compiled_match = re.compile(match) # you can pass a compiled regex here
695700

696701
# hack around python 3 deleting the exception variable
697702
retained = None
698703
for flav in flavor:
699704
parser = _parser_dispatch(flav)
700-
p = parser(io, compiled_match, attrs)
705+
p = parser(io, compiled_match, attrs, encoding)
701706

702707
try:
703708
tables = p.parse_tables()
@@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
715720

716721
def read_html(io, match='.+', flavor=None, header=None, index_col=None,
717722
skiprows=None, infer_types=None, attrs=None, parse_dates=False,
718-
tupleize_cols=False, thousands=','):
723+
tupleize_cols=False, thousands=',', encoding=None):
719724
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
720725
721726
Parameters
@@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
792797
thousands : str, optional
793798
Separator to use to parse thousands. Defaults to ``','``.
794799
800+
encoding : str or None, optional
801+
The encoding used to decode the web page. Defaults to ``None``. ``None``
802+
preserves the previous encoding behavior, which depends on the
803+
underlying parser library (e.g., the parser library will try to use
804+
the encoding provided by the document).
805+
795806
Returns
796807
-------
797808
dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
837848
raise ValueError('cannot skip rows starting from the end of the '
838849
'data (you passed a negative value)')
839850
return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
840-
parse_dates, tupleize_cols, thousands, attrs)
851+
parse_dates, tupleize_cols, thousands, attrs, encoding)
Binary file not shown.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<table border="1" class="dataframe">
2+
<thead>
3+
<tr style="text-align: right;">
4+
<th></th>
5+
<th>0</th>
6+
<th>1</th>
7+
</tr>
8+
</thead>
9+
<tbody>
10+
<tr>
11+
<th>0</th>
12+
<td> 漊煻獌</td>
13+
<td> 漊煻獌</td>
14+
</tr>
15+
<tr>
16+
<th>1</th>
17+
<td> 袟袘觕</td>
18+
<td> 袟袘觕</td>
19+
</tr>
20+
<tr>
21+
<th>2</th>
22+
<td> 埱娵徖</td>
23+
<td> 埱娵徖</td>
24+
</tr>
25+
</tbody>
26+
</table>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
<table border="1" class="dataframe">
2+
<thead>
3+
<tr style="text-align: right;">
4+
<th></th>
5+
<th>0</th>
6+
<th>1</th>
7+
</tr>
8+
</thead>
9+
<tbody>
10+
<tr>
11+
<th>0</th>
12+
<td> Gét</td>
13+
<td> Gét</td>
14+
</tr>
15+
<tr>
16+
<th>1</th>
17+
<td></td>
18+
<td></td>
19+
</tr>
20+
<tr>
21+
<th>2</th>
22+
<td> iech</td>
23+
<td> iech</td>
24+
</tr>
25+
</tbody>
26+
</table>

0 commit comments

Comments
 (0)