@@ -98,30 +98,33 @@ def _get_skiprows(skiprows):
98
98
type (skiprows ).__name__ )
99
99
100
100
101
def _read(obj):
    """Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like
        Checked in this order: something ``_is_url`` accepts, an object with
        a ``read`` attribute, or a string. A string naming an existing file
        is read from disk; any other string is treated as the text itself.

    Returns
    -------
    raw_text : str or bytes
        Whatever the URL handle or file-like object's ``read()`` returned,
        the bytes of the file on disk (it is opened in binary mode), or
        ``obj`` unchanged when it is a string that is not an existing path.

    Raises
    ------
    TypeError
        If ``obj`` is none of the supported kinds of input.
    """
    if _is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, 'read'):
        text = obj.read()
    elif isinstance(obj, string_types):
        text = obj
        try:
            is_file = os.path.isfile(text)
        except (TypeError, ValueError):
            # Depending on the Python version, probing a string that cannot
            # be a path (e.g. markup with an embedded NUL) raises TypeError
            # or ValueError instead of returning False; either way the
            # string is not a file, so fall through and use it as raw text.
            is_file = False
        if is_file:
            # Binary mode: the bytes are returned undecoded (presumably so
            # the parser can apply the caller's encoding -- confirm).
            with open(text, 'rb') as f:
                return f.read()
    else:
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)
    return text
125
128
126
129
127
130
class _HtmlFrameParser (object ):
@@ -165,10 +168,11 @@ class _HtmlFrameParser(object):
165
168
See each method's respective documentation for details on their
166
169
functionality.
167
170
"""
168
- def __init__ (self , io , match , attrs ):
171
+ def __init__ (self , io , match , attrs , encoding ):
169
172
self .io = io
170
173
self .match = match
171
174
self .attrs = attrs
175
+ self .encoding = encoding
172
176
173
177
def parse_tables (self ):
174
178
tables = self ._parse_tables (self ._build_doc (), self .match , self .attrs )
@@ -422,7 +426,8 @@ def _setup_build_doc(self):
422
426
423
427
def _build_doc(self):
    """Build a BeautifulSoup document from the prepared input.

    Returns
    -------
    BeautifulSoup
        The result of parsing ``self._setup_build_doc()`` with the
        ``html5lib`` feature set, passing ``self.encoding`` as
        ``from_encoding``.
    """
    # Imported lazily, inside the method, exactly as the original does.
    from bs4 import BeautifulSoup

    markup = self._setup_build_doc()
    soup = BeautifulSoup(markup, features='html5lib',
                         from_encoding=self.encoding)
    return soup
426
431
427
432
428
433
def _build_xpath_expr (attrs ):
@@ -519,7 +524,7 @@ def _build_doc(self):
519
524
from lxml .html import parse , fromstring , HTMLParser
520
525
from lxml .etree import XMLSyntaxError
521
526
522
- parser = HTMLParser (recover = False )
527
+ parser = HTMLParser (recover = False , encoding = self . encoding )
523
528
524
529
try :
525
530
# try to parse the input in the simplest way
@@ -689,15 +694,15 @@ def _validate_flavor(flavor):
689
694
690
695
691
696
def _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
692
- parse_dates , tupleize_cols , thousands , attrs ):
697
+ parse_dates , tupleize_cols , thousands , attrs , encoding ):
693
698
flavor = _validate_flavor (flavor )
694
699
compiled_match = re .compile (match ) # you can pass a compiled regex here
695
700
696
701
# hack around python 3 deleting the exception variable
697
702
retained = None
698
703
for flav in flavor :
699
704
parser = _parser_dispatch (flav )
700
- p = parser (io , compiled_match , attrs )
705
+ p = parser (io , compiled_match , attrs , encoding )
701
706
702
707
try :
703
708
tables = p .parse_tables ()
@@ -715,7 +720,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
715
720
716
721
def read_html (io , match = '.+' , flavor = None , header = None , index_col = None ,
717
722
skiprows = None , infer_types = None , attrs = None , parse_dates = False ,
718
- tupleize_cols = False , thousands = ',' ):
723
+ tupleize_cols = False , thousands = ',' , encoding = None ):
719
724
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
720
725
721
726
Parameters
@@ -792,6 +797,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
792
797
thousands : str, optional
793
798
Separator to use to parse thousands. Defaults to ``','``.
794
799
800
+ encoding : str or None, optional
801
+ The encoding used to decode the web page. Defaults to ``None``. ``None``
802
+ preserves the previous encoding behavior, which depends on the
803
+ underlying parser library (e.g., the parser library will try to use
804
+ the encoding provided by the document).
805
+
795
806
Returns
796
807
-------
797
808
dfs : list of DataFrames
@@ -837,4 +848,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
837
848
raise ValueError ('cannot skip rows starting from the end of the '
838
849
'data (you passed a negative value)' )
839
850
return _parse (flavor , io , match , header , index_col , skiprows , infer_types ,
840
- parse_dates , tupleize_cols , thousands , attrs )
851
+ parse_dates , tupleize_cols , thousands , attrs , encoding )
0 commit comments