7
7
from __future__ import annotations
8
8
9
9
from collections import abc
10
+ import errno
10
11
import numbers
12
+ import os
11
13
import re
12
14
from re import Pattern
13
15
from typing import (
14
16
TYPE_CHECKING ,
15
17
Literal ,
16
18
cast ,
17
19
)
18
- import warnings
19
20
20
21
from pandas ._libs import lib
21
22
from pandas .compat ._optional import import_optional_dependency
24
25
EmptyDataError ,
25
26
)
26
27
from pandas .util ._decorators import doc
27
- from pandas .util ._exceptions import find_stack_level
28
28
from pandas .util ._validators import check_dtype_backend
29
29
30
30
from pandas .core .dtypes .common import is_list_like
36
36
from pandas .core .shared_docs import _shared_docs
37
37
38
38
from pandas .io .common import (
39
- file_exists ,
40
39
get_handle ,
41
- is_file_like ,
42
- is_fsspec_url ,
43
40
is_url ,
44
41
stringify_path ,
45
42
validate_header_arg ,
@@ -134,21 +131,17 @@ def _read(
134
131
-------
135
132
raw_text : str
136
133
"""
137
- text : str | bytes
138
- if (
139
- is_url (obj )
140
- or hasattr (obj , "read" )
141
- or (isinstance (obj , str ) and file_exists (obj ))
142
- ):
134
+ try :
143
135
with get_handle (
144
136
obj , "r" , encoding = encoding , storage_options = storage_options
145
137
) as handles :
146
- text = handles .handle .read ()
147
- elif isinstance (obj , (str , bytes )):
148
- text = obj
149
- else :
150
- raise TypeError (f"Cannot read object of type '{ type (obj ).__name__ } '" )
151
- return text
138
+ return handles .handle .read ()
139
+ except OSError as err :
140
+ if not is_url (obj ):
141
+ raise FileNotFoundError (
142
+ f"[Errno { errno .ENOENT } ] { os .strerror (errno .ENOENT )} : { obj } "
143
+ ) from err
144
+ raise
152
145
153
146
154
147
class _HtmlFrameParser :
@@ -158,7 +151,7 @@ class _HtmlFrameParser:
158
151
Parameters
159
152
----------
160
153
io : str or file-like
161
- This can be either a string of raw HTML, a valid URL using the HTTP,
154
+ This can be either a string path, a valid URL using the HTTP,
162
155
FTP, or FILE protocols or a file-like object.
163
156
164
157
match : str or regex
@@ -780,36 +773,26 @@ def _build_doc(self):
780
773
from lxml .etree import XMLSyntaxError
781
774
from lxml .html import (
782
775
HTMLParser ,
783
- fromstring ,
784
776
parse ,
785
777
)
786
778
787
779
parser = HTMLParser (recover = True , encoding = self .encoding )
788
780
789
- try :
790
- if is_url (self .io ):
791
- with get_handle (
792
- self .io , "r" , storage_options = self .storage_options
793
- ) as f :
794
- r = parse (f .handle , parser = parser )
795
- else :
796
- # try to parse the input in the simplest way
797
- r = parse (self .io , parser = parser )
781
+ if is_url (self .io ):
782
+ with get_handle (self .io , "r" , storage_options = self .storage_options ) as f :
783
+ r = parse (f .handle , parser = parser )
784
+ else :
785
+ # try to parse the input in the simplest way
798
786
try :
799
- r = r .getroot ()
800
- except AttributeError :
801
- pass
802
- except (UnicodeDecodeError , OSError ) as e :
803
- # if the input is a blob of html goop
804
- if not is_url (self .io ):
805
- r = fromstring (self .io , parser = parser )
806
-
807
- try :
808
- r = r .getroot ()
809
- except AttributeError :
810
- pass
811
- else :
812
- raise e
787
+ r = parse (self .io , parser = parser )
788
+ except OSError as err :
789
+ raise FileNotFoundError (
790
+ f"[Errno { errno .ENOENT } ] { os .strerror (errno .ENOENT )} : { self .io } "
791
+ ) from err
792
+ try :
793
+ r = r .getroot ()
794
+ except AttributeError :
795
+ pass
813
796
else :
814
797
if not hasattr (r , "text_content" ):
815
798
raise XMLSyntaxError ("no text parsed from document" , 0 , 0 , 0 )
@@ -1059,7 +1042,7 @@ def read_html(
1059
1042
io : str, path object, or file-like object
1060
1043
String, path object (implementing ``os.PathLike[str]``), or file-like
1061
1044
object implementing a string ``read()`` function.
1062
- The string can represent a URL or the HTML itself. Note that
1045
+ The string can represent a URL. Note that
1063
1046
lxml only accepts the http, ftp and file url protocols. If you have a
1064
1047
URL that starts with ``'https'`` you might try removing the ``'s'``.
1065
1048
@@ -1227,22 +1210,6 @@ def read_html(
1227
1210
1228
1211
io = stringify_path (io )
1229
1212
1230
- if isinstance (io , str ) and not any (
1231
- [
1232
- is_file_like (io ),
1233
- file_exists (io ),
1234
- is_url (io ),
1235
- is_fsspec_url (io ),
1236
- ]
1237
- ):
1238
- warnings .warn (
1239
- "Passing literal html to 'read_html' is deprecated and "
1240
- "will be removed in a future version. To read from a "
1241
- "literal string, wrap it in a 'StringIO' object." ,
1242
- FutureWarning ,
1243
- stacklevel = find_stack_level (),
1244
- )
1245
-
1246
1213
return _parse (
1247
1214
flavor = flavor ,
1248
1215
io = io ,
0 commit comments