Skip to content

Commit 2b07e11

Browse files
LiamImNo-Stream
authored and committed
read_html(): rewinding [wip] (pandas-dev#18017)
1 parent 198cbeb commit 2b07e11

File tree

3 files changed

+63
-1
lines changed

3 files changed

+63
-1
lines changed

doc/source/whatsnew/v0.22.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ Indexing
107107
I/O
108108
^^^
109109

110-
-
110+
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
111111
-
112112
-
113113

pandas/io/html.py

+12
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,18 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
742742
try:
743743
tables = p.parse_tables()
744744
except Exception as caught:
745+
# if `io` is an io-like object, check if it's seekable
746+
# and try to rewind it before trying the next parser
747+
if hasattr(io, 'seekable') and io.seekable():
748+
io.seek(0)
749+
elif hasattr(io, 'seekable') and not io.seekable():
750+
# if we couldn't rewind it, let the user know
751+
raise ValueError('The flavor {} failed to parse your input. '
752+
'Since you passed a non-rewindable file '
753+
'object, we can\'t rewind it to try '
754+
'another parser. Try read_html() with a '
755+
'different flavor.'.format(flav))
756+
745757
retained = caught
746758
else:
747759
break

pandas/tests/io/test_html.py

+50
Original file line numberDiff line numberDiff line change
@@ -968,3 +968,53 @@ def test_importcheck_thread_safety():
968968
while helper_thread1.is_alive() or helper_thread2.is_alive():
969969
pass
970970
assert None is helper_thread1.err is helper_thread2.err
971+
972+
973+
def test_parse_failure_unseekable():
    """Check that a parse failure on a non-seekable input raises clearly.

    Issue #17975: when a parser fails on a file-like object that reports
    ``seekable() == False``, ``read_html`` raises a ``ValueError`` telling
    the user the input cannot be rewound, rather than handing the consumed
    stream to another parser.
    """
    _skip_if_no('lxml')
    # This test explicitly requests flavor='bs4' below, so skip (rather than
    # error) when that package is not installed.
    _skip_if_no('bs4')

    class UnseekableStringIO(StringIO):
        # Report the stream as non-seekable; ``seek`` itself is inherited
        # unchanged, which is what lets the test rewind manually below.
        def seekable(self):
            return False

    good = UnseekableStringIO('''
<table><tr><td>spam<br />eggs</td></tr></table>''')
    # Same table, but with an unknown ``<foobr />`` tag in the cell.
    bad = UnseekableStringIO('''
<table><tr><td>spam<foobr />eggs</td></tr></table>''')

    # Well-formed markup parses with the default flavor, and the malformed
    # markup parses when the bs4 flavor is requested explicitly.
    assert read_html(good)
    assert read_html(bad, flavor='bs4')

    # The bs4 run above has read the buffer; rewind by hand so the next
    # call sees the full document again.
    bad.seek(0)

    # With the default flavor the malformed markup is expected to fail, and
    # because the object claims to be non-seekable the failure must surface
    # as the "non-rewindable" ValueError.
    with pytest.raises(ValueError,
                       match='passed a non-rewindable file object'):
        read_html(bad)
995+
996+
def test_parse_failure_rewinds():
    """Check that a seekable input can still be parsed after a failure.

    Issue #17975: the stand-in file object below yields its contents only
    once per rewind, so parsing the malformed document can only succeed if
    ``seek`` is called on it between parser attempts.
    """
    _skip_if_no('lxml')

    class MockFile(object):
        """Minimal file-like object that is exhausted after one ``read``."""

        def __init__(self, data):
            self.data = data
            self.at_end = False

        def read(self, size=None):
            # ``size`` is accepted for interface compatibility but ignored:
            # the whole payload is returned on the first read, and the
            # empty string on every later read until ``seek`` is called.
            if self.at_end:
                return ''
            self.at_end = True
            return self.data

        def seek(self, offset):
            # Any seek, whatever the offset, makes the payload readable again.
            self.at_end = False

        def seekable(self):
            return True

    well_formed = MockFile('<table><tr><td>spam<br />eggs</td></tr></table>')
    malformed = MockFile('<table><tr><td>spam<foobr />eggs</td></tr></table>')

    # Both documents must produce a non-empty result; the malformed one is
    # presumably rejected by the first parser and retried after a rewind.
    for source in (well_formed, malformed):
        assert read_html(source)

0 commit comments

Comments
 (0)