Skip to content

Commit 44c4168

Browse files
authored
ENH: Add support for storage_options in pd.read_html (#52620)
* Add support for storage_options in pd.read_html * Run pre-commit * Run pre-commit * Replace call to urlopen with _get_filepath_or_buffer, update whatsnew * Replace defined function with lambda func for read_html * Remove unused function * Change expected type for 'obj' param in _read() Signed-off-by: serhatgktp <[email protected]> * Remove unnecessary comment * Add missing space to fix workflow error Signed-off-by: serhatgktp <[email protected]> * Delete trailing whitespaces --------- Signed-off-by: serhatgktp <[email protected]>
1 parent 3713834 commit 44c4168

File tree

4 files changed

+101
-8
lines changed

4 files changed

+101
-8
lines changed

doc/source/user_guide/io.rst

+41
Original file line numberDiff line numberDiff line change
@@ -2490,6 +2490,47 @@ Read a URL with no options:
24902490

24912491
The data from the above URL changes every Monday so the resulting data above may be slightly different.
24922492

2493+
Read a URL while passing headers alongside the HTTP request:
2494+
2495+
.. code-block:: ipython
2496+
2497+
In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector
2498+
In [323]: pd.read_html(url)
2499+
Out[323]:
2500+
[ 0 1
2501+
0 Remote Socket: 51.15.105.256:51760
2502+
1 Protocol Version: HTTP/1.1
2503+
2 Request Method: GET
2504+
3 Request URI: /notes/request/
2505+
4 Request Query: NaN,
2506+
0 Accept-Encoding: identity
2507+
1 Host: www.sump.org
2508+
2 User-Agent: Python-urllib/3.8
2509+
3 Connection: close]
2510+
In [324]: headers = {
2511+
In [325]: 'User-Agent':'Mozilla Firefox v14.0',
2512+
In [326]: 'Accept':'application/json',
2513+
In [327]: 'Connection':'keep-alive',
2514+
In [328]: 'Auth':'Bearer 2*/f3+fe68df*4'
2515+
In [329]: }
2516+
In [340]: pd.read_html(url, storage_options=headers)
2517+
Out[340]:
2518+
[ 0 1
2519+
0 Remote Socket: 51.15.105.256:51760
2520+
1 Protocol Version: HTTP/1.1
2521+
2 Request Method: GET
2522+
3 Request URI: /notes/request/
2523+
4 Request Query: NaN,
2524+
0 User-Agent: Mozilla Firefox v14.0
2525+
1 AcceptEncoding: gzip, deflate, br
2526+
2 Accept: application/json
2527+
3 Connection: keep-alive
2528+
4 Auth: Bearer 2*/f3+fe68df*4]
2529+
2530+
.. note::
2531+
2532+
We see above that the headers we passed are reflected in the HTTP request.
2533+
24932534
Read in the content of the file from the above URL and pass it to ``read_html``
24942535
as a string:
24952536

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Other enhancements
141141
- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
142142
- :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`)
143143
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
144+
- :meth:`pandas.read_html` now supports the ``storage_options`` keyword when used with a URL, allowing users to add headers to the outbound HTTP request (:issue:`49944`)
144145
- Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`)
145146
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
146147
- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`)

pandas/io/html.py

+38-8
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
is_fsspec_url,
4141
is_url,
4242
stringify_path,
43-
urlopen,
4443
validate_header_arg,
4544
)
4645
from pandas.io.formats.printing import pprint_thing
@@ -57,6 +56,7 @@
5756
DtypeBackend,
5857
FilePath,
5958
ReadBuffer,
59+
StorageOptions,
6060
)
6161

6262
from pandas import DataFrame
@@ -115,7 +115,11 @@ def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequenc
115115
raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
116116

117117

118-
def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
118+
def _read(
119+
obj: FilePath | BaseBuffer,
120+
encoding: str | None,
121+
storage_options: StorageOptions | None,
122+
) -> str | bytes:
119123
"""
120124
Try to read from a url, file or string.
121125
@@ -133,7 +137,9 @@ def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
133137
or hasattr(obj, "read")
134138
or (isinstance(obj, str) and file_exists(obj))
135139
):
136-
with get_handle(obj, "r", encoding=encoding) as handles:
140+
with get_handle(
141+
obj, "r", encoding=encoding, storage_options=storage_options
142+
) as handles:
137143
text = handles.handle.read()
138144
elif isinstance(obj, (str, bytes)):
139145
text = obj
@@ -219,13 +225,15 @@ def __init__(
219225
encoding: str,
220226
displayed_only: bool,
221227
extract_links: Literal[None, "header", "footer", "body", "all"],
228+
storage_options: StorageOptions = None,
222229
) -> None:
223230
self.io = io
224231
self.match = match
225232
self.attrs = attrs
226233
self.encoding = encoding
227234
self.displayed_only = displayed_only
228235
self.extract_links = extract_links
236+
self.storage_options = storage_options
229237

230238
def parse_tables(self):
231239
"""
@@ -637,7 +645,7 @@ def _parse_tfoot_tr(self, table):
637645
return table.select("tfoot tr")
638646

639647
def _setup_build_doc(self):
640-
raw_text = _read(self.io, self.encoding)
648+
raw_text = _read(self.io, self.encoding, self.storage_options)
641649
if not raw_text:
642650
raise ValueError(f"No text parsed from document: {self.io}")
643651
return raw_text
@@ -777,8 +785,10 @@ def _build_doc(self):
777785

778786
try:
779787
if is_url(self.io):
780-
with urlopen(self.io) as f:
781-
r = parse(f, parser=parser)
788+
with get_handle(
789+
self.io, "r", storage_options=self.storage_options
790+
) as f:
791+
r = parse(f.handle, parser=parser)
782792
else:
783793
# try to parse the input in the simplest way
784794
r = parse(self.io, parser=parser)
@@ -945,14 +955,32 @@ def _validate_flavor(flavor):
945955
return flavor
946956

947957

948-
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
958+
def _parse(
959+
flavor,
960+
io,
961+
match,
962+
attrs,
963+
encoding,
964+
displayed_only,
965+
extract_links,
966+
storage_options,
967+
**kwargs,
968+
):
949969
flavor = _validate_flavor(flavor)
950970
compiled_match = re.compile(match) # you can pass a compiled regex here
951971

952972
retained = None
953973
for flav in flavor:
954974
parser = _parser_dispatch(flav)
955-
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
975+
p = parser(
976+
io,
977+
compiled_match,
978+
attrs,
979+
encoding,
980+
displayed_only,
981+
extract_links,
982+
storage_options,
983+
)
956984

957985
try:
958986
tables = p.parse_tables()
@@ -1017,6 +1045,7 @@ def read_html(
10171045
displayed_only: bool = True,
10181046
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
10191047
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
1048+
storage_options: StorageOptions = None,
10201049
) -> list[DataFrame]:
10211050
r"""
10221051
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1224,4 +1253,5 @@ def read_html(
12241253
displayed_only=displayed_only,
12251254
extract_links=extract_links,
12261255
dtype_backend=dtype_backend,
1256+
storage_options=storage_options,
12271257
)

pandas/tests/io/test_user_agent.py

+21
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,17 @@ def do_GET(self):
112112
self.write_back_bytes(response_bytes)
113113

114114

115+
class HTMLUserAgentResponder(BaseUserAgentResponder):
116+
def do_GET(self):
117+
response_df = self.start_processing_headers()
118+
self.send_header("Content-Type", "text/html")
119+
self.end_headers()
120+
121+
response_bytes = response_df.to_html(index=False).encode("utf-8")
122+
123+
self.write_back_bytes(response_bytes)
124+
125+
115126
class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder):
116127
def do_GET(self):
117128
response_df = self.start_processing_headers()
@@ -244,6 +255,11 @@ def responder(request):
244255
[
245256
(CSVUserAgentResponder, pd.read_csv, None),
246257
(JSONUserAgentResponder, pd.read_json, None),
258+
(
259+
HTMLUserAgentResponder,
260+
lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
261+
None,
262+
),
247263
(ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
248264
pytest.param(
249265
ParquetFastParquetUserAgentResponder,
@@ -281,6 +297,11 @@ def test_server_and_default_headers(responder, read_method, parquet_engine):
281297
[
282298
(CSVUserAgentResponder, pd.read_csv, None),
283299
(JSONUserAgentResponder, pd.read_json, None),
300+
(
301+
HTMLUserAgentResponder,
302+
lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
303+
None,
304+
),
284305
(ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
285306
pytest.param(
286307
ParquetFastParquetUserAgentResponder,

0 commit comments

Comments
 (0)