Skip to content

Commit 44c4168

Browse files
authored
ENH: Add support for storage_options in pd.read_html (#52620)
* Add support for storage_options in pd.read_html * Run pre-commit * Run pre-commit * Replace call to urlopen with _get_filepath_or_buffer, update whatsnew * Replace defined function with lambda func for read_html * Remove unused function * Change expected type for 'obj' param in _read() Signed-off-by: serhatgktp <[email protected]> * Remove unnecessary comment * Add missing space to fix workflow error Signed-off-by: serhatgktp <[email protected]> * Delete trailing whitespaces --------- Signed-off-by: serhatgktp <[email protected]>
1 parent 3713834 commit 44c4168

File tree

4 files changed

+101
-8
lines changed

4 files changed

+101
-8
lines changed

doc/source/user_guide/io.rst

+41
Original file line numberDiff line numberDiff line change
@@ -2490,6 +2490,47 @@ Read a URL with no options:
24902490

24912491
The data from the above URL changes every Monday so the resulting data above may be slightly different.
24922492

2493+
Read a URL while passing headers alongside the HTTP request:
2494+
2495+
.. code-block:: ipython
2496+
2497+
In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector
2498+
In [323]: pd.read_html(url)
2499+
Out[323]:
2500+
[ 0 1
2501+
0 Remote Socket: 51.15.105.256:51760
2502+
1 Protocol Version: HTTP/1.1
2503+
2 Request Method: GET
2504+
3 Request URI: /notes/request/
2505+
4 Request Query: NaN,
2506+
0 Accept-Encoding: identity
2507+
1 Host: www.sump.org
2508+
2 User-Agent: Python-urllib/3.8
2509+
3 Connection: close]
2510+
In [324]: headers = {
2511+
In [325]: 'User-Agent':'Mozilla Firefox v14.0',
2512+
In [326]: 'Accept':'application/json',
2513+
In [327]: 'Connection':'keep-alive',
2514+
In [328]: 'Auth':'Bearer 2*/f3+fe68df*4'
2515+
In [329]: }
2516+
In [340]: pd.read_html(url, storage_options=headers)
2517+
Out[340]:
2518+
[ 0 1
2519+
0 Remote Socket: 51.15.105.256:51760
2520+
1 Protocol Version: HTTP/1.1
2521+
2 Request Method: GET
2522+
3 Request URI: /notes/request/
2523+
4 Request Query: NaN,
2524+
0 User-Agent: Mozilla Firefox v14.0
2525+
1 AcceptEncoding: gzip, deflate, br
2526+
2 Accept: application/json
2527+
3 Connection: keep-alive
2528+
4 Auth: Bearer 2*/f3+fe68df*4]
2529+
2530+
.. note::
2531+
2532+
We see above that the headers we passed are reflected in the HTTP request.
2533+
24932534
Read in the content of the file from the above URL and pass it to ``read_html``
24942535
as a string:
24952536

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Other enhancements
141141
- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
142142
- :meth:`arrays.DatetimeArray.map`, :meth:`arrays.TimedeltaArray.map` and :meth:`arrays.PeriodArray.map` can now take a ``na_action`` argument (:issue:`51644`)
143143
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
144+
- :meth:`pandas.read_html` now supports the ``storage_options`` keyword when used with a URL, allowing users to add headers to the outbound HTTP request (:issue:`49944`)
144145
- Add :meth:`diff()` and :meth:`round()` for :class:`Index` (:issue:`19708`)
145146
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
146147
- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`)

pandas/io/html.py

+38-8
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
is_fsspec_url,
4141
is_url,
4242
stringify_path,
43-
urlopen,
4443
validate_header_arg,
4544
)
4645
from pandas.io.formats.printing import pprint_thing
@@ -57,6 +56,7 @@
5756
DtypeBackend,
5857
FilePath,
5958
ReadBuffer,
59+
StorageOptions,
6060
)
6161

6262
from pandas import DataFrame
@@ -115,7 +115,11 @@ def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequenc
115115
raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
116116

117117

118-
def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
118+
def _read(
119+
obj: FilePath | BaseBuffer,
120+
encoding: str | None,
121+
storage_options: StorageOptions | None,
122+
) -> str | bytes:
119123
"""
120124
Try to read from a url, file or string.
121125
@@ -133,7 +137,9 @@ def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
133137
or hasattr(obj, "read")
134138
or (isinstance(obj, str) and file_exists(obj))
135139
):
136-
with get_handle(obj, "r", encoding=encoding) as handles:
140+
with get_handle(
141+
obj, "r", encoding=encoding, storage_options=storage_options
142+
) as handles:
137143
text = handles.handle.read()
138144
elif isinstance(obj, (str, bytes)):
139145
text = obj
@@ -219,13 +225,15 @@ def __init__(
219225
encoding: str,
220226
displayed_only: bool,
221227
extract_links: Literal[None, "header", "footer", "body", "all"],
228+
storage_options: StorageOptions = None,
222229
) -> None:
223230
self.io = io
224231
self.match = match
225232
self.attrs = attrs
226233
self.encoding = encoding
227234
self.displayed_only = displayed_only
228235
self.extract_links = extract_links
236+
self.storage_options = storage_options
229237

230238
def parse_tables(self):
231239
"""
@@ -637,7 +645,7 @@ def _parse_tfoot_tr(self, table):
637645
return table.select("tfoot tr")
638646

639647
def _setup_build_doc(self):
640-
raw_text = _read(self.io, self.encoding)
648+
raw_text = _read(self.io, self.encoding, self.storage_options)
641649
if not raw_text:
642650
raise ValueError(f"No text parsed from document: {self.io}")
643651
return raw_text
@@ -777,8 +785,10 @@ def _build_doc(self):
777785

778786
try:
779787
if is_url(self.io):
780-
with urlopen(self.io) as f:
781-
r = parse(f, parser=parser)
788+
with get_handle(
789+
self.io, "r", storage_options=self.storage_options
790+
) as f:
791+
r = parse(f.handle, parser=parser)
782792
else:
783793
# try to parse the input in the simplest way
784794
r = parse(self.io, parser=parser)
@@ -945,14 +955,32 @@ def _validate_flavor(flavor):
945955
return flavor
946956

947957

948-
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
958+
def _parse(
959+
flavor,
960+
io,
961+
match,
962+
attrs,
963+
encoding,
964+
displayed_only,
965+
extract_links,
966+
storage_options,
967+
**kwargs,
968+
):
949969
flavor = _validate_flavor(flavor)
950970
compiled_match = re.compile(match) # you can pass a compiled regex here
951971

952972
retained = None
953973
for flav in flavor:
954974
parser = _parser_dispatch(flav)
955-
p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
975+
p = parser(
976+
io,
977+
compiled_match,
978+
attrs,
979+
encoding,
980+
displayed_only,
981+
extract_links,
982+
storage_options,
983+
)
956984

957985
try:
958986
tables = p.parse_tables()
@@ -1017,6 +1045,7 @@ def read_html(
10171045
displayed_only: bool = True,
10181046
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
10191047
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
1048+
storage_options: StorageOptions = None,
10201049
) -> list[DataFrame]:
10211050
r"""
10221051
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1224,4 +1253,5 @@ def read_html(
12241253
displayed_only=displayed_only,
12251254
extract_links=extract_links,
12261255
dtype_backend=dtype_backend,
1256+
storage_options=storage_options,
12271257
)

pandas/tests/io/test_user_agent.py

+21
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,17 @@ def do_GET(self):
112112
self.write_back_bytes(response_bytes)
113113

114114

115+
class HTMLUserAgentResponder(BaseUserAgentResponder):
116+
def do_GET(self):
117+
response_df = self.start_processing_headers()
118+
self.send_header("Content-Type", "text/html")
119+
self.end_headers()
120+
121+
response_bytes = response_df.to_html(index=False).encode("utf-8")
122+
123+
self.write_back_bytes(response_bytes)
124+
125+
115126
class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder):
116127
def do_GET(self):
117128
response_df = self.start_processing_headers()
@@ -244,6 +255,11 @@ def responder(request):
244255
[
245256
(CSVUserAgentResponder, pd.read_csv, None),
246257
(JSONUserAgentResponder, pd.read_json, None),
258+
(
259+
HTMLUserAgentResponder,
260+
lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
261+
None,
262+
),
247263
(ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
248264
pytest.param(
249265
ParquetFastParquetUserAgentResponder,
@@ -281,6 +297,11 @@ def test_server_and_default_headers(responder, read_method, parquet_engine):
281297
[
282298
(CSVUserAgentResponder, pd.read_csv, None),
283299
(JSONUserAgentResponder, pd.read_json, None),
300+
(
301+
HTMLUserAgentResponder,
302+
lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
303+
None,
304+
),
284305
(ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
285306
pytest.param(
286307
ParquetFastParquetUserAgentResponder,

0 commit comments

Comments
 (0)