Skip to content

Commit 2b847a5

Browse files
cdknoxluckyvs1
authored and committed
Read csv headers (pandas-dev#37966)
1 parent 780b769 commit 2b847a5

File tree

6 files changed

+391
-18
lines changed

6 files changed

+391
-18
lines changed

doc/source/user_guide/io.rst

+14
Original file line numberDiff line numberDiff line change
@@ -1627,6 +1627,20 @@ functions - the following example shows reading a CSV file:
16271627
16281628
df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t")
16291629
1630+
.. versionadded:: 1.3.0
1631+
1632+
A custom header can be sent alongside HTTP(s) requests by passing a dictionary
1633+
of header key value mappings to the ``storage_options`` keyword argument as shown below:
1634+
1635+
.. code-block:: python
1636+
1637+
headers = {"User-Agent": "pandas"}
1638+
df = pd.read_csv(
1639+
"https://download.bls.gov/pub/time.series/cu/cu.item",
1640+
sep="\t",
1641+
storage_options=headers
1642+
)
1643+
16301644
All URLs which are not local files or HTTP(s) are handled by
16311645
`fsspec`_, if installed, and its various filesystem implementations
16321646
(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).

doc/source/whatsnew/v1.3.0.rst

+20
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_130.read_csv_json_http_headers:
17+
18+
Custom HTTP(s) headers when reading csv or json files
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
When reading from a remote URL that is not handled by fsspec (i.e. HTTP and
22+
HTTPS) the dictionary passed to ``storage_options`` will be used to create the
23+
headers included in the request. This can be used to control the User-Agent
24+
header or send other custom headers (:issue:`36688`).
25+
For example:
26+
27+
.. ipython:: python
28+
29+
headers = {"User-Agent": "pandas"}
30+
df = pd.read_csv(
31+
"https://download.bls.gov/pub/time.series/cu/cu.item",
32+
sep="\t",
33+
storage_options=headers
34+
)
35+
1636
1737
.. _whatsnew_130.enhancements.other:
1838

pandas/core/shared_docs.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -383,8 +383,7 @@
383383
"storage_options"
384384
] = """storage_options : dict, optional
385385
Extra options that make sense for a particular storage connection, e.g.
386-
host, port, username, password, etc., if using a URL that will
387-
be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
388-
will be raised if providing this argument with a non-fsspec URL.
389-
See the fsspec and backend storage implementation docs for the set of
390-
allowed keys and values."""
386+
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
387+
are forwarded to ``urllib`` as header options. For other URLs (e.g.
388+
starting with "s3://", and "gcs://") the key-value pairs are forwarded to
389+
``fsspec``. Please see ``fsspec`` and ``urllib`` for more details."""

pandas/io/common.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -280,12 +280,18 @@ def _get_filepath_or_buffer(
280280
fsspec_mode += "b"
281281

282282
if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
283-
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
284-
if storage_options:
285-
raise ValueError(
286-
"storage_options passed with file object or non-fsspec file path"
287-
)
288-
req = urlopen(filepath_or_buffer)
283+
# TODO: fsspec can also handle HTTP via requests, but leaving this
284+
# unchanged. using fsspec appears to break the ability to infer if the
285+
# server responded with gzipped data
286+
storage_options = storage_options or {}
287+
288+
# waiting until now for importing to match intended lazy logic of
289+
# urlopen function defined elsewhere in this module
290+
import urllib.request
291+
292+
# assuming storage_options is to be interpreted as headers
293+
req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
294+
req = urlopen(req_info)
289295
content_encoding = req.headers.get("Content-Encoding", None)
290296
if content_encoding == "gzip":
291297
# Override compression based on Content-Encoding header

pandas/io/parquet.py

+32-7
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
from pandas import DataFrame, MultiIndex, get_option
1515
from pandas.core import generic
1616

17-
from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
17+
from pandas.io.common import (
18+
IOHandles,
19+
get_handle,
20+
is_fsspec_url,
21+
is_url,
22+
stringify_path,
23+
)
1824

1925

2026
def get_engine(engine: str) -> "BaseImpl":
@@ -66,8 +72,10 @@ def _get_path_or_handle(
6672
fs, path_or_handle = fsspec.core.url_to_fs(
6773
path_or_handle, **(storage_options or {})
6874
)
69-
elif storage_options:
70-
raise ValueError("storage_options passed with buffer or non-fsspec filepath")
75+
elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
76+
# can't write to a remote url
77+
# without making use of fsspec at the moment
78+
raise ValueError("storage_options passed with buffer, or non-supported URL")
7179

7280
handles = None
7381
if (
@@ -79,7 +87,9 @@ def _get_path_or_handle(
7987
# use get_handle only when we are very certain that it is not a directory
8088
# fsspec resources can also point to directories
8189
# this branch is used for example when reading from non-fsspec URLs
82-
handles = get_handle(path_or_handle, mode, is_text=False)
90+
handles = get_handle(
91+
path_or_handle, mode, is_text=False, storage_options=storage_options
92+
)
8393
fs = None
8494
path_or_handle = handles.handle
8595
return path_or_handle, handles, fs
@@ -307,7 +317,9 @@ def read(
307317
# use get_handle only when we are very certain that it is not a directory
308318
# fsspec resources can also point to directories
309319
# this branch is used for example when reading from non-fsspec URLs
310-
handles = get_handle(path, "rb", is_text=False)
320+
handles = get_handle(
321+
path, "rb", is_text=False, storage_options=storage_options
322+
)
311323
path = handles.handle
312324
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
313325

@@ -404,10 +416,12 @@ def to_parquet(
404416
return None
405417

406418

419+
@doc(storage_options=generic._shared_docs["storage_options"])
407420
def read_parquet(
408421
path,
409422
engine: str = "auto",
410423
columns=None,
424+
storage_options: StorageOptions = None,
411425
use_nullable_dtypes: bool = False,
412426
**kwargs,
413427
):
@@ -432,13 +446,18 @@ def read_parquet(
432446
By file-like object, we refer to objects with a ``read()`` method,
433447
such as a file handle (e.g. via builtin ``open`` function)
434448
or ``StringIO``.
435-
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
449+
engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
436450
Parquet library to use. If 'auto', then the option
437451
``io.parquet.engine`` is used. The default ``io.parquet.engine``
438452
behavior is to try 'pyarrow', falling back to 'fastparquet' if
439453
'pyarrow' is unavailable.
440454
columns : list, default=None
441455
If not None, only these columns will be read from the file.
456+
457+
{storage_options}
458+
459+
.. versionadded:: 1.3.0
460+
442461
use_nullable_dtypes : bool, default False
443462
If True, use dtypes that use ``pd.NA`` as missing value indicator
444463
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
@@ -448,6 +467,7 @@ def read_parquet(
448467
support dtypes) may change without notice.
449468
450469
.. versionadded:: 1.2.0
470+
451471
**kwargs
452472
Any additional kwargs are passed to the engine.
453473
@@ -456,6 +476,11 @@ def read_parquet(
456476
DataFrame
457477
"""
458478
impl = get_engine(engine)
479+
459480
return impl.read(
460-
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
481+
path,
482+
columns=columns,
483+
storage_options=storage_options,
484+
use_nullable_dtypes=use_nullable_dtypes,
485+
**kwargs,
461486
)

0 commit comments

Comments
 (0)