Read csv headers #37966

Merged: 44 commits, Dec 15, 2020
Changes shown below are from 7 of the 44 commits.

Commits (44)
bb3e8e6
storage_options as headers and tests added
Nov 14, 2020
db51474
additional tests - gzip, test additional headers receipt
Nov 15, 2020
6f901b8
bailed on using threading for testing
Nov 19, 2020
3af6a3d
clean up comments add json http tests
Nov 19, 2020
bad5739
Merge branch 'master' into read_csv_headers to update
Nov 19, 2020
8f5a0f1
added documentation on storage_options for headers
Nov 19, 2020
9fcc72a
DOC:Added doc for custom HTTP headers in read_csv and read_json
Nov 19, 2020
df6e539
DOC:Corrected versionadded tag and added issue number for reference
Nov 21, 2020
98db1c4
DOC:updated storage_options documentation
Nov 21, 2020
f28f36c
TST:updated with tm.assert_frame_equal
Nov 21, 2020
dd3265f
TST:fixed incorrect usage of tm.assert_frame_equal
Nov 21, 2020
02fc840
CLN:reordered imports to fix pre-commit error
Nov 21, 2020
da97f0a
DOC:changed whatsnew and added to shared_docs.py GH36688
Nov 22, 2020
fce4b17
ENH: read nonfsspec URL with headers built from storage_options GH36688
Nov 22, 2020
e0cfcb6
TST:Added additional tests parquet and other read methods GH36688
Nov 22, 2020
33115b7
TST:removed mocking in favor of threaded http server
Dec 3, 2020
5a1c64e
DOC:refined storage_options docstring
Dec 3, 2020
018a399
Merge branch 'master' into read_csv_headers
cdknox Dec 3, 2020
87d7dc6
CLN:used the github editor and had pep8 issues
Dec 3, 2020
64a0d19
CLN: leftover comment removed
Dec 3, 2020
1724e9b
TST:attempted to address test warning of unclosed socket GH36688
Dec 3, 2020
f8b8c43
TST:added pytest.importorskip to handle the two main parquet engines …
Dec 3, 2020
a17d574
CLN: imports moved to correct order GH36688
Dec 3, 2020
eed8915
TST:fix fastparquet tests GH36688
Dec 3, 2020
75573a4
CLN:removed blank line at end of docstring GH36688
Dec 3, 2020
dc596c6
CLN:removed excess newlines GH36688
Dec 3, 2020
e27e3a9
CLN:fixed flake8 issues GH36688
Dec 4, 2020
734c9d3
TST:renamed a test that was getting clobbered and fixed the logic GH3…
Dec 4, 2020
8a5c5a3
CLN:try to silence mypy error via renaming GH36688
Dec 4, 2020
978d94a
TST:pytest.importorfail replaced with pytest.skip GH36688
Dec 4, 2020
807eb25
TST:content of dataframe on error made more useful GH36688
Dec 4, 2020
44c2869
CLN:fixed flake8 error GH36688
Dec 4, 2020
01ce3ae
TST: windows fastparquet error needs raised for troubleshooting GH36688
Dec 4, 2020
13bc775
CLN:fix for flake8 GH36688
Dec 4, 2020
6915517
TST:changed compression used in to_parquet from 'snappy' to None GH36688
Dec 4, 2020
186b0a4
TST:allowed exceptions to be raised via removing a try except block G…
Dec 4, 2020
88e9600
TST:replaced try except with pytest.importorskip GH36688
Dec 4, 2020
2a05d0f
CLN:removed dict() in favor of {} GH36688
Dec 13, 2020
d38a813
Merge branch 'master' into read_csv_headers
Dec 13, 2020
268e06a
DOC: changed potentially included version from 1.2.0 to 1.3.0 GH36688
Dec 13, 2020
565197f
TST:user agent tests moved from test_common to their own file GH36688
Dec 13, 2020
842e594
TST: used fsspec instead of patching bytesio GH36688
Dec 13, 2020
c0c3d34
TST: added importorskip for fsspec on FastParquet test GH36688
Dec 13, 2020
7025abb
TST:added missing importorskip to fsspec in another test GH36688
Dec 13, 2020
12 changes: 12 additions & 0 deletions doc/source/user_guide/io.rst
@@ -1625,6 +1625,18 @@ functions - the following example shows reading a CSV file:

df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t")

Custom headers can be sent alongside HTTP(s) requests by passing a dictionary
of header key-value mappings to the ``storage_options`` keyword argument as shown below:

.. code-block:: python

headers = {"User-Agent": "pandas"}
df = pd.read_csv(
"https://download.bls.gov/pub/time.series/cu/cu.item",
sep="\t",
storage_options=headers
)

All URLs which are not local files or HTTP(s) are handled by
`fsspec`_, if installed, and its various filesystem implementations
(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).
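
For these fsspec-handled URLs, the contents of ``storage_options`` are forwarded to the
filesystem backend rather than sent as HTTP headers. A minimal sketch, assuming ``s3fs``
is installed (the bucket name is illustrative only):

.. code-block:: python

    import pandas as pd

    # "anon" is an s3fs/fsspec option requesting anonymous access; it is passed
    # through to the filesystem, not sent as an HTTP header.
    df = pd.read_csv(
        "s3://example-bucket/data.csv",
        storage_options={"anon": True},
    )
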
17 changes: 17 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -221,6 +221,23 @@ Additionally ``mean`` supports execution via `Numba <https://numba.pydata.org/>`
the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency
to use this feature.

.. _whatsnew_120.read_csv_json_http_headers:

Custom HTTP(s) headers when reading csv or json files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`read_csv` and :meth:`read_json` send the key-value pairs in the dictionary passed to ``storage_options`` as custom HTTP(s) headers when reading from a URL.
For example:

.. ipython:: python

headers = {"User-Agent": "pandas"}
df = pd.read_csv(
"https://download.bls.gov/pub/time.series/cu/cu.item",
sep="\t",
storage_options=headers
)
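
The same ``storage_options`` dictionary works with :meth:`read_json`; a sketch of the
equivalent call, with a hypothetical endpoint:

.. code-block:: python

    import pandas as pd

    headers = {"User-Agent": "pandas"}
    # Hypothetical JSON endpoint; any HTTP(s) JSON source is handled the same way.
    df = pd.read_json(
        "https://example.com/data.json",
        storage_options=headers,
    )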

.. _whatsnew_120.enhancements.other:

Other enhancements
17 changes: 11 additions & 6 deletions pandas/io/common.py
@@ -288,12 +288,17 @@ def _get_filepath_or_buffer(
fsspec_mode += "b"

if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
-        # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
-        if storage_options:
-            raise ValueError(
-                "storage_options passed with file object or non-fsspec file path"
-            )
-        req = urlopen(filepath_or_buffer)
+        # TODO: fsspec can also handle HTTP via requests, but leaving this
+        # unchanged. using fsspec appears to break the ability to infer if the
+        # server responded with gzipped data
+        storage_options = storage_options or dict()
+        # waiting until now to import, matching the intended lazy logic of the
+        # urlopen function defined elsewhere in this module
+        import urllib.request
+
+        # assuming storage_options is to be interpreted as headers
+        req = urllib.request.Request(filepath_or_buffer, headers=storage_options)
+        req = urlopen(req)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
# Override compression based on Content-Encoding header
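
The hunk above treats the storage_options dict as HTTP request headers and inspects the
Content-Encoding response header to detect gzipped payloads. A standalone sketch of that
flow using only the standard library (pandas itself flags compression="gzip" for its
downstream readers rather than decompressing; the helper name and inline decompression
here are illustrative only):

import gzip
import io
import urllib.request


def fetch_url_bytes(url, storage_options=None):
    # Mirror of the logic above: storage_options becomes the request headers.
    storage_options = storage_options or {}
    req = urllib.request.Request(url, headers=storage_options)
    with urllib.request.urlopen(req) as resp:
        data = resp.read()
        content_encoding = resp.headers.get("Content-Encoding", None)
    if content_encoding == "gzip":
        # pandas keeps the raw bytes and sets compression="gzip" instead; this
        # sketch simply decompresses so the caller gets plain bytes.
        data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
    return data
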
153 changes: 152 additions & 1 deletion pandas/tests/io/test_common.py
@@ -1,7 +1,8 @@
"""
Tests for the pandas.io.common functionalities
"""
-from io import StringIO
+import gzip
+from io import StringIO, BytesIO
import mmap
import os
from pathlib import Path
@@ -16,6 +17,8 @@

import pandas.io.common as icom

from unittest.mock import MagicMock, patch


class CustomFSPath:
"""For testing fspath on unknown objects"""
@@ -411,3 +414,151 @@ def test_is_fsspec_url():
assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
assert not icom.is_fsspec_url("/local/path")
assert not icom.is_fsspec_url("relative/local/path")


def test_plain_text_read_csv_http_custom_headers():
true_df = pd.DataFrame({"column_name": ["column_value"]})
df_csv_bytes = true_df.to_csv(index=False).encode("utf-8")
headers = {
"User-Agent": "custom",
"Auth": "other_custom",
}

class DummyResponse:
headers = {
"Content-Type": "text/csv",
}

@staticmethod
def read():
return df_csv_bytes

@staticmethod
def close():
pass

def dummy_response_getter(url):
return DummyResponse()

dummy_request = MagicMock()
with patch("urllib.request.Request", new=dummy_request):
with patch("urllib.request.urlopen", new=dummy_response_getter):
received_df = pd.read_csv(
"http://localhost:80/test.csv", storage_options=headers
)
    dummy_request.assert_called_with("http://localhost:80/test.csv", headers=headers)
assert (received_df == true_df).all(axis=None)


def test_gzip_read_csv_http_custom_headers():
true_df = pd.DataFrame({"column_name": ["column_value"]})
df_csv_bytes = true_df.to_csv(index=False).encode("utf-8")
headers = {
"User-Agent": "custom",
"Auth": "other_custom",
}

class DummyResponse:
headers = {
"Content-Type": "text/csv",
"Content-Encoding": "gzip",
}

@staticmethod
def read():
bio = BytesIO()
zipper = gzip.GzipFile(fileobj=bio, mode="w")
zipper.write(df_csv_bytes)
zipper.close()
gzipped_response = bio.getvalue()
return gzipped_response

@staticmethod
def close():
pass

def dummy_response_getter(url):
return DummyResponse()

dummy_request = MagicMock()
with patch("urllib.request.Request", new=dummy_request):
with patch("urllib.request.urlopen", new=dummy_response_getter):
received_df = pd.read_csv(
"http://localhost:80/test.csv", storage_options=headers
)
    dummy_request.assert_called_with("http://localhost:80/test.csv", headers=headers)
assert (received_df == true_df).all(axis=None)


def test_plain_text_read_json_http_custom_headers():
true_df = pd.DataFrame({"column_name": ["column_value"]})
df_json_bytes = true_df.to_json().encode("utf-8")
headers = {
"User-Agent": "custom",
"Auth": "other_custom",
}

class DummyResponse:
headers = {
"Content-Type": "application/json",
}

@staticmethod
def read():
return df_json_bytes

@staticmethod
def close():
pass

def dummy_response_getter(url):
return DummyResponse()

dummy_request = MagicMock()
with patch("urllib.request.Request", new=dummy_request):
with patch("urllib.request.urlopen", new=dummy_response_getter):
received_df = pd.read_json(
"http://localhost:80/test.json", storage_options=headers
)
    dummy_request.assert_called_with("http://localhost:80/test.json", headers=headers)
assert (received_df == true_df).all(axis=None)


def test_gzip_read_json_http_custom_headers():
true_df = pd.DataFrame({"column_name": ["column_value"]})
df_json_bytes = true_df.to_json().encode("utf-8")
headers = {
"User-Agent": "custom",
"Auth": "other_custom",
}

class DummyResponse:
headers = {
"Content-Type": "application/json",
"Content-Encoding": "gzip",
}

@staticmethod
def read():
bio = BytesIO()
zipper = gzip.GzipFile(fileobj=bio, mode="w")
zipper.write(df_json_bytes)
zipper.close()
gzipped_response = bio.getvalue()
return gzipped_response

@staticmethod
def close():
pass

def dummy_response_getter(url):
return DummyResponse()

dummy_request = MagicMock()
with patch("urllib.request.Request", new=dummy_request):
with patch("urllib.request.urlopen", new=dummy_response_getter):
received_df = pd.read_json(
"http://localhost:80/test.json", storage_options=headers
)
    dummy_request.assert_called_with("http://localhost:80/test.json", headers=headers)
assert (received_df == true_df).all(axis=None)
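
Later commits in this pull request (33115b7, 565197f) drop the unittest.mock patching
above in favor of a threaded HTTP server and move the user-agent tests into their own
file. A sketch of that style of test against this branch of pandas; the handler class,
port handling, and frame contents below are assumptions rather than the PR's actual
test code:

import http.server
import threading

import pandas as pd
import pandas._testing as tm


class UserAgentResponder(http.server.BaseHTTPRequestHandler):
    """Serve a one-row CSV containing the User-Agent header that was received."""

    def do_GET(self):
        body = (
            pd.DataFrame({"header": [self.headers["User-Agent"]]})
            .to_csv(index=False)
            .encode("utf-8")
        )
        self.send_response(200)
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        self.wfile.write(body)


def test_read_csv_sends_custom_user_agent():
    # Port 0 asks the OS for any free port; serve requests from a background thread.
    server = http.server.HTTPServer(("localhost", 0), UserAgentResponder)
    port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever)
    thread.start()
    try:
        df = pd.read_csv(
            f"http://localhost:{port}/test.csv",
            storage_options={"User-Agent": "custom-agent"},
        )
    finally:
        server.shutdown()
        server.server_close()
        thread.join()
    tm.assert_frame_equal(df, pd.DataFrame({"header": ["custom-agent"]}))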