diff --git a/pandas/io/common.py b/pandas/io/common.py index c147ae9fd0aa8..05542d596e961 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -148,9 +148,9 @@ def urlopen(*args, **kwargs): Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of the stdlib. """ - import urllib.request + from urllib.request import Request, urlopen as _urlopen - return urllib.request.urlopen(*args, **kwargs) + return _urlopen(Request(*args, **kwargs)) def is_fsspec_url(url: FilePathOrBuffer) -> bool: @@ -176,6 +176,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, + headers: Dict[str, Any] = {}, ) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. @@ -251,7 +252,7 @@ def get_filepath_or_buffer( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - req = urlopen(filepath_or_buffer) + req = urlopen(filepath_or_buffer, headers=headers) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ef684469dffbb..e75e8d166215e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Optional, Type +from typing import IO, Any, Callable, Dict, List, Optional, Type import numpy as np @@ -377,6 +377,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, + headers: Dict[str, Any] = {}, ): """ Convert a JSON string to pandas object. @@ -527,6 +528,10 @@ def read_json( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values. 
+ headers : dict, optional + HTTP headers that are passed to urlopen. Useful for setting a custom User-Agent, + for example when the default urllib User-Agent is blocked. + .. versionadded:: 1.2.0 Returns @@ -614,6 +619,7 @@ def read_json( encoding=encoding, compression=compression, storage_options=storage_options, + headers=headers, ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd3588faedf7a..7db96657f6b8f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -352,6 +352,10 @@ a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values. +headers : dict, optional + HTTP headers that are passed to urlopen. Useful for setting a custom User-Agent, + for example when the default urllib User-Agent is blocked. + .. versionadded:: 1.2 Returns @@ -432,9 +436,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding compression = kwds.get("compression", "infer") + headers = kwds.get("headers", {}) ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options + filepath_or_buffer, + encoding, + compression, + storage_options=storage_options, + headers=headers, ) kwds["compression"] = ioargs.compression @@ -599,6 +608,7 @@ def read_csv( memory_map=False, float_precision=None, storage_options: StorageOptions = None, + headers: Dict[str, Any] = {}, ): # gh-23761 # @@ -686,6 +696,7 @@ def read_csv( infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, storage_options=storage_options, + headers=headers, ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ede8d61490778..d216024b2fa33 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -417,3 +417,11 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert
not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + + +def test_urlopen_headers(): + headers = {"User-Agent": "Pandas 1.1.0"} + # this returns the User-Agent + url = "http://ifconfig.me/ua" + r = icom.urlopen(url, headers=headers) + assert r.read().decode("utf-8") == headers["User-Agent"]