From fe5a9ad83a343e52d9164f4d2a7427c29899b863 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Tue, 29 Sep 2020 12:47:31 -0400 Subject: [PATCH 1/7] Add headers option in read_csv() for Python3 --- pandas/io/common.py | 12 ++++++++++-- pandas/io/parsers.py | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index c147ae9fd0aa8..2ad4e24c3b06a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -150,7 +150,14 @@ def urlopen(*args, **kwargs): """ import urllib.request - return urllib.request.urlopen(*args, **kwargs) + # Request class is only available in Python3, which + # allows headers to be specified + if hasattr(urllib.request, 'Request'): + r = urllib.request.urlopen(urllib.request.Request(*args, **kwargs)) + else: + r = urllib.request.urlopen(*args, **kwargs) + + return r def is_fsspec_url(url: FilePathOrBuffer) -> bool: @@ -176,6 +183,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, + headers: dict = {} ) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. 
@@ -251,7 +259,7 @@ def get_filepath_or_buffer( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - req = urlopen(filepath_or_buffer) + req = urlopen(filepath_or_buffer, headers=headers) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd3588faedf7a..5679634c000c2 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -432,9 +432,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding compression = kwds.get("compression", "infer") + headers = kwds.get("headers", {}) ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options + filepath_or_buffer, encoding, compression, storage_options=storage_options, headers=headers ) kwds["compression"] = ioargs.compression @@ -599,6 +600,7 @@ def read_csv( memory_map=False, float_precision=None, storage_options: StorageOptions = None, + headers={} ): # gh-23761 # @@ -686,6 +688,7 @@ def read_csv( infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, storage_options=storage_options, + headers=headers ) return _read(filepath_or_buffer, kwds) From fc04aeb1b54fc99cb1c7860fe11aa96972ff7863 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Tue, 29 Sep 2020 13:02:59 -0400 Subject: [PATCH 2/7] Add headers option to read_json() --- pandas/io/json/_json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ef684469dffbb..a6895c30ab357 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -377,6 +377,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, + headers: dict = {} ): """ Convert a JSON string to pandas object. 
@@ -614,6 +615,7 @@ def read_json( encoding=encoding, compression=compression, storage_options=storage_options, + headers=headers ) json_reader = JsonReader( From cf59e1602efd0c5fb15a86dfae74bd0ab76efee9 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Wed, 30 Sep 2020 19:15:13 -0400 Subject: [PATCH 3/7] Black pandas modifications --- pandas/io/common.py | 6 +++--- pandas/io/json/_json.py | 4 ++-- pandas/io/parsers.py | 10 +++++++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 2ad4e24c3b06a..4f84edcefe1f6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -150,9 +150,9 @@ def urlopen(*args, **kwargs): """ import urllib.request - # Request class is only available in Python3, which + # Request class is only available in Python3, which # allows headers to be specified - if hasattr(urllib.request, 'Request'): + if hasattr(urllib.request, "Request"): r = urllib.request.urlopen(urllib.request.Request(*args, **kwargs)) else: r = urllib.request.urlopen(*args, **kwargs) @@ -183,7 +183,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, - headers: dict = {} + headers: dict = {}, ) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index a6895c30ab357..c9b484c90cf96 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -377,7 +377,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, - headers: dict = {} + headers: dict = {}, ): """ Convert a JSON string to pandas object. 
@@ -615,7 +615,7 @@ def read_json( encoding=encoding, compression=compression, storage_options=storage_options, - headers=headers + headers=headers, ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5679634c000c2..4633abaa7d009 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -435,7 +435,11 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): headers = kwds.get("headers", {}) ioargs = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, storage_options=storage_options, headers=headers + filepath_or_buffer, + encoding, + compression, + storage_options=storage_options, + headers=headers, ) kwds["compression"] = ioargs.compression @@ -600,7 +604,7 @@ def read_csv( memory_map=False, float_precision=None, storage_options: StorageOptions = None, - headers={} + headers={}, ): # gh-23761 # @@ -688,7 +692,7 @@ def read_csv( infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, storage_options=storage_options, - headers=headers + headers=headers, ) return _read(filepath_or_buffer, kwds) From 52eb087b0d111d3d25f6afe5ff3a6d5cde776c74 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Wed, 30 Sep 2020 22:26:17 -0400 Subject: [PATCH 4/7] Remove Python2 compatibility check --- pandas/io/common.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 4f84edcefe1f6..edb24104baa2a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -148,16 +148,9 @@ def urlopen(*args, **kwargs): Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of the stdlib. 
""" - import urllib.request + from urllib.request import Request, urlopen as _urlopen - # Request class is only available in Python3, which - # allows headers to be specified - if hasattr(urllib.request, "Request"): - r = urllib.request.urlopen(urllib.request.Request(*args, **kwargs)) - else: - r = urllib.request.urlopen(*args, **kwargs) - - return r + return _urlopen(Request(*args, **kwargs)) def is_fsspec_url(url: FilePathOrBuffer) -> bool: From 7432d88ea8fbd2273764dd626f8b11f17fcacb5e Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Wed, 30 Sep 2020 23:21:08 -0400 Subject: [PATCH 5/7] Use specific Dict types --- pandas/io/common.py | 2 +- pandas/io/json/_json.py | 4 ++-- pandas/io/parsers.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index edb24104baa2a..05542d596e961 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -176,7 +176,7 @@ def get_filepath_or_buffer( compression: CompressionOptions = None, mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, - headers: dict = {}, + headers: Dict[str, Any] = {}, ) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c9b484c90cf96..86ae63f511461 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import IO, Any, Callable, List, Optional, Type +from typing import IO, Any, Callable, Dict, List, Optional, Type import numpy as np @@ -377,7 +377,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, - headers: dict = {}, + headers: Dict[str, Any] = {}, ): """ Convert a JSON string to pandas object. 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4633abaa7d009..9690b430c7eb9 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -604,7 +604,7 @@ def read_csv( memory_map=False, float_precision=None, storage_options: StorageOptions = None, - headers={}, + headers: Dict[str, Any] = {}, ): # gh-23761 # From 4538944735b1b3e02ab6b5df11f3a337b1663601 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Sat, 3 Oct 2020 00:35:13 -0400 Subject: [PATCH 6/7] Add urlopen test --- pandas/tests/io/test_common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index ede8d61490778..d216024b2fa33 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -417,3 +417,11 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + + +def test_urlopen_headers(): + headers = {"User-Agent": "Pandas 1.1.0"} + # this returns the User-Agent + url = "http://ifconfig.me/ua" + r = icom.urlopen(url, headers=headers) + assert r.read().decode("utf-8") == headers["User-Agent"] From be529b1e635fb1384a150eb7a5460da6c902e4a0 Mon Sep 17 00:00:00 2001 From: Antetokounpo Date: Sat, 3 Oct 2020 00:47:53 -0400 Subject: [PATCH 7/7] Add docstrings for headers parameter --- pandas/io/json/_json.py | 4 ++++ pandas/io/parsers.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 86ae63f511461..e75e8d166215e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -528,6 +528,10 @@ def read_json( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values. + headers : dict, optional + HTTP headers that are passed to urlopen. Allows specifying the User-Agent, + for example when the default urllib User-Agent is blocked. + .. 
versionadded:: 1.2.0 Returns diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9690b430c7eb9..7db96657f6b8f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -352,6 +352,10 @@ a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values. +headers : dict, optional + HTTP headers that are passed to urlopen. Allows specifying the User-Agent, + for example when the default urllib User-Agent is blocked. + .. versionadded:: 1.2 Returns