-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
wrap urlopen with requests #21504
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
wrap urlopen with requests #21504
Changes from all commits
153c58c
f2c9540
e507a90
999bf24
b23ac1c
b46dd04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -30,14 +30,13 @@ | |||||||||||||||||||
|
||||||||||||||||||||
if compat.PY3: | ||||||||||||||||||||
from urllib.request import urlopen, pathname2url | ||||||||||||||||||||
_urlopen = urlopen | ||||||||||||||||||||
from urllib.parse import urlparse as parse_url | ||||||||||||||||||||
from urllib.parse import (uses_relative, uses_netloc, uses_params, | ||||||||||||||||||||
urlencode, urljoin) | ||||||||||||||||||||
from urllib.error import URLError | ||||||||||||||||||||
from http.client import HTTPException # noqa | ||||||||||||||||||||
else: | ||||||||||||||||||||
from urllib2 import urlopen as _urlopen | ||||||||||||||||||||
from urllib2 import urlopen as urlopen2 | ||||||||||||||||||||
from urllib import urlencode, pathname2url # noqa | ||||||||||||||||||||
from urlparse import urlparse as parse_url | ||||||||||||||||||||
from urlparse import uses_relative, uses_netloc, uses_params, urljoin | ||||||||||||||||||||
|
@@ -46,10 +45,10 @@ | |||||||||||||||||||
from contextlib import contextmanager, closing # noqa | ||||||||||||||||||||
from functools import wraps # noqa | ||||||||||||||||||||
|
||||||||||||||||||||
# @wraps(_urlopen) | ||||||||||||||||||||
# @wraps(urlopen2) | ||||||||||||||||||||
@contextmanager | ||||||||||||||||||||
def urlopen(*args, **kwargs): | ||||||||||||||||||||
with closing(_urlopen(*args, **kwargs)) as f: | ||||||||||||||||||||
with closing(urlopen2(*args, **kwargs)) as f: | ||||||||||||||||||||
yield f | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
|
@@ -91,6 +90,34 @@ def _is_url(url): | |||||||||||||||||||
return False | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
def _urlopen(url, session=None): | ||||||||||||||||||||
compression = None | ||||||||||||||||||||
content_encoding = None | ||||||||||||||||||||
try: | ||||||||||||||||||||
import requests | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no need to check if |
||||||||||||||||||||
if session: | ||||||||||||||||||||
if not isinstance(session, requests.sessions.Session): | ||||||||||||||||||||
raise ValueError( | ||||||||||||||||||||
'Expected a requests.sessions.Session object, ' | ||||||||||||||||||||
'got {!r}'.format(session) | ||||||||||||||||||||
) | ||||||||||||||||||||
r = session.get(url) | ||||||||||||||||||||
else: | ||||||||||||||||||||
r = requests.get(url) | ||||||||||||||||||||
r.raise_for_status() | ||||||||||||||||||||
content = r.content | ||||||||||||||||||||
r.close() | ||||||||||||||||||||
except ImportError: | ||||||||||||||||||||
with urlopen(url) as r: | ||||||||||||||||||||
content = r.read() | ||||||||||||||||||||
content_encoding = r.headers.get('Content-Encoding', None) | ||||||||||||||||||||
if content_encoding == 'gzip': | ||||||||||||||||||||
# Override compression based on Content-Encoding header. | ||||||||||||||||||||
compression = 'gzip' | ||||||||||||||||||||
reader = BytesIO(content) | ||||||||||||||||||||
return reader, compression | ||||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
def _expand_user(filepath_or_buffer): | ||||||||||||||||||||
"""Return the argument with an initial component of ~ or ~user | ||||||||||||||||||||
replaced by that user's home directory. | ||||||||||||||||||||
|
@@ -177,7 +204,7 @@ def is_gcs_url(url): | |||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, | ||||||||||||||||||||
compression=None, mode=None): | ||||||||||||||||||||
compression=None, mode=None, session=None): | ||||||||||||||||||||
""" | ||||||||||||||||||||
If the filepath_or_buffer is a url, translate and return the buffer. | ||||||||||||||||||||
Otherwise passthrough. | ||||||||||||||||||||
|
@@ -188,6 +215,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, | |||||||||||||||||||
or buffer | ||||||||||||||||||||
encoding : the encoding to use to decode py3 bytes, default is 'utf-8' | ||||||||||||||||||||
mode : str, optional | ||||||||||||||||||||
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' | ||||||||||||||||||||
For on-the-fly decompression of on-disk data. If 'infer' and | ||||||||||||||||||||
`filepath_or_buffer` is path-like, then detect compression from the | ||||||||||||||||||||
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no | ||||||||||||||||||||
decompression). If using 'zip', the ZIP file must contain only one data | ||||||||||||||||||||
file to be read in. Set to None for no decompression. | ||||||||||||||||||||
|
||||||||||||||||||||
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. | ||||||||||||||||||||
|
||||||||||||||||||||
Returns | ||||||||||||||||||||
------- | ||||||||||||||||||||
|
@@ -199,13 +234,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, | |||||||||||||||||||
filepath_or_buffer = _stringify_path(filepath_or_buffer) | ||||||||||||||||||||
|
||||||||||||||||||||
if _is_url(filepath_or_buffer): | ||||||||||||||||||||
req = _urlopen(filepath_or_buffer) | ||||||||||||||||||||
content_encoding = req.headers.get('Content-Encoding', None) | ||||||||||||||||||||
if content_encoding == 'gzip': | ||||||||||||||||||||
# Override compression based on Content-Encoding header | ||||||||||||||||||||
compression = 'gzip' | ||||||||||||||||||||
reader = BytesIO(req.read()) | ||||||||||||||||||||
req.close() | ||||||||||||||||||||
reader, compression = _urlopen(filepath_or_buffer, session=session) | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
And yes, template / Appender approach sounds like a good idea! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need some guidance on how to do this when crossing modules, The current situation is:
It is unclear to me how to fix this. Some questions I have:
My guess is that I should do get the common options from _common_params = r"""
Parameters
----------
encoding : str, default None
Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
standard encodings
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
session : requests.Session
object with a requests session configuration for remote files.
(requires the requests library)
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer' and
`filepath_or_buffer` is path-like, then detect compression from the
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
decompression). If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
""" Then compose the PS: if that is correct I'd prefer if a pandas doc expert did this instead of me, I'm kind of lost in the |
||||||||||||||||||||
return reader, encoding, compression, True | ||||||||||||||||||||
|
||||||||||||||||||||
if is_s3_url(filepath_or_buffer): | ||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -332,7 +332,8 @@ def read_excel(io, | |
"`sheet`") | ||
|
||
if not isinstance(io, ExcelFile): | ||
io = ExcelFile(io, engine=engine) | ||
session = kwds.get('session', None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just list session as a kwarg in read_excel (and in ExcelFile), then just pass it in |
||
io = ExcelFile(io, engine=engine, session=session) | ||
|
||
return io.parse( | ||
sheet_name=sheet_name, | ||
|
@@ -396,10 +397,11 @@ def __init__(self, io, **kwds): | |
if engine is not None and engine != 'xlrd': | ||
raise ValueError("Unknown engine: {engine}".format(engine=engine)) | ||
|
||
session = kwds.pop('session', None) | ||
# If io is a url, want to keep the data as bytes so can't pass | ||
# to get_filepath_or_buffer() | ||
if _is_url(self._io): | ||
io = _urlopen(self._io) | ||
io, _ = _urlopen(self._io, session=session) | ||
elif not isinstance(self.io, (ExcelFile, xlrd.Book)): | ||
io, _, _, _ = get_filepath_or_buffer(self._io) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,7 +228,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, | |
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | ||
convert_axes=True, convert_dates=True, keep_default_dates=True, | ||
numpy=False, precise_float=False, date_unit=None, encoding=None, | ||
lines=False, chunksize=None, compression='infer'): | ||
lines=False, chunksize=None, compression='infer', session=None): | ||
""" | ||
Convert a JSON string to pandas object | ||
|
||
|
@@ -410,6 +410,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | |
compression = _infer_compression(path_or_buf, compression) | ||
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( | ||
path_or_buf, encoding=encoding, compression=compression, | ||
session=session, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a versionadded tag when you add session |
||
) | ||
|
||
json_reader = JsonReader( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -319,6 +319,9 @@ | |
values. The options are `None` for the ordinary converter, | ||
`high` for the high-precision converter, and `round_trip` for the | ||
round-trip converter. | ||
session : requests.Session | ||
object with a requests session configuration for remote files. | ||
(requires the requests library) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a versionadded tag |
||
|
||
Returns | ||
------- | ||
|
@@ -401,10 +404,11 @@ def _read(filepath_or_buffer, kwds): | |
encoding = re.sub('_', '-', encoding).lower() | ||
kwds['encoding'] = encoding | ||
|
||
session = kwds.get('session', None) | ||
compression = kwds.get('compression') | ||
compression = _infer_compression(filepath_or_buffer, compression) | ||
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( | ||
filepath_or_buffer, encoding, compression) | ||
filepath_or_buffer, encoding, compression, session=session) | ||
kwds['compression'] = compression | ||
|
||
if kwds.get('date_parser', None) is not None: | ||
|
@@ -590,7 +594,8 @@ def parser_f(filepath_or_buffer, | |
delim_whitespace=False, | ||
low_memory=_c_parser_defaults['low_memory'], | ||
memory_map=False, | ||
float_precision=None): | ||
float_precision=None, | ||
session=None): | ||
|
||
# deprecate read_table GH21948 | ||
if name == "read_table": | ||
|
@@ -690,7 +695,8 @@ def parser_f(filepath_or_buffer, | |
mangle_dupe_cols=mangle_dupe_cols, | ||
tupleize_cols=tupleize_cols, | ||
infer_datetime_format=infer_datetime_format, | ||
skip_blank_lines=skip_blank_lines) | ||
skip_blank_lines=skip_blank_lines, | ||
session=session) | ||
|
||
return _read(filepath_or_buffer, kwds) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure this workaround is used anywhere.
Maybe we can remove this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would not remove this for now. Investigate in another PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NP. I'll leave it there. Just renamed the
urlopen
variants to try to make it a little bit clearer about what is going on.