
wrap urlopen with requests #21504


Closed · wants to merge 6 commits
53 changes: 41 additions & 12 deletions pandas/io/common.py
@@ -30,14 +30,13 @@

 if compat.PY3:
     from urllib.request import urlopen, pathname2url
-    _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
                               urlencode, urljoin)
     from urllib.error import URLError
     from http.client import HTTPException  # noqa
 else:
-    from urllib2 import urlopen as _urlopen
+    from urllib2 import urlopen as urlopen2
     from urllib import urlencode, pathname2url  # noqa
     from urlparse import urlparse as parse_url
     from urlparse import uses_relative, uses_netloc, uses_params, urljoin
@@ -46,10 +45,10 @@
     from contextlib import contextmanager, closing  # noqa
     from functools import wraps  # noqa

-    # @wraps(_urlopen)
+    # @wraps(urlopen2)
     @contextmanager
     def urlopen(*args, **kwargs):
-        with closing(_urlopen(*args, **kwargs)) as f:
+        with closing(urlopen2(*args, **kwargs)) as f:
Author: I'm not sure this workaround is used anywhere. Maybe we can remove it?

Member: I would not remove this for now. Investigate in another PR.

Author: NP, I'll leave it there. I just renamed the urlopen variants to try to make it a little clearer what is going on.

             yield f

@@ -91,6 +90,34 @@ def _is_url(url):
     return False


+def _urlopen(url, session=None):
+    compression = None
+    content_encoding = None
+    try:
+        import requests
Author (@ocefpaf, Jun 15, 2018): There is no need to check if requests is installed when using a session object, b/c requests is needed to create the object. This is slightly simpler than the approach in #17087 and introduces less code to maintain.

+        if session:
+            if not isinstance(session, requests.sessions.Session):
+                raise ValueError(
+                    'Expected a requests.sessions.Session object, '
+                    'got {!r}'.format(session)
+                )
+            r = session.get(url)
+        else:
+            r = requests.get(url)
+        r.raise_for_status()
+        content = r.content
+        r.close()
+    except ImportError:
+        with urlopen(url) as r:
+            content = r.read()
+            content_encoding = r.headers.get('Content-Encoding', None)
+    if content_encoding == 'gzip':
+        # Override compression based on Content-Encoding header.
+        compression = 'gzip'
+    reader = BytesIO(content)
+    return reader, compression
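For orientation, a hedged usage sketch of the helper above (assumes the requests library is installed; the URL and header value are placeholders, and _urlopen is the private helper this PR adds):

    import requests
    from pandas.io.common import _urlopen

    # Configure auth/headers/retries once on the session and reuse it
    # for every remote read.
    session = requests.Session()
    session.headers.update({'User-Agent': 'pandas-reader'})

    reader, compression = _urlopen('https://example.com/data.csv',
                                   session=session)
    # reader is a BytesIO with the response body; compression is 'gzip'
    # only when the urllib fallback saw a gzip Content-Encoding header
    # (requests decodes gzip transparently, so that path leaves it None).
    print(reader.read(80))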


 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
     replaced by that user's home directory.
@@ -177,7 +204,7 @@ def is_gcs_url(url):


 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None, mode=None):
+                           compression=None, mode=None, session=None):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -188,6 +215,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         or buffer
     encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
     mode : str, optional
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        `filepath_or_buffer` is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        decompression). If using 'zip', the ZIP file must contain only one
+        data file to be read in. Set to None for no decompression.
+
+        .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.

     Returns
     -------
@@ -199,13 +234,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     filepath_or_buffer = _stringify_path(filepath_or_buffer)

     if _is_url(filepath_or_buffer):
-        req = _urlopen(filepath_or_buffer)
-        content_encoding = req.headers.get('Content-Encoding', None)
-        if content_encoding == 'gzip':
-            # Override compression based on Content-Encoding header
-            compression = 'gzip'
-        reader = BytesIO(req.read())
-        req.close()
+        reader, compression = _urlopen(filepath_or_buffer, session=session)
Author: compression is not mentioned in the docs and I wonder if session should be. If so, we can use the template/Appender approach; I'm just not sure how to do that in a clean way.

Member:

  1. compression should be mentioned in the docs. Would be great to add that.
  2. Given my first point, session should be as well.

And yes, the template / Appender approach sounds like a good idea!

Author (@ocefpaf, Jun 18, 2018): I need some guidance on how to do this when crossing modules: common.py for get_filepath_or_buffer, and parsers.py for all the parsers defined in _parser_params.

The current situation is:

  option               get_filepath_or_buffer       _parser_params
  filepath_or_buffer   present                      NA
  mode                 present                      NA
  encoding             present                      present
  compression          missing (added in this PR)   present
  session              missing                      present

It is unclear to me how to fix this. Some questions I have:

  • Should the compression and encoding docstrings be standardized?
  • If yes, what is the best strategy for this? Should encoding, compression, and session be defined in common.py and imported in parsers.py for appending to _parser_params, or the other way around?

My guess is that I should pull the common options out of parsers.py and add them to common.py, like:

_common_params = r"""
Parameters
----------
encoding : str, default None
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_
session : requests.Session
    A requests.Session object holding the session configuration for the
    remote file (requires the requests library).
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.

    .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
"""

Then compose get_filepath_or_buffer's docstring from _common_params plus the missing filepath_or_buffer and mode entries, and later import _common_params in parsers.py to compose _parser_params. Does that sound OK?

PS: if that is correct I'd prefer if a pandas doc expert did this instead of me; I'm kind of lost in the docstrings and Appenders 😄
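A hedged sketch of how that composition could look with pandas' internal Appender decorator (pandas.util._decorators.Appender is existing pandas machinery; _common_params and the composed docstring are only the proposal above, not existing API):

    from pandas.util._decorators import Appender

    # common.py -- shared parameter descriptions (proposal)
    _common_params = """\
    encoding : str, default None
        Encoding to use for UTF when reading/writing (ex. 'utf-8').
    session : requests.Session
        A configured requests session for remote files
        (requires the requests library).
    """

    # Appender concatenates the addendum onto the function's __doc__,
    # so the shared entries land at the end of the Parameters section.
    @Appender(_common_params)
    def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                               compression=None, mode=None, session=None):
        """
        If the filepath_or_buffer is a url, translate and return the buffer.

        Parameters
        ----------
        filepath_or_buffer : a url, filepath, or buffer
        mode : str, optional
        """
        ...

    # parsers.py could then import _common_params from pandas.io.common
    # and splice it into _parser_params the same way.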

         return reader, encoding, compression, True

     if is_s3_url(filepath_or_buffer):
6 changes: 4 additions & 2 deletions pandas/io/excel.py
@@ -332,7 +332,8 @@ def read_excel(io,
                          "`sheet`")

     if not isinstance(io, ExcelFile):
-        io = ExcelFile(io, engine=engine)
+        session = kwds.get('session', None)
Contributor: Just list session as a kwarg in read_excel (and in ExcelFile), then pass it in.

+        io = ExcelFile(io, engine=engine, session=session)

     return io.parse(
         sheet_name=sheet_name,
@@ -396,10 +397,11 @@ def __init__(self, io, **kwds):
         if engine is not None and engine != 'xlrd':
             raise ValueError("Unknown engine: {engine}".format(engine=engine))

+        session = kwds.pop('session', None)
         # If io is a url, want to keep the data as bytes so can't pass
         # to get_filepath_or_buffer()
         if _is_url(self._io):
-            io = _urlopen(self._io)
+            io, _ = _urlopen(self._io, session=session)
         elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
             io, _, _, _ = get_filepath_or_buffer(self._io)
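A hedged sketch of the resulting call (placeholder URL and credentials; the session kwarg is what this PR proposes, not released pandas API):

    import requests
    import pandas as pd

    session = requests.Session()
    session.auth = ('user', 'password')  # e.g. a workbook behind basic auth

    # session travels from read_excel into ExcelFile and then _urlopen()
    df = pd.read_excel('https://example.com/report.xlsx', session=session)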
24 changes: 13 additions & 11 deletions pandas/io/html.py
@@ -15,10 +15,9 @@
 from pandas.errors import AbstractMethodError, EmptyDataError

 from pandas.core.dtypes.common import is_list_like
-
 from pandas import Series

-from pandas.io.common import _is_url, _validate_header_arg, urlopen
+from pandas.io.common import _is_url, _urlopen, _validate_header_arg, urlopen
 from pandas.io.formats.printing import pprint_thing
 from pandas.io.parsers import TextParser
@@ -115,7 +114,7 @@ def _get_skiprows(skiprows):
                          type(skiprows).__name__)


-def _read(obj):
+def _read(obj, session=None):
     """Try to read from a url, file or string.

     Parameters
@@ -127,8 +126,7 @@ def _read(obj):
     raw_text : str
     """
     if _is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
+        text, _ = _urlopen(obj, session=session)
     elif hasattr(obj, 'read'):
         text = obj.read()
     elif isinstance(obj, char_types):
@@ -203,12 +201,14 @@ class _HtmlFrameParser(object):
     functionality.
     """

-    def __init__(self, io, match, attrs, encoding, displayed_only):
+    def __init__(self, io, match, attrs, encoding, displayed_only,
+                 session=None):
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.session = session

     def parse_tables(self):
         """
@@ -592,7 +592,7 @@ def _parse_tfoot_tr(self, table):
         return table.select('tfoot tr')

     def _setup_build_doc(self):
-        raw_text = _read(self.io)
+        raw_text = _read(self.io, self.session)
         if not raw_text:
             raise ValueError('No text parsed from document: {doc}'
                              .format(doc=self.io))
@@ -715,7 +715,7 @@ def _build_doc(self):

         try:
             if _is_url(self.io):
-                with urlopen(self.io) as f:
+                with _urlopen(self.io) as f:
                     r = parse(f, parser=parser)
             else:
                 # try to parse the input in the simplest way
@@ -890,9 +890,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):

     # hack around python 3 deleting the exception variable
     retained = None
+    session = kwargs.get('session', None)
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only,
+                   session)

         try:
             tables = p.parse_tables()
@@ -928,7 +930,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
               tupleize_cols=None, thousands=',', encoding=None,
               decimal='.', converters=None, na_values=None,
-              keep_default_na=True, displayed_only=True):
+              keep_default_na=True, displayed_only=True, session=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

     Parameters
@@ -1091,4 +1093,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         thousands=thousands, attrs=attrs, encoding=encoding,
         decimal=decimal, converters=converters, na_values=na_values,
         keep_default_na=keep_default_na,
-        displayed_only=displayed_only)
+        displayed_only=displayed_only, session=session)
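A hedged end-to-end sketch of the read_html path (placeholder URL and token; again, the session kwarg is only proposed in this PR):

    import requests
    import pandas as pd

    session = requests.Session()
    session.headers['Authorization'] = 'Bearer <token>'  # placeholder token

    # session flows read_html -> _parse -> flavor parser -> _read -> _urlopen
    tables = pd.read_html('https://example.com/stats.html', session=session)
    print(len(tables), tables[0].head())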
3 changes: 2 additions & 1 deletion pandas/io/json/json.py
@@ -228,7 +228,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii,
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None, compression='infer'):
+              lines=False, chunksize=None, compression='infer', session=None):
     """
     Convert a JSON string to pandas object

@@ -410,6 +410,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     compression = _infer_compression(path_or_buf, compression)
     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
         path_or_buf, encoding=encoding, compression=compression,
+        session=session,
Contributor: Add a versionadded tag when you add session.

     )

     json_reader = JsonReader(
12 changes: 9 additions & 3 deletions pandas/io/parsers.py
@@ -319,6 +319,9 @@
     values. The options are `None` for the ordinary converter,
     `high` for the high-precision converter, and `round_trip` for the
     round-trip converter.
+session : requests.Session
+    A requests.Session object holding the session configuration for the
+    remote file (requires the requests library).
Contributor: Add a versionadded tag.
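The requested tag would slot into the docstring entry above; a sketch (the version number is a placeholder, since the target release isn't stated in this thread):

    session : requests.Session
        A requests.Session object holding the session configuration for the
        remote file (requires the requests library).

        .. versionadded:: 0.24.0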


 Returns
 -------
@@ -401,10 +404,11 @@ def _read(filepath_or_buffer, kwds):
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding

+    session = kwds.get('session', None)
     compression = kwds.get('compression')
     compression = _infer_compression(filepath_or_buffer, compression)
     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression)
+        filepath_or_buffer, encoding, compression, session=session)
     kwds['compression'] = compression

     if kwds.get('date_parser', None) is not None:
@@ -590,7 +594,8 @@ def parser_f(filepath_or_buffer,
                  delim_whitespace=False,
                  low_memory=_c_parser_defaults['low_memory'],
                  memory_map=False,
-                 float_precision=None):
+                 float_precision=None,
+                 session=None):

     # deprecate read_table GH21948
     if name == "read_table":
@@ -690,7 +695,8 @@ def parser_f(filepath_or_buffer,
                     mangle_dupe_cols=mangle_dupe_cols,
                     tupleize_cols=tupleize_cols,
                     infer_datetime_format=infer_datetime_format,
-                    skip_blank_lines=skip_blank_lines)
+                    skip_blank_lines=skip_blank_lines,
+                    session=session)

         return _read(filepath_or_buffer, kwds)

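Taken together, a hedged sketch of how the read_csv entry point would exercise the new kwarg (placeholder URL, token, and CA bundle path; session is only proposed here, not released pandas API):

    import requests
    import pandas as pd

    session = requests.Session()
    session.headers['Authorization'] = 'Bearer <token>'  # placeholder
    session.verify = '/path/to/ca-bundle.pem'            # placeholder CA bundle

    # parser_f collects session into kwds; _read() forwards it to
    # get_filepath_or_buffer(), which hands it to _urlopen().
    df = pd.read_csv('https://example.com/protected/data.csv', session=session)
    print(df.head())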