diff --git a/pandas/io/common.py b/pandas/io/common.py index 3a67238a66450..22655ad86263e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -30,14 +30,13 @@ if compat.PY3: from urllib.request import urlopen, pathname2url - _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, urlencode, urljoin) from urllib.error import URLError from http.client import HTTPException # noqa else: - from urllib2 import urlopen as _urlopen + from urllib2 import urlopen as urlopen2 from urllib import urlencode, pathname2url # noqa from urlparse import urlparse as parse_url from urlparse import uses_relative, uses_netloc, uses_params, urljoin @@ -46,10 +45,10 @@ from contextlib import contextmanager, closing # noqa from functools import wraps # noqa - # @wraps(_urlopen) + # @wraps(urlopen2) @contextmanager def urlopen(*args, **kwargs): - with closing(_urlopen(*args, **kwargs)) as f: + with closing(urlopen2(*args, **kwargs)) as f: yield f @@ -91,6 +90,34 @@ def _is_url(url): return False +def _urlopen(url, session=None): + compression = None + content_encoding = None + try: + import requests + if session: + if not isinstance(session, requests.sessions.Session): + raise ValueError( + 'Expected a requests.sessions.Session object, ' + 'got {!r}'.format(session) + ) + r = session.get(url) + else: + r = requests.get(url) + r.raise_for_status() + content = r.content + r.close() + except ImportError: + with urlopen(url) as r: + content = r.read() + content_encoding = r.headers.get('Content-Encoding', None) + if content_encoding == 'gzip': + # Override compression based on Content-Encoding header. + compression = 'gzip' + reader = BytesIO(content) + return reader, compression + + def _expand_user(filepath_or_buffer): """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. 
@@ -177,7 +204,7 @@ def is_gcs_url(url): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): + compression=None, mode=None, session=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -188,6 +215,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, or buffer encoding : the encoding to use to decode py3 bytes, default is 'utf-8' mode : str, optional + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' + For on-the-fly decompression of on-disk data. If 'infer' and + `filepath_or_buffer` is path-like, then detect compression from the + following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + decompression). If using 'zip', the ZIP file must contain only one data + file to be read in. Set to None for no decompression. + + .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression. Returns ------- @@ -199,13 +234,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): - req = _urlopen(filepath_or_buffer) - content_encoding = req.headers.get('Content-Encoding', None) - if content_encoding == 'gzip': - # Override compression based on Content-Encoding header - compression = 'gzip' - reader = BytesIO(req.read()) - req.close() + reader, compression = _urlopen(filepath_or_buffer, session=session) return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 1328713736b03..fe1e256205744 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -332,7 +332,8 @@ def read_excel(io, "`sheet`") if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + session = kwds.get('session', None) + io = ExcelFile(io, engine=engine, session=session) return io.parse( sheet_name=sheet_name, @@ -396,10 +397,11 @@ def __init__(self, io, **kwds): if engine is 
not None and engine != 'xlrd': raise ValueError("Unknown engine: {engine}".format(engine=engine)) + session = kwds.pop('session', None) # If io is a url, want to keep the data as bytes so can't pass # to get_filepath_or_buffer() if _is_url(self._io): - io = _urlopen(self._io) + io, _ = _urlopen(self._io, session=session) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): io, _, _, _ = get_filepath_or_buffer(self._io) diff --git a/pandas/io/html.py b/pandas/io/html.py index c967bdd29df1f..8c31aac1f1563 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -15,10 +15,9 @@ from pandas.errors import AbstractMethodError, EmptyDataError from pandas.core.dtypes.common import is_list_like - from pandas import Series -from pandas.io.common import _is_url, _validate_header_arg, urlopen +from pandas.io.common import _is_url, _urlopen, _validate_header_arg, urlopen from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -115,7 +114,7 @@ def _get_skiprows(skiprows): type(skiprows).__name__) -def _read(obj): +def _read(obj, session=None): """Try to read from a url, file or string. Parameters @@ -127,8 +126,7 @@ def _read(obj): raw_text : str """ if _is_url(obj): - with urlopen(obj) as url: - text = url.read() + text, _ = _urlopen(obj, session=session) elif hasattr(obj, 'read'): text = obj.read() elif isinstance(obj, char_types): @@ -203,12 +201,14 @@ class _HtmlFrameParser(object): functionality. 
""" - def __init__(self, io, match, attrs, encoding, displayed_only): + def __init__(self, io, match, attrs, encoding, displayed_only, + session=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only + self.session = session def parse_tables(self): """ @@ -592,7 +592,7 @@ def _parse_tfoot_tr(self, table): return table.select('tfoot tr') def _setup_build_doc(self): - raw_text = _read(self.io) + raw_text = _read(self.io, self.session) if not raw_text: raise ValueError('No text parsed from document: {doc}' .format(doc=self.io)) @@ -715,7 +715,7 @@ def _build_doc(self): try: if _is_url(self.io): - with urlopen(self.io) as f: + with _urlopen(self.io) as f: r = parse(f, parser=parser) else: # try to parse the input in the simplest way @@ -890,9 +890,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): # hack around python 3 deleting the exception variable retained = None + session = kwargs.get('session', None) for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only) + p = parser(io, compiled_match, attrs, encoding, displayed_only, + session) try: tables = p.parse_tables() @@ -928,7 +930,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True, displayed_only=True): + keep_default_na=True, displayed_only=True, session=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters @@ -1091,4 +1093,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, keep_default_na=keep_default_na, - displayed_only=displayed_only) + displayed_only=displayed_only, session=session) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 38f8cd5412015..024af642a2222 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -228,7 +228,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii, def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False, chunksize=None, compression='infer'): + lines=False, chunksize=None, compression='infer', session=None): """ Convert a JSON string to pandas object @@ -410,6 +410,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, + session=session, ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index acb9bca2545c0..1f0e175d649b2 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -319,6 +319,9 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. +session : requests.Session + object with the requests session configuration for remote files. 
+ (requires the requests library) Returns ------- @@ -401,10 +404,11 @@ def _read(filepath_or_buffer, kwds): encoding = re.sub('_', '-', encoding).lower() kwds['encoding'] = encoding + session = kwds.get('session', None) compression = kwds.get('compression') compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) + filepath_or_buffer, encoding, compression, session=session) kwds['compression'] = compression if kwds.get('date_parser', None) is not None: @@ -590,7 +594,8 @@ def parser_f(filepath_or_buffer, delim_whitespace=False, low_memory=_c_parser_defaults['low_memory'], memory_map=False, - float_precision=None): + float_precision=None, + session=None): # deprecate read_table GH21948 if name == "read_table": @@ -690,7 +695,8 @@ def parser_f(filepath_or_buffer, mangle_dupe_cols=mangle_dupe_cols, tupleize_cols=tupleize_cols, infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines) + skip_blank_lines=skip_blank_lines, + session=session) return _read(filepath_or_buffer, kwds)