
wrap urlopen with requests #21504


Closed · wants to merge 6 commits
53 changes: 41 additions & 12 deletions pandas/io/common.py
@@ -30,14 +30,13 @@

 if compat.PY3:
     from urllib.request import urlopen, pathname2url
-    _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
                               urlencode, urljoin)
     from urllib.error import URLError
     from http.client import HTTPException  # noqa
 else:
-    from urllib2 import urlopen as _urlopen
+    from urllib2 import urlopen as urlopen2
     from urllib import urlencode, pathname2url  # noqa
     from urlparse import urlparse as parse_url
     from urlparse import uses_relative, uses_netloc, uses_params, urljoin
@@ -46,10 +45,10 @@
     from contextlib import contextmanager, closing  # noqa
     from functools import wraps  # noqa

-    # @wraps(_urlopen)
+    # @wraps(urlopen2)
     @contextmanager
     def urlopen(*args, **kwargs):
-        with closing(_urlopen(*args, **kwargs)) as f:
+        with closing(urlopen2(*args, **kwargs)) as f:
Author: I'm not sure this workaround is used anywhere. Maybe we can remove it?

Member: I would not remove this for now. Investigate in another PR.

Author: NP, I'll leave it there. I just renamed the urlopen variants to try to make it a little clearer what is going on.

             yield f

@@ -91,6 +90,34 @@ def _is_url(url):
     return False


+def _urlopen(url, session=None):
+    compression = None
+    content_encoding = None
+    try:
+        import requests
Author (@ocefpaf, Jun 15, 2018): There is no need to check if requests is installed when using a session object, b/c requests is needed to create the object. This is slightly simpler than the approach in #17087 and introduces less code to maintain.

+        if session:
+            if not isinstance(session, requests.sessions.Session):
+                raise ValueError(
+                    'Expected a requests.sessions.Session object, '
+                    'got {!r}'.format(session)
+                )
+            r = session.get(url)
+        else:
+            r = requests.get(url)
+        r.raise_for_status()
+        content = r.content
+        r.close()
+    except ImportError:
+        with urlopen(url) as r:
+            content = r.read()
+            content_encoding = r.headers.get('Content-Encoding', None)
+    if content_encoding == 'gzip':
+        # Override compression based on Content-Encoding header.
+        compression = 'gzip'
+    reader = BytesIO(content)
+    return reader, compression
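For orientation, a hedged usage sketch of the helper above (assumes the requests library is installed; the URL and header value are placeholders, and _urlopen is the private helper this PR adds):

    import requests
    from pandas.io.common import _urlopen

    # Configure auth/headers/retries once on the session and reuse it
    # for every remote read.
    session = requests.Session()
    session.headers.update({'User-Agent': 'pandas-reader'})

    reader, compression = _urlopen('https://example.com/data.csv',
                                   session=session)
    # reader is a BytesIO with the response body; compression is 'gzip'
    # only when the urllib fallback saw a gzip Content-Encoding header
    # (requests decodes gzip transparently, so that path leaves it None).
    print(reader.read(80))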


 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
     replaced by that user's home directory.
@@ -177,7 +204,7 @@ def is_gcs_url(url):


 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None, mode=None):
+                           compression=None, mode=None, session=None):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -188,6 +215,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         or buffer
     encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
     mode : str, optional
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        `filepath_or_buffer` is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
+        decompression). If using 'zip', the ZIP file must contain only one
+        data file to be read in. Set to None for no decompression.
+
+        .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.

     Returns
     -------
@@ -199,13 +234,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     filepath_or_buffer = _stringify_path(filepath_or_buffer)

     if _is_url(filepath_or_buffer):
-        req = _urlopen(filepath_or_buffer)
-        content_encoding = req.headers.get('Content-Encoding', None)
-        if content_encoding == 'gzip':
-            # Override compression based on Content-Encoding header
-            compression = 'gzip'
-        reader = BytesIO(req.read())
-        req.close()
+        reader, compression = _urlopen(filepath_or_buffer, session=session)
Author: compression is not mentioned in the docs and I wonder if session should be. If so, we can use the template/Appender approach; I'm just not sure how to do that in a clean way.

Member:

  1. compression should be mentioned in the docs. Would be great to add that.
  2. Given my first point, session should be as well.

And yes, the template / Appender approach sounds like a good idea!

Author (@ocefpaf, Jun 18, 2018): I need some guidance on how to do this when crossing modules: common.py for get_filepath_or_buffer, and parsers.py for all the parsers defined in _parser_params.

The current situation is:

  option               get_filepath_or_buffer       _parser_params
  filepath_or_buffer   present                      NA
  mode                 present                      NA
  encoding             present                      present
  compression          missing (added in this PR)   present
  session              missing                      present

It is unclear to me how to fix this. Some questions I have:

  • Should the compression and encoding docstrings be standardized?
  • If yes, what is the best strategy for this? Should encoding, compression, and session be defined in common.py and imported in parsers.py for appending to _parser_params, or the other way around?

My guess is that I should pull the common options out of parsers.py and add them to common.py, like:

_common_params = r"""
Parameters
----------
encoding : str, default None
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_
session : requests.Session
    A requests.Session object holding the session configuration for the
    remote file (requires the requests library).
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.

    .. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
"""

Then compose get_filepath_or_buffer's docstring from _common_params plus the missing filepath_or_buffer and mode entries, and later import _common_params in parsers.py to compose _parser_params. Does that sound OK?

PS: if that is correct I'd prefer if a pandas doc expert did this instead of me; I'm kind of lost in the docstrings and Appenders 😄
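A hedged sketch of how that composition could look with pandas' internal Appender decorator (pandas.util._decorators.Appender is existing pandas machinery; _common_params and the composed docstring are only the proposal above, not existing API):

    from pandas.util._decorators import Appender

    # common.py -- shared parameter descriptions (proposal)
    _common_params = """\
    encoding : str, default None
        Encoding to use for UTF when reading/writing (ex. 'utf-8').
    session : requests.Session
        A configured requests session for remote files
        (requires the requests library).
    """

    # Appender concatenates the addendum onto the function's __doc__,
    # so the shared entries land at the end of the Parameters section.
    @Appender(_common_params)
    def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                               compression=None, mode=None, session=None):
        """
        If the filepath_or_buffer is a url, translate and return the buffer.

        Parameters
        ----------
        filepath_or_buffer : a url, filepath, or buffer
        mode : str, optional
        """
        ...

    # parsers.py could then import _common_params from pandas.io.common
    # and splice it into _parser_params the same way.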

         return reader, encoding, compression, True

     if is_s3_url(filepath_or_buffer):
6 changes: 4 additions & 2 deletions pandas/io/excel.py
@@ -332,7 +332,8 @@ def read_excel(io,
                          "`sheet`")

     if not isinstance(io, ExcelFile):
-        io = ExcelFile(io, engine=engine)
+        session = kwds.get('session', None)
Contributor: Just list session as a kwarg in read_excel (and in ExcelFile), then pass it in.

+        io = ExcelFile(io, engine=engine, session=session)

     return io.parse(
         sheet_name=sheet_name,
@@ -396,10 +397,11 @@ def __init__(self, io, **kwds):
         if engine is not None and engine != 'xlrd':
             raise ValueError("Unknown engine: {engine}".format(engine=engine))

+        session = kwds.pop('session', None)
         # If io is a url, want to keep the data as bytes so can't pass
         # to get_filepath_or_buffer()
         if _is_url(self._io):
-            io = _urlopen(self._io)
+            io, _ = _urlopen(self._io, session=session)
         elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
             io, _, _, _ = get_filepath_or_buffer(self._io)
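A hedged sketch of the resulting call (placeholder URL and credentials; the session kwarg is what this PR proposes, not released pandas API):

    import requests
    import pandas as pd

    session = requests.Session()
    session.auth = ('user', 'password')  # e.g. a workbook behind basic auth

    # session travels from read_excel into ExcelFile and then _urlopen()
    df = pd.read_excel('https://example.com/report.xlsx', session=session)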
24 changes: 13 additions & 11 deletions pandas/io/html.py
@@ -15,10 +15,9 @@
 from pandas.errors import AbstractMethodError, EmptyDataError

 from pandas.core.dtypes.common import is_list_like
-
 from pandas import Series

-from pandas.io.common import _is_url, _validate_header_arg, urlopen
+from pandas.io.common import _is_url, _urlopen, _validate_header_arg, urlopen
 from pandas.io.formats.printing import pprint_thing
 from pandas.io.parsers import TextParser
@@ -115,7 +114,7 @@ def _get_skiprows(skiprows):
                          type(skiprows).__name__)


-def _read(obj):
+def _read(obj, session=None):
     """Try to read from a url, file or string.

     Parameters
@@ -127,8 +126,7 @@ def _read(obj):
     raw_text : str
     """
     if _is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
+        text, _ = _urlopen(obj, session=session)
     elif hasattr(obj, 'read'):
         text = obj.read()
     elif isinstance(obj, char_types):
@@ -203,12 +201,14 @@ class _HtmlFrameParser(object):
     functionality.
     """

-    def __init__(self, io, match, attrs, encoding, displayed_only):
+    def __init__(self, io, match, attrs, encoding, displayed_only,
+                 session=None):
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.session = session

     def parse_tables(self):
         """
@@ -592,7 +592,7 @@ def _parse_tfoot_tr(self, table):
         return table.select('tfoot tr')

     def _setup_build_doc(self):
-        raw_text = _read(self.io)
+        raw_text = _read(self.io, self.session)
         if not raw_text:
             raise ValueError('No text parsed from document: {doc}'
                              .format(doc=self.io))
@@ -715,7 +715,7 @@ def _build_doc(self):

         try:
             if _is_url(self.io):
-                with urlopen(self.io) as f:
+                with _urlopen(self.io) as f:
                     r = parse(f, parser=parser)
             else:
                 # try to parse the input in the simplest way
@@ -890,9 +890,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):

     # hack around python 3 deleting the exception variable
     retained = None
+    session = kwargs.get('session', None)
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only,
+                   session)

         try:
             tables = p.parse_tables()
@@ -928,7 +930,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
               tupleize_cols=None, thousands=',', encoding=None,
               decimal='.', converters=None, na_values=None,
-              keep_default_na=True, displayed_only=True):
+              keep_default_na=True, displayed_only=True, session=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

     Parameters
@@ -1091,4 +1093,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         thousands=thousands, attrs=attrs, encoding=encoding,
         decimal=decimal, converters=converters, na_values=na_values,
         keep_default_na=keep_default_na,
-        displayed_only=displayed_only)
+        displayed_only=displayed_only, session=session)
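A hedged end-to-end sketch of the read_html path (placeholder URL and token; again, the session kwarg is only proposed in this PR):

    import requests
    import pandas as pd

    session = requests.Session()
    session.headers['Authorization'] = 'Bearer <token>'  # placeholder token

    # session flows read_html -> _parse -> flavor parser -> _read -> _urlopen
    tables = pd.read_html('https://example.com/stats.html', session=session)
    print(len(tables), tables[0].head())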
3 changes: 2 additions & 1 deletion pandas/io/json/json.py
@@ -228,7 +228,7 @@ def _write(self, obj, orient, double_precision, ensure_ascii,
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None, compression='infer'):
+              lines=False, chunksize=None, compression='infer', session=None):
     """
     Convert a JSON string to pandas object

@@ -410,6 +410,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     compression = _infer_compression(path_or_buf, compression)
     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
         path_or_buf, encoding=encoding, compression=compression,
+        session=session,
Contributor: Add a versionadded tag when you add session.

     )

     json_reader = JsonReader(
12 changes: 9 additions & 3 deletions pandas/io/parsers.py
@@ -319,6 +319,9 @@
     values. The options are `None` for the ordinary converter,
     `high` for the high-precision converter, and `round_trip` for the
     round-trip converter.
+session : requests.Session
+    A requests.Session object holding the session configuration for the
+    remote file (requires the requests library).
Contributor: Add a versionadded tag.
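The requested tag would slot into the docstring entry above; a sketch (the version number is a placeholder, since the target release isn't stated in this thread):

    session : requests.Session
        A requests.Session object holding the session configuration for the
        remote file (requires the requests library).

        .. versionadded:: 0.24.0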


 Returns
 -------
@@ -401,10 +404,11 @@ def _read(filepath_or_buffer, kwds):
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding

+    session = kwds.get('session', None)
     compression = kwds.get('compression')
     compression = _infer_compression(filepath_or_buffer, compression)
     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression)
+        filepath_or_buffer, encoding, compression, session=session)
     kwds['compression'] = compression

     if kwds.get('date_parser', None) is not None:
@@ -590,7 +594,8 @@ def parser_f(filepath_or_buffer,
                  delim_whitespace=False,
                  low_memory=_c_parser_defaults['low_memory'],
                  memory_map=False,
-                 float_precision=None):
+                 float_precision=None,
+                 session=None):

     # deprecate read_table GH21948
     if name == "read_table":
@@ -690,7 +695,8 @@ def parser_f(filepath_or_buffer,
                     mangle_dupe_cols=mangle_dupe_cols,
                     tupleize_cols=tupleize_cols,
                     infer_datetime_format=infer_datetime_format,
-                    skip_blank_lines=skip_blank_lines)
+                    skip_blank_lines=skip_blank_lines,
+                    session=session)

         return _read(filepath_or_buffer, kwds)

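Taken together, a hedged sketch of how the read_csv entry point would exercise the new kwarg (placeholder URL, token, and CA bundle path; session is only proposed here, not released pandas API):

    import requests
    import pandas as pd

    session = requests.Session()
    session.headers['Authorization'] = 'Bearer <token>'  # placeholder
    session.verify = '/path/to/ca-bundle.pem'            # placeholder CA bundle

    # parser_f collects session into kwds; _read() forwards it to
    # get_filepath_or_buffer(), which hands it to _urlopen().
    df = pd.read_csv('https://example.com/protected/data.csv', session=session)
    print(df.head())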