pandas-dev · skynss · Dec 5, 2017 · Dec 5, 2017 · Dec 5, 2017 · jreback
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -63,6 +63,56 @@ levels <merging.merge_on_columns_and_levels>` documentation section.
    left.merge(right, on=['key1', 'key2'])
 
 
+.. _whatsnew_0220.enhancements.read_csv:
+
+``read_csv`` use `python-requests` (if installed) to support basic auth and much more
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When ``http_params`` is supplied and the `python-requests` library is installed, http(s) URLs
+are fetched with it. Without ``http_params`` pandas continues to use ``urllib``; passing
+``http_params`` when `python-requests` is not installed raises ``ValueError``.
+:func:`read_csv`, :func:`read_html`, :func:`read_json` and :func:`read_excel` now accept an
+optional ``http_params`` argument to pass in parameters for basic auth, to disable the strict
+SSL check, or even a ``requests.Session()`` object.
+
+
+.. code-block:: python
+
+  import pandas as pd
+
+  # http_params is an optional parameter. If it is non-empty, pandas attempts to use the python-requests library
+  df = pd.read_csv('https://uname:pwd@aa.com/bb.csv', http_params={'auth': None})  # now url can contain username and pwd
+  # Note - all basic auth scenarios require the python-requests library
+
+  # Basic Auth
+  df = pd.read_csv('https://aa.com/bb.csv', http_params={'auth': ('john', 'pwd')})  # credentials are passed via http_params instead of the url
+
+  # Basic Auth and disabled verification of the SSL certificate, e.g. for testing
+  up = {'auth': ('john', 'pwd'), 'verify': False}
+  df = pd.read_csv('https://aa.com/bb.csv', http_params=up)  # credentials and the verify flag are passed via http_params
+
+  # Optionally, A requests.Session() can also be passed into http_params
+  import requests
+  s = requests.Session()
+  s.auth = MyAuthProvider('secret-key') # custom auth provider supported by requests
+  df = pd.read_csv(url, http_params=s)
+
+  # For advanced users, this may provide extensibility. However, testing on pandas side is limited to basic scenarios
+  # here is an example of advanced scenario
+  s = requests.Session()
+  s.auth = ('darth', 'l0rd')  # if user wants to perform basic auth; skip if url itself contains username and pwd
+  s.timeout = (3.05, 27)                           # if user wants to modify timeout
+  s.verify = False                                      # if user wants to disable ssl cert verification
+  s.headers.update( {'User-Agent': 'Custom user agent'} )  # extensible to set any custom header needed
+  s.proxies = { 'http': 'http://a.com:100'}  # if user has proxies 
+  s.cert = '/path/client.cert'                     # if custom cert is needed
+  df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
+
+  def print_http_status(r, *args, **kwargs):
+      print(r.status_code)
+      print(r.headers['Content-Length'])
+  s = requests.Session()
+  s.hooks = dict(response=print_http_status)
+  df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
+
+
 .. _whatsnew_0220.enhancements.other:
 
 Other Enhancements

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -28,6 +28,13 @@
 ])
 
 
+try:
+    import requests
+    _REQUESTS_INSTALLED = True
+except ImportError:
+    _REQUESTS_INSTALLED = False
+
+
 if compat.PY3:
     from urllib.request import urlopen, pathname2url
     _urlopen = urlopen
@@ -168,8 +175,87 @@ def _stringify_path(filepath_or_buffer):
     return filepath_or_buffer
 
 
+def _is_handled_by_requests(o):
+    """
+    Tell whether ``o`` is a url with an ``http`` or ``https`` scheme.
+
+    When ``_is_url(o)`` is falsy that result is returned unchanged;
+    otherwise a bool is returned indicating whether the parsed scheme
+    is one of ``http`` / ``https``.
+    """
+    url_check = _is_url(o)
+    if not url_check:
+        return url_check
+    return parse_url(o).scheme in ('http', 'https')
+
+
+def gen_session(http_params):
+    """
+    Generate a python-requests session from http_params
+
+    Parameters
+    ----------
+    http_params : dict or requests.Session, default None
+        A ``requests.Session`` instance (including subclasses) is returned
+        unchanged. A dict may contain:
+            'auth': value assigned to the session's ``auth`` when truthy
+            'verify': when exactly ``False``, the session's ``verify``
+                is set to ``False``
+        Any other value yields a fresh session with default settings.
+
+    Returns
+    -------
+    requests.Session
+    """
+    # isinstance (rather than an exact type comparison) so that Session
+    # subclasses supplied by the caller are honoured instead of being
+    # silently replaced by a fresh, unauthenticated session.
+    if isinstance(http_params, requests.Session):
+        return http_params
+
+    s = requests.Session()
+    s.stream = True
+    # Setting accept-encoding to None for backwards compatibility with
+    # urlopen. ideally we want to allow gzip download
+    # urlopen doesn't decompress automatically, requests does.
+    s.headers.update({'Accept-Encoding': None})
+    if isinstance(http_params, dict):
+        auth = http_params.get('auth')
+        if auth:
+            s.auth = auth
+        # Only an explicit ``False`` disables verification; other falsy
+        # values (None, 0, '') leave the session default in place.
+        if http_params.get('verify', True) is False:
+            s.verify = False
+    return s
+
+
+def fetch_url(url, http_params=None, skip_requests=False):
+    """
+    Fetch ``url`` and return the response object together with its bytes.
+
+    python-requests is used only when ``http_params`` is non-empty,
+    ``skip_requests`` is False, the library is installed and the url
+    scheme is http or https. Every other case is served by urllib.
+    Sessions built by ``gen_session`` from a dict do not send an
+    Accept-Encoding header, for backwards compatibility with urlopen.
+
+    Parameters
+    ----------
+    url : str
+        Could be:
+            'http://cnn.com'
+            'file:///home/sky/aaa.csv'
+
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL cert for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
+    skip_requests : boolean, default False
+        for testing - disable `requests` library. Internal use only
+
+        .. versionadded:: 0.22.0
+
+    Returns
+    -------
+    tuple of (response, bytes)
+        The response object (from requests or from urlopen) and the full
+        body read from it.
+
+    Raises
+    ------
+    ValueError if http_params is specified while requests is skipped or
+    the python-requests pkg is not installed
+    """
+    if http_params:
+        if skip_requests or not _REQUESTS_INSTALLED:
+            msg = ('To utilize http_params, python-requests library is '
+                   'required but not detected')
+            raise ValueError(msg)
+        if _is_handled_by_requests(url):
+            session = gen_session(http_params)
+            resp = session.get(url)
+            # surface unsuccessful HTTP responses as exceptions
+            resp.raise_for_status()
+            return resp, resp.content
+
+    # No http_params, or a url scheme requests does not handle: use urllib
+    resp = _urlopen(url)
+    return resp, resp.read()
+
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None):
+                           compression=None, http_params=None,
+                           skip_requests=False):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -180,19 +266,45 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                          or buffer
     encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
 
+    compression : str, default None
+        indicate the compression such as 'gzip'.
+
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (uname, pwd)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL cert for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
+   skip_requests : boolean, default False
+       for testing - disable `requests` library Internal use only
+
+        .. versionadded:: 0.22.0
+
     Returns
     -------
     a filepath_or_buffer, the encoding, the compression
+
+    Raises
+    ------
+    ValueError if http_params specified without installed python-requests pkg
     """
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
 
     if _is_url(filepath_or_buffer):
-        req = _urlopen(filepath_or_buffer)
+        req, content_bytes = fetch_url(filepath_or_buffer,
+                                       http_params,
+                                       skip_requests)
+        reader = BytesIO(content_bytes)
         content_encoding = req.headers.get('Content-Encoding', None)
         if content_encoding == 'gzip':
             # Override compression based on Content-Encoding header
             compression = 'gzip'
-        reader = BytesIO(req.read())
         return reader, encoding, compression
 
     if _is_s3_url(filepath_or_buffer):

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -15,10 +15,11 @@
     is_integer, is_float,
     is_bool, is_list_like)
 
+from pandas.compat import BytesIO
 from pandas.core.frame import DataFrame
 from pandas.io.parsers import TextParser
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
+from pandas.io.common import (_is_url, fetch_url, _validate_header_arg,
                               get_filepath_or_buffer, _NA_VALUES,
                               _stringify_path)
 from pandas.core.indexes.period import Period
@@ -148,6 +149,19 @@
     data will be read in as floats: Excel stores all numbers as floats
     internally
 
+http_params : dict or requests.Session(), default None
+    A python dict containing:
+        'auth': tuple (str, str) eg (uname, pwd)
+        'auth': Any other auth object accepted by requests
+        'verify': boolean, Default True
+             If False, allow self signed and invalid SSL certs for https
+    or
+    A python requests.Session object if http(s) path to enable basic auth
+    and many other scenarios that requests allows
+
+    .. versionadded:: 0.22.0
+
+
 Returns
 -------
 parsed : DataFrame or Dict of DataFrames
@@ -199,7 +213,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
                convert_float=True, converters=None, dtype=None,
                true_values=None, false_values=None, engine=None,
                squeeze=False, **kwds):
-
     # Can't use _deprecate_kwarg since sheetname=None has a special meaning
     if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
         warnings.warn("The `sheetname` keyword is deprecated, use "
@@ -210,7 +223,10 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
                         "Use just `sheet_name`")
 
     if not isinstance(io, ExcelFile):
-        io = ExcelFile(io, engine=engine)
+        ukwds = {}
+        if kwds.get('http_params', None) is not None:
+            ukwds['http_params'] = kwds.get('http_params')
+        io = ExcelFile(io, engine=engine, **ukwds)
 
     return io._parse_excel(
         sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
@@ -263,7 +279,9 @@ def __init__(self, io, **kwds):
         # If io is a url, want to keep the data as bytes so can't pass
         # to get_filepath_or_buffer()
         if _is_url(self._io):
-            io = _urlopen(self._io)
+            hp = kwds.get('http_params', None)
+            req, content = fetch_url(self._io, http_params=hp)
+            io = BytesIO(content)
         elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
             io, _, _ = get_filepath_or_buffer(self._io)