Rebase with pandas-master. Changing version to v0.22

Sky NSS · Sky NSS · commit cb66c6cef3d5 · 2017-12-05T11:19:31.000-08:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -63,6 +63,56 @@ levels <merging.merge_on_columns_and_levels>` documentation section.
    left.merge(right, on=['key1', 'key2'])
 
 
+.. _whatsnew_0220.enhancements.read_csv:
+
+``read_csv`` use `python-requests` (if installed) to support basic auth and much more
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the `python-requests` library is installed and ``http_params`` is given, it is used
+to fetch http(s) URLs; otherwise pandas continues to use ``urllib``. :func:`read_csv`,
+:func:`read_html`, :func:`read_json` and :func:`read_excel` now accept an optional
+``http_params`` argument for basic auth, disabling strict SSL checks, or a ``requests.Session()`` object.
+
+.. code-block:: python
+
+  import pandas as pd
+
+  # http_params is optional parameter. If it is non-empty, it attempts to use python-requests library
+  df = pd.read_csv('https://uname:pwd@aa.com/bb.csv', http_params= {'auth': None} ) # now url can contain username and pwd
+  # Note - all basic auth scenarios require python-requests library
+
+  # Basic Auth
+  df = pd.read_csv('https://aa.com/bb.csv', http_params={ 'auth': ('john', 'pwd') } ) # credentials are passed via http_params
+  
+  # Basic Auth And disable verification of SSL certificate eg: testing
+  up = { 'auth': ('john', 'pwd') , 'verify' : False}
+  df = pd.read_csv('https://aa.com/bb.csv', http_params=up ) # credentials and SSL options come from http_params
+
+  # Optionally, A requests.Session() can also be passed into http_params
+  import requests
+  s = requests.Session()
+  s.auth = MyAuthProvider('secret-key') # custom auth provider supported by requests
+  df = pd.read_csv(url, http_params=s)
+
+  # For advanced users, this may provide extensibility. However, testing on pandas side is limited to basic scenarios
+  # here is an example of advanced scenario
+  s = requests.Session()
+  s.auth = ('darth', 'l0rd')  # basic auth; skip if the url itself contains username and pwd
+  s.timeout = (3.05, 27)                           # if user wants to modify timeout
+  s.verify = False                                      # if user wants to disable ssl cert verification
+  s.headers.update( {'User-Agent': 'Custom user agent'} )  # extensible to set any custom header needed
+  s.proxies = { 'http': 'http://a.com:100'}  # if user has proxies 
+  s.cert = '/path/client.cert'                     # if custom cert is needed
+  df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
+
+  def print_http_status(r, *args, **kwargs):
+      print(r.status_code)
+      print(r.headers['Content-Length'])
+  s = requests.Session()
+  s.hooks = dict(response=print_http_status)
+  df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
+
+
 .. _whatsnew_0220.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -28,6 +28,13 @@
 ])
 
 
+try:
+    import requests
+    _REQUESTS_INSTALLED = True
+except ImportError:
+    _REQUESTS_INSTALLED = False
+
+
 if compat.PY3:
     from urllib.request import urlopen, pathname2url
     _urlopen = urlopen
@@ -168,8 +175,87 @@ def _stringify_path(filepath_or_buffer):
     return filepath_or_buffer
 
 
+def _is_handled_by_requests(o):
+    return _is_url(o) and parse_url(o).scheme in ['http', 'https']
+
+
+def gen_session(http_params):
+    """
+    Generate python-requests session from http_params dict
+    """
+    s = None
+    if http_params and type(http_params) is requests.sessions.Session:
+        s = http_params
+    else:
+        s = requests.Session()
+        s.stream = True
+        # Setting accept-encoding to None for backwards compatibility with
+        # urlopen. ideally we want to allow gzip download
+        # urlopen doesnt decompress automatically, requests does.
+        s.headers.update({'Accept-Encoding': None})
+    if http_params and type(http_params) is dict:
+        if http_params.get('auth', None) and not s.auth:
+            s.auth = http_params.get('auth')
+        if http_params.get('verify', True) is False and s.verify is not False:
+            s.verify = http_params.get('verify')
+    return s
+
+
+def fetch_url(url, http_params=None, skip_requests=False):
+    """
+    Fetch a url. python-requests is used for http(s) urls when http_params
+    is given and the library is installed; otherwise urllib is used.
+    Note: a session built here disables auto gunzip, to match urlopen
+
+    Parameters
+    ----------
+    url : str
+        Could be:
+            'http://cnn.com'
+            'file:///home/sky/aaa.csv'
+
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL cert for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
+    skip_requests : boolean, default False
+        For testing - disables the `requests` library. Internal use only.
+
+        .. versionadded:: 0.22.0
+    Raises
+    ------
+    ValueError if http_params specified without installed python-requests pkg
+    """
+    if not http_params:
+        skip_requests = True
+    if (not skip_requests) and \
+            _REQUESTS_INSTALLED and \
+            _is_handled_by_requests(url):
+        s = gen_session(http_params)
+        resp = s.get(url)
+        resp.raise_for_status()
+        content_bytes = resp.content
+    else:
+        if http_params and (skip_requests or not _REQUESTS_INSTALLED):
+            msg = 'To utilize http_params, python-requests library is ' + \
+                  'required but not detected'
+            raise ValueError(msg)
+        resp = _urlopen(url)
+        content_bytes = resp.read()
+    return resp, content_bytes
+
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None):
+                           compression=None, http_params=None,
+                           skip_requests=False):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -180,19 +266,45 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                          or buffer
     encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
 
+    compression : str, default None
+        indicate the compression such as 'gzip'.
+
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL cert for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
+    skip_requests : boolean, default False
+        For testing - disables the `requests` library. Internal use only.
+
+        .. versionadded:: 0.22.0
+
     Returns
     -------
     a filepath_or_buffer, the encoding, the compression
+
+    Raises
+    ------
+    ValueError if http_params specified without installed python-requests pkg
     """
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
 
     if _is_url(filepath_or_buffer):
-        req = _urlopen(filepath_or_buffer)
+        req, content_bytes = fetch_url(filepath_or_buffer,
+                                       http_params,
+                                       skip_requests)
+        reader = BytesIO(content_bytes)
         content_encoding = req.headers.get('Content-Encoding', None)
         if content_encoding == 'gzip':
             # Override compression based on Content-Encoding header
             compression = 'gzip'
-        reader = BytesIO(req.read())
         return reader, encoding, compression
 
     if _is_s3_url(filepath_or_buffer):
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -15,10 +15,11 @@
     is_integer, is_float,
     is_bool, is_list_like)
 
+from pandas.compat import BytesIO
 from pandas.core.frame import DataFrame
 from pandas.io.parsers import TextParser
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
+from pandas.io.common import (_is_url, fetch_url, _validate_header_arg,
                               get_filepath_or_buffer, _NA_VALUES,
                               _stringify_path)
 from pandas.core.indexes.period import Period
@@ -263,7 +264,9 @@ def __init__(self, io, **kwds):
         # If io is a url, want to keep the data as bytes so can't pass
         # to get_filepath_or_buffer()
         if _is_url(self._io):
-            io = _urlopen(self._io)
+            rs = kwds.get('http_params', None)
+            req, content = fetch_url(self._io, http_params=rs)
+            io = BytesIO(content)
         elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
             io, _, _ = get_filepath_or_buffer(self._io)
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -14,7 +14,7 @@
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, urlopen,
+from pandas.io.common import (_is_url, fetch_url,
                               parse_url, _validate_header_arg)
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems,
@@ -116,20 +116,31 @@ def _get_skiprows(skiprows):
                     type(skiprows).__name__)
 
 
-def _read(obj):
+def _read(obj, http_params=None):
     """Try to read from a url, file or string.
 
     Parameters
     ----------
     obj : str, unicode, or file-like
 
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL certs for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
     Returns
     -------
     raw_text : str
     """
     if _is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
+        req, text = fetch_url(obj, http_params)
     elif hasattr(obj, 'read'):
         text = obj.read()
     elif isinstance(obj, char_types):
@@ -172,6 +183,24 @@ class _HtmlFrameParser(object):
         A dictionary of valid table attributes to use to search for table
         elements.
 
+    encoding : str or None, optional
+        The encoding used to decode the web page. Defaults to ``None``.``None``
+        preserves the previous encoding behavior, which depends on the
+        underlying parser library (e.g., the parser library will try to use
+        the encoding provided by the document).
+
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, default True
+                 If False, allow self signed and invalid SSL cert for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
     Notes
     -----
     To subclass this class effectively you must override the following methods:
@@ -187,11 +216,12 @@ class _HtmlFrameParser(object):
     functionality.
     """
 
-    def __init__(self, io, match, attrs, encoding):
+    def __init__(self, io, match, attrs, encoding, http_params=None):
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
+        self.http_params = http_params
 
     def parse_tables(self):
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -444,7 +474,7 @@ def _parse_tables(self, doc, match, attrs):
         return result
 
     def _setup_build_doc(self):
-        raw_text = _read(self.io)
+        raw_text = _read(self.io, self.http_params)
         if not raw_text:
             raise ValueError('No text parsed from document: {doc}'
                              .format(doc=self.io))
@@ -737,7 +767,8 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding)
+        p = parser(io, compiled_match, attrs, encoding,
+                   http_params=kwargs.get('http_params', None))
 
         try:
             tables = p.parse_tables()
@@ -773,7 +804,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
               tupleize_cols=None, thousands=',', encoding=None,
               decimal='.', converters=None, na_values=None,
-              keep_default_na=True):
+              keep_default_na=True, http_params=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -877,6 +908,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
         .. versionadded:: 0.19.0
 
+    http_params : dict or requests.Session(), default None
+        A dict with 'auth' and/or 'verify' keys, or a requests.Session object,
+        used for http(s) paths to enable basic auth and more
+
+        .. versionadded:: 0.22.0
+
     Returns
     -------
     dfs : list of DataFrames
@@ -924,4 +961,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
                   parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                   thousands=thousands, attrs=attrs, encoding=encoding,
                   decimal=decimal, converters=converters, na_values=na_values,
-                  keep_default_na=keep_default_na)
+                  keep_default_na=keep_default_na, http_params=http_params)
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -182,7 +182,8 @@ def write(self):
 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
               numpy=False, precise_float=False, date_unit=None, encoding=None,
-              lines=False, chunksize=None, compression='infer'):
+              lines=False, chunksize=None, compression='infer',
+              http_params=None):
     """
     Convert a JSON string to pandas object
 
@@ -290,6 +291,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
 
         .. versionadded:: 0.21.0
 
+    http_params : dict or requests.Session(), default None
+        A python dict containing:
+            'auth': tuple (str, str) eg (username, password)
+            'auth': Any other auth object accepted by requests
+            'verify': boolean, Default True
+                 If False, allow self signed and invalid SSL certs for https
+        or
+        A python requests.Session object if http(s) path to enable basic auth
+        and many other scenarios that requests allows
+
+        .. versionadded:: 0.22.0
+
     Returns
     -------
     result : Series or DataFrame, depending on the value of `typ`.
@@ -350,6 +363,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     compression = _infer_compression(path_or_buf, compression)
     filepath_or_buffer, _, compression = get_filepath_or_buffer(
         path_or_buf, encoding=encoding, compression=compression,
+        http_params=http_params
     )
 
     json_reader = JsonReader(
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py