BUG, ENH: Read Data From Password-Protected URL's and allow self signed SSL certs #16910


Closed. Wants to merge 14 commits; showing changes from 8 commits.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.21.0.txt
@@ -40,7 +40,8 @@ Other Enhancements
- :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`)
- :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`)
- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)

- :func:`read_csv`, :func:`read_excel`, :func:`read_html`, and :func:`read_json` now accept credentials embedded in the URL (``//<user>:<password>@<host>:<port>/<url-path>``) or an ``auth`` tuple of ``(username, password)``
- :func:`read_csv`, :func:`read_excel`, :func:`read_html`, and :func:`read_json` now accept ``verify_ssl=False`` to disable HTTPS/SSL certificate verification (e.g. for self-signed SSL certs in testing) (:issue:`16716`)
Member:
I'm personally still in favor of providing something more general, like what I had suggested before, and having a section explaining what you can do now.

@jreback : Thoughts?

Contributor:

Can you elaborate?

Member:
I was thinking that we should condense into one line:

- It is possible to read data (i.e. CSV, JSON, HTML) from
a URL that is password-protected (:issue:`16716`)

In addition, we should add a section about what you can do now with password-authenticated URL's.

.. _whatsnew_0210.api_breaking:
Member (@gfyoung, Jul 13, 2017):
Let's condense into one line:

- It is possible to read data (i.e. CSV, JSON, HTML) from
a URL that is password-protected (:issue:`16716`)

Note that I also put the issue number at the end of the line.


Backwards incompatible API changes
120 changes: 117 additions & 3 deletions pandas/io/common.py
@@ -4,6 +4,8 @@
import csv
import codecs
import mmap
import ssl
import base64
from contextlib import contextmanager, closing

from pandas.compat import StringIO, BytesIO, string_types, text_type
@@ -49,7 +51,7 @@


if compat.PY3:
from urllib.request import urlopen, pathname2url
from urllib.request import urlopen, pathname2url, Request
_urlopen = urlopen
from urllib.parse import urlparse as parse_url
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,6 +60,7 @@
from http.client import HTTPException # noqa
else:
from urllib2 import urlopen as _urlopen
from urllib2 import Request
from urllib import urlencode, pathname2url # noqa
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params, urljoin
@@ -177,7 +180,8 @@ def _stringify_path(filepath_or_buffer):


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
compression=None):
compression=None, auth=None,
verify_ssl=None):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
@@ -186,16 +190,39 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
----------
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
or buffer
Contributor:

This is misaligned; it needs to be part of the sentence above.

now supports 'https://<user>:<password>@<host>:<port>/<url-path>'

.. versionadded:: 0.21.0

encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

compression : string, default None

.. versionadded:: 0.18.1

auth : tuple, default None
A tuple of string with (username, password) string for
HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life')

.. versionadded:: 0.21.0

verify_ssl : boolean, Default True
Contributor:

Why is this default True? Shouldn't the onus be on the user to pass this?

Member:
I think this is an attempt to mirror what requests does, which is to check SSL certificates by default. I for one second that defaulting.
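As an aside, the verify-by-default behavior being mirrored can be checked directly against the `ssl` standard library; this is only an illustrative stdlib sketch, not part of the patch:

```python
import ssl

# A default client context verifies certificates and checks hostnames.
ctx = ssl.create_default_context()
print(ctx.verify_mode == ssl.CERT_REQUIRED)  # verification on by default
print(ctx.check_hostname)                    # hostname checking on by default

# The private helper the patch uses to opt out, e.g. for self-signed
# certificates in testing. Being underscore-prefixed, it is not public API.
unverified = ssl._create_unverified_context()
print(unverified.verify_mode == ssl.CERT_NONE)  # verification disabled
```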

If False, allow self signed and invalid SSL certificates for https

.. versionadded:: 0.21.0


Member (@gfyoung, Jul 13, 2017):
  1. Why is the compression field empty?

  2. The formatting for auth and verify_ssl should be patched. The general format is the following:

<var_name> : <data_type>, <defaults>
     <description>

Author:
Actually, the compression parameter already existed but was undocumented. I simply added it to the docstring and left the description empty because I wasn't familiar enough with it to write good docs. I'll fix the rest.

Returns
-------
a filepath_or_buffer, the encoding, the compression
"""
filepath_or_buffer = _stringify_path(filepath_or_buffer)

if _is_url(filepath_or_buffer):
req = _urlopen(filepath_or_buffer)
ureq, kwargs = get_urlopen_args(filepath_or_buffer,
auth=auth,
verify_ssl=verify_ssl)
req = _urlopen(ureq, **kwargs)
content_encoding = req.headers.get('Content-Encoding', None)
if content_encoding == 'gzip':
# Override compression based on Content-Encoding header
@@ -244,6 +271,93 @@ def file_path_to_url(path):
}


def split_auth_from_url(url_with_uname):
"""
If a url contains username and password, it is extracted and returned
along with a url that does not contain it.

Parameters
----------
url_with_uname : string
a url that may or may not contain username and password
Contributor:

Too much indentation.

see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt
//<user>:<password>@<host>:<port>/<url-path>

.. versionadded:: 0.21.0

Returns
-------
(username, password), url_no_usrpwd : tuple, string Default ('', '') url
A tuple with (username, pwd) pair and
url without username or password (if it contained it )

Raises
------
ValueError for empty url
"""
Contributor:

Show what this raises.

Member (@gfyoung, Jul 13, 2017):
  1. See my comment here to patch the formatting for url_with_uname.

  2. The return format will need to be changed. The general format is this:

<var_name> : <data_type>
    <Description>

However, in this case, it would be preferable to describe the returned object without any naming, since this is a nested tuple object e.g.:

Returns
--------
A length-two tuple containing the following:
    - A length-two tuple of username and password.  These will be empty strings if none were extracted
    - The URL stripped of the username and password if provided in the URL.

if not url_with_uname:
msg = "Empty url: {_type}"
raise ValueError(msg.format(_type=type(url_with_uname)))
o = parse_url(url_with_uname)
uname = o.username if o.username else ''
pwd = o.password if o.password else ''
url_no_usrpwd = url_with_uname
if uname or pwd:
usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname)
url_no_usrpwd = url_with_uname.replace(usrch, o.hostname)
return (uname, pwd), url_no_usrpwd
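For readers following along, the credential extraction above can be sketched stand-alone with Python 3's `urllib.parse` (the name `split_auth` is illustrative, not part of the patch):

```python
from urllib.parse import urlparse

def split_auth(url):
    """Extract (username, password) from an RFC 1738 style URL and return
    the credentials together with the URL stripped of them."""
    o = urlparse(url)
    uname = o.username or ''
    pwd = o.password or ''
    stripped = url
    if uname or pwd:
        # Replace the 'user:password@host' portion with the bare hostname.
        usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname)
        stripped = url.replace(usrch, o.hostname)
    return (uname, pwd), stripped

creds, url = split_auth('https://alice:s3cret@example.com:8080/data.csv')
# creds == ('alice', 's3cret'); url == 'https://example.com:8080/data.csv'
```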


def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True):
"""
generate args to pass to urlopen - including basic auth and support
for disabling verification of SSL certificates (useful where
self-signed SSL certificates are an acceptable security risk, eg: testing)

Parameters
----------
url_with_uname : string
a url that may or may not contain username and password
see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt
//<user>:<password>@<host>:<port>/<url-path>

.. versionadded:: 0.21.0

auth : tuple, default None
A tuple of string with (username, password) string for
HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life')

.. versionadded:: 0.21.0

verify_ssl : boolean, Default True
If False, allow self signed and invalid SSL certificates for https
Contributor:

So there are 3 possibilities?

Member (@gfyoung, Jul 14, 2017):
There really should be two here. I agree that None is a bit much for this.

@skynss : Could you just only check for True and False here?
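The three-way behavior being questioned comes from the membership test the patch performs (`verify_ssl not in [None, True]`); a minimal sketch of that truth table (the helper name is illustrative):

```python
def wants_unverified_context(verify_ssl):
    # Mirrors the check in get_urlopen_args: None and True both mean
    # "verify certificates"; any other value (False, 0, a string, ...)
    # switches SSL verification off. Note the equality quirk: 1 == True,
    # so verify_ssl=1 also keeps verification on.
    return verify_ssl not in [None, True]

print(wants_unverified_context(None))   # False: default still verifies
print(wants_unverified_context(True))   # False: explicit opt-in to verify
print(wants_unverified_context(False))  # True: verification disabled
```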


.. versionadded:: 0.21.0

Member (@gfyoung, Jul 13, 2017):
See my comment here to patch the formatting for your parameters listed above.

Returns
-------
Request, kwargs to pass to urlopen. kwargs may be {} or {'context': obj }
Member (@gfyoung, Jul 13, 2017):
See my comment here to patch the formatting for your return variable listed above.

"""
uname = pwd = None
Contributor:

Can you have a username w/o a password (yes?), but of course a password w/o a username should be banned. Where do you raise on this?

url_no_usrpwd = url_with_uname
if auth and len(auth) == 2:
uname, pwd = auth
if not uname and not pwd:
(uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname)
req = Request(url_no_usrpwd)
if uname or pwd:
upstr = '{}:{}'.format(uname, pwd)
if compat.PY3:
b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8')
else:
b64str = base64.encodestring(upstr).replace('\n', '')
req.add_header("Authorization", "Basic {}".format(b64str))
kwargs = {}
if verify_ssl not in [None, True]:
kwargs['context'] = ssl._create_unverified_context()
return req, kwargs
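The Authorization header assembled above is standard HTTP Basic auth (RFC 7617); a self-contained Python 3 sketch of the same encoding (`basic_auth_request` is an illustrative name, not pandas API):

```python
import base64
from urllib.request import Request

def basic_auth_request(url, username, password):
    """Build a urllib Request carrying an HTTP Basic auth header,
    mirroring what get_urlopen_args does on Python 3."""
    req = Request(url)
    creds = '{}:{}'.format(username, password)
    token = base64.b64encode(creds.encode('ascii')).decode('utf-8')
    req.add_header('Authorization', 'Basic {}'.format(token))
    return req

req = basic_auth_request('https://example.com/data.csv', 'roberto', 'panda$4life')
# Base64-decoding the header value recovers the original 'user:password' pair.
```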


def _infer_compression(filepath_or_buffer, compression):
"""
Get the compression method for filepath_or_buffer. If compression='infer',
14 changes: 10 additions & 4 deletions pandas/io/excel.py
@@ -20,7 +20,7 @@
from pandas.errors import EmptyDataError
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
get_filepath_or_buffer, _NA_VALUES,
_stringify_path)
_stringify_path, get_urlopen_args)
from pandas.core.indexes.period import Period
import pandas._libs.json as json
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -200,7 +200,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
convert_float=True, converters=None, dtype=None,
true_values=None, false_values=None, engine=None,
squeeze=False, **kwds):

# Can't use _deprecate_kwarg since sheetname=None has a special meaning
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
warnings.warn("The `sheetname` keyword is deprecated, use "
@@ -211,7 +210,10 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
"Use just `sheet_name`")

if not isinstance(io, ExcelFile):
io = ExcelFile(io, engine=engine)
io = ExcelFile(io,
engine=engine,
auth=kwds.get('auth', None),
verify_ssl=kwds.get('verify_ssl', None))

return io._parse_excel(
sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
@@ -259,7 +261,11 @@ def __init__(self, io, **kwds):
# If io is a url, want to keep the data as bytes so can't pass
# to get_filepath_or_buffer()
if _is_url(self._io):
io = _urlopen(self._io)
verify_ssl = kwds.get('verify_ssl', None)
ureq, kwargs = get_urlopen_args(self._io,
auth=kwds.get('auth', None),
verify_ssl=verify_ssl)
io = _urlopen(ureq, **kwargs)
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
io, _, _ = get_filepath_or_buffer(self._io)

50 changes: 40 additions & 10 deletions pandas/io/html.py
@@ -15,7 +15,8 @@
from pandas.core.dtypes.common import is_list_like
from pandas.errors import EmptyDataError
from pandas.io.common import (_is_url, urlopen,
parse_url, _validate_header_arg)
parse_url, _validate_header_arg,
get_urlopen_args)
from pandas.io.parsers import TextParser
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
raise_with_traceback, binary_type)
@@ -116,19 +117,29 @@ def _get_skiprows(skiprows):
type(skiprows).__name__)


def _read(obj):
def _read(obj, auth=None, verify_ssl=None):
"""Try to read from a url, file or string.

Parameters
----------
obj : str, unicode, or file-like
auth : tuple, default None
A tuple of string with (username, password) string for
HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life')

.. versionadded:: 0.21.0

verify_ssl : boolean, Default True
If False, allow self signed and invalid SSL certificates for https

.. versionadded:: 0.21.0
Returns
-------
raw_text : str
"""
if _is_url(obj):
with urlopen(obj) as url:
ureq, kwargs = get_urlopen_args(obj, auth, verify_ssl)
with urlopen(ureq, **kwargs) as url:
text = url.read()
elif hasattr(obj, 'read'):
text = obj.read()
@@ -187,11 +198,14 @@ class _HtmlFrameParser(object):
functionality.
"""

def __init__(self, io, match, attrs, encoding):
def __init__(self, io, match, attrs, encoding, auth=None,
verify_ssl=None):
self.io = io
self.match = match
self.attrs = attrs
self.encoding = encoding
self.auth = auth
self.verify_ssl = verify_ssl

def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -444,7 +458,7 @@ def _parse_tables(self, doc, match, attrs):
return result

def _setup_build_doc(self):
raw_text = _read(self.io)
raw_text = _read(self.io, self.auth, self.verify_ssl)
if not raw_text:
raise ValueError('No text parsed from document: %s' % self.io)
return raw_text
@@ -731,8 +745,11 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
retained = None
for flav in flavor:
parser = _parser_dispatch(flav)
p = parser(io, compiled_match, attrs, encoding)

p = parser(io, compiled_match,
attrs,
encoding,
auth=kwargs.get('auth', None),
verify_ssl=kwargs.get('verify_ssl', None))
try:
tables = p.parse_tables()
except Exception as caught:
@@ -755,7 +772,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
skiprows=None, attrs=None, parse_dates=False,
tupleize_cols=False, thousands=',', encoding=None,
decimal='.', converters=None, na_values=None,
keep_default_na=True):
keep_default_na=True, auth=None,
verify_ssl=None):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

Parameters
@@ -856,7 +874,18 @@

.. versionadded:: 0.19.0

Returns
auth : tuple, default None
A tuple of string with (username, password) string for
HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life')

.. versionadded:: 0.21.0

verify_ssl : boolean, Default True
If False, allow self signed and invalid SSL certificates for https

.. versionadded:: 0.21.0

Returns
-------
dfs : list of DataFrames

@@ -903,4 +932,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
thousands=thousands, attrs=attrs, encoding=encoding,
decimal=decimal, converters=converters, na_values=na_values,
keep_default_na=keep_default_na)
keep_default_na=keep_default_na, auth=auth,
verify_ssl=verify_ssl)
17 changes: 15 additions & 2 deletions pandas/io/json/json.py
@@ -174,7 +174,7 @@ def write(self):
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
convert_axes=True, convert_dates=True, keep_default_dates=True,
numpy=False, precise_float=False, date_unit=None, encoding=None,
lines=False):
lines=False, auth=None, verify_ssl=None):
"""
Convert a JSON string to pandas object

@@ -263,6 +263,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,

.. versionadded:: 0.19.0

auth : tuple, default None
A tuple of string with (username, password) string for
HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life')

.. versionadded:: 0.21.0

verify_ssl : boolean, Default True
If False, allow self signed and invalid SSL certificates for https

.. versionadded:: 0.21.0

Returns
-------
result : Series or DataFrame, depending on the value of `typ`.
@@ -321,7 +332,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
"""

filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
encoding=encoding)
encoding=encoding,
auth=auth,
verify_ssl=verify_ssl)
if isinstance(filepath_or_buffer, compat.string_types):
try:
exists = os.path.exists(filepath_or_buffer)