From 8b6e426044251a673ab277ebb88e3b7c0cea9557 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Wed, 12 Jul 2017 19:43:59 -0700 Subject: [PATCH 01/14] Added http(s) basic auth and allow self signed ssl certs --- pandas/io/common.py | 71 ++++++++++++++++++++++++++++++++-- pandas/io/excel.py | 16 ++++++-- pandas/io/html.py | 46 ++++++++++++++++------ pandas/io/json/json.py | 12 +++++- pandas/io/parsers.py | 22 +++++++++-- pandas/tests/io/test_common.py | 11 ++++++ 6 files changed, 155 insertions(+), 23 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index cbfc33dbebb81..247a01026b16f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,6 +4,8 @@ import csv import codecs import mmap +import ssl +import base64 from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type @@ -49,7 +51,11 @@ if compat.PY3: - from urllib.request import urlopen, pathname2url + from urllib.request import (urlopen, pathname2url, build_opener, + install_opener, + HTTPPasswordMgrWithDefaultRealm, + HTTPBasicAuthHandler, + HTTPSHandler) _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -58,6 +64,7 @@ from http.client import HTTPException # noqa else: from urllib2 import urlopen as _urlopen + from urllib2 import Request from urllib import urlencode, pathname2url # noqa from urlparse import urlparse as parse_url from urlparse import uses_relative, uses_netloc, uses_params, urljoin @@ -177,7 +184,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None): + compression=None, username=None, + password=None, verify_ssl=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. 
@@ -186,7 +194,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer + support 'https://username:password@fqdn.com:port/aaa.csv' encoding : the encoding to use to decode py3 bytes, default is 'utf-8' + compression: + username: Authentication username (for https basic auth) + password: Authentication password (for https basic auth) + verify_ssl: Default True. If False, allow self signed and invalid SSL + certificates for https Returns ------- @@ -195,7 +209,11 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): - req = _urlopen(filepath_or_buffer) + ureq, kwargs = get_urlopen_args(filepath_or_buffer, + uname=username, + pwd=password, + verify_ssl=verify_ssl) + req = _urlopen(ureq, **kwargs) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': # Override compression based on Content-Encoding header @@ -244,6 +262,53 @@ def file_path_to_url(path): } +def split_uname_from_url(url_with_uname): + o = parse_url(url_with_uname) + usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) + url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) + return o.username, o.password, url_no_usrpwd + + +def get_urlopen_args(url_with_uname, uname=None, pwd=None, verify_ssl=True): + if not uname and not pwd: + uname, pwd, url_no_usrpwd = split_uname_from_url(url_with_uname) + else: + url_no_usrpwd = url_with_uname + if compat.PY3: + fn = get_urlopen_args_py3 + else: + fn = get_urlopen_args_py2 + req, kwargs = fn(uname, pwd, url_no_usrpwd, verify_ssl=verify_ssl) + return req, kwargs + + +def get_urlopen_args_py2(uname, pwd, url_no_usrpwd, verify_ssl=True): + req = Request(url_no_usrpwd) + upstr = '{}:{}'.format(uname, pwd) + base64string = base64.encodestring(upstr).replace('\n', '') + req.add_header("Authorization", "Basic 
{}".format(base64string)) + # I hope pandas can support self signed certs too + kwargs = {} + if verify_ssl not in [None, True]: + kwargs['context'] = ssl._create_unverified_context() + return req, kwargs + + +def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True): + passman = HTTPPasswordMgrWithDefaultRealm() + passman.add_password(None, url_no_usrpwd, uname, pwd) + authhandler = HTTPBasicAuthHandler(passman) + if verify_ssl in [None, True]: + opener = build_opener(authhandler) + else: + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + opener = build_opener(authhandler, HTTPSHandler(context=context)) + install_opener(opener) + return url_no_usrpwd, {} + + def _infer_compression(filepath_or_buffer, compression): """ Get the compression method for filepath_or_buffer. If compression='infer', diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5db4603c37be0..3a3e994dce335 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -20,7 +20,7 @@ from pandas.errors import EmptyDataError from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, get_filepath_or_buffer, _NA_VALUES, - _stringify_path) + _stringify_path, get_urlopen_args) from pandas.core.indexes.period import Period import pandas._libs.json as json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -200,7 +200,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, convert_float=True, converters=None, dtype=None, true_values=None, false_values=None, engine=None, squeeze=False, **kwds): - # Can't use _deprecate_kwarg since sheetname=None has a special meaning if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: warnings.warn("The `sheetname` keyword is deprecated, use " @@ -211,7 +210,11 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, "Use just `sheet_name`") if not isinstance(io, ExcelFile): - io = ExcelFile(io, 
engine=engine) + io = ExcelFile(io, + engine=engine, + username=kwds.get('username', None), + password=kwds.get('password', None), + verify_ssl=kwds.get('verify_ssl', None)) return io._parse_excel( sheetname=sheet_name, header=header, skiprows=skiprows, names=names, @@ -259,7 +262,12 @@ def __init__(self, io, **kwds): # If io is a url, want to keep the data as bytes so can't pass # to get_filepath_or_buffer() if _is_url(self._io): - io = _urlopen(self._io) + verify_ssl = kwds.get('verify_ssl', None) + ureq, kwargs = get_urlopen_args(self._io, + uname=kwds.get('username', None), + pwd=kwds.get('password', None), + verify_ssl=verify_ssl) + io = _urlopen(ureq, **kwargs) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): io, _, _ = get_filepath_or_buffer(self._io) diff --git a/pandas/io/html.py b/pandas/io/html.py index 2613f26ae5f52..3f4e2f2773983 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -15,7 +15,8 @@ from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError from pandas.io.common import (_is_url, urlopen, - parse_url, _validate_header_arg) + parse_url, _validate_header_arg, + get_urlopen_args) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) @@ -116,19 +117,22 @@ def _get_skiprows(skiprows): type(skiprows).__name__) -def _read(obj): +def _read(obj, username=None, password=None, verify_ssl=None): """Try to read from a url, file or string. Parameters ---------- obj : str, unicode, or file-like - + username: username for http basic auth + password: password for http basic auth + verify_ssl: Default True. 
Set to False to disable cert verification Returns ------- raw_text : str """ if _is_url(obj): - with urlopen(obj) as url: + ureq, kwargs = get_urlopen_args(obj, username, password, verify_ssl) + with urlopen(ureq, **kwargs) as url: text = url.read() elif hasattr(obj, 'read'): text = obj.read() @@ -187,11 +191,15 @@ class _HtmlFrameParser(object): functionality. """ - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, username=None, + password=None, verify_ssl=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.username = username + self.password = password + self.verify_ssl = verify_ssl def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -444,7 +452,8 @@ def _parse_tables(self, doc, match, attrs): return result def _setup_build_doc(self): - raw_text = _read(self.io) + raw_text = _read(self.io, self.username, + self.password, self.verify_ssl) if not raw_text: raise ValueError('No text parsed from document: %s' % self.io) return raw_text @@ -731,8 +740,12 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding) - + p = parser(io, compiled_match, + attrs, + encoding, + username=kwargs.get('username', None), + password=kwargs.get('password', None), + verify_ssl=kwargs.get('verify_ssl', None)) try: tables = p.parse_tables() except Exception as caught: @@ -755,7 +768,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True): + keep_default_na=True, username=None, password=None, + verify_ssl=False): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters @@ -856,7 +870,16 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 - Returns + username : str, default None + username for HTTP(s) basic auth + + password : str, default None + password for HTTP(s) basic auth + + verify_ssl : bool, default True + If False, ssl certificate is not verified (allow self signed SSL certs) + + Returns ------- dfs : list of DataFrames @@ -903,4 +926,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na) + keep_default_na=keep_default_na, username=username, + password=password, verify_ssl=verify_ssl) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 31907ad586817..823b24ed2bf4a 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -174,7 +174,7 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False): + lines=False, username=None, password=None, verify_ssl=None): """ Convert a JSON string to pandas object @@ -263,6 +263,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 + username: str, default None. Authentication username for HTTP(s) basic auth + passowrd: str, default None. Authentication password for HTTP(s) basic auth + verify_ssl: boolean, default None (True). + If false, allow self siged SSL certificates + Returns ------- result : Series or DataFrame, depending on the value of `typ`. 
@@ -321,7 +326,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, """ filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, - encoding=encoding) + encoding=encoding, + username=username, + password=password, + verify_ssl=verify_ssl) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 343bc7a74fde8..03997a918dd03 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -391,9 +391,13 @@ def _read(filepath_or_buffer, kwds): kwds['encoding'] = encoding compression = kwds.get('compression') + username = kwds.get('username', None) + password = kwds.get('password', None) + verify_ssl = kwds.get('verify_ssl', None) compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) + filepath_or_buffer, encoding, compression, username, password, + verify_ssl) kwds['compression'] = compression if kwds.get('date_parser', None) is not None: @@ -574,7 +578,14 @@ def parser_f(filepath_or_buffer, low_memory=_c_parser_defaults['low_memory'], buffer_lines=None, memory_map=False, - float_precision=None): + float_precision=None, + + # Basic auth (http/https) + username=None, + password=None, + + # skip verify self signed SSL certificates + verify_ssl=None): # Alias sep -> delimiter. 
if delimiter is None: @@ -654,7 +665,12 @@ def parser_f(filepath_or_buffer, mangle_dupe_cols=mangle_dupe_cols, tupleize_cols=tupleize_cols, infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines) + skip_blank_lines=skip_blank_lines, + + username=username, + password=password, + verify_ssl=verify_ssl + ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b527e3c5dc254..9964e41efa9bb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,6 +190,17 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) + def test_split_url_extract_uname_pwd(self): + for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', + 'aaa', + 'bbb', + 'https://ccc.com:1010/aaa.txt' + )]: + un, p, u = common.split_uname_from_url(url) + assert u == nurl + assert un == uname + assert p == pwd + class TestMMapWrapper(object): From 145c7f4b9f6ff2c1e677432f8134057a7c41d2d4 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Wed, 12 Jul 2017 21:42:13 -0700 Subject: [PATCH 02/14] Change username pwd to auth (username, password) --- pandas/io/common.py | 86 +++++++++++++++++----------------- pandas/io/excel.py | 6 +-- pandas/io/html.py | 32 +++++-------- pandas/io/json/json.py | 8 ++-- pandas/io/parsers.py | 13 ++--- pandas/tests/io/test_common.py | 11 ----- 6 files changed, 65 insertions(+), 91 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 247a01026b16f..72387993a740f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -184,8 +184,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, username=None, - password=None, verify_ssl=None): + compression=None, auth=None, + verify_ssl=None): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. 
@@ -194,11 +194,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer - support 'https://username:password@fqdn.com:port/aaa.csv' + supports 'https://username:password@fqdn.com:port/aaa.csv' encoding : the encoding to use to decode py3 bytes, default is 'utf-8' compression: - username: Authentication username (for https basic auth) - password: Authentication password (for https basic auth) + auth: (str,str), default None. (username, password) for HTTP(s) basic auth verify_ssl: Default True. If False, allow self signed and invalid SSL certificates for https @@ -210,8 +209,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if _is_url(filepath_or_buffer): ureq, kwargs = get_urlopen_args(filepath_or_buffer, - uname=username, - pwd=password, + auth=auth, verify_ssl=verify_ssl) req = _urlopen(ureq, **kwargs) content_encoding = req.headers.get('Content-Encoding', None) @@ -262,16 +260,45 @@ def file_path_to_url(path): } -def split_uname_from_url(url_with_uname): - o = parse_url(url_with_uname) - usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) - url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) - return o.username, o.password, url_no_usrpwd - - -def get_urlopen_args(url_with_uname, uname=None, pwd=None, verify_ssl=True): +def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): + def split_auth_from_url(url_with_uname): + o = parse_url(url_with_uname) + usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) + url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) + return (o.username, o.password), url_no_usrpwd + + def get_urlopen_args_py2(uname, pwd, url_no_usrpwd, verify_ssl=True): + req = Request(url_no_usrpwd) + upstr = '{}:{}'.format(uname, pwd) + base64string = base64.encodestring(upstr).replace('\n', '') + req.add_header("Authorization", "Basic {}".format(base64string)) + # I hope pandas can support 
self signed certs too + kwargs = {} + if verify_ssl not in [None, True]: + kwargs['context'] = ssl._create_unverified_context() + return req, kwargs + + def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True): + # not using urllib.request Request for PY3 because + # this looks like better code from extensibility purpose + passman = HTTPPasswordMgrWithDefaultRealm() + passman.add_password(None, url_no_usrpwd, uname, pwd) + authhandler = HTTPBasicAuthHandler(passman) + if verify_ssl in [None, True]: + opener = build_opener(authhandler) + else: + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + opener = build_opener(authhandler, HTTPSHandler(context=context)) + install_opener(opener) + return url_no_usrpwd, {} + + uname = pwd = None + if auth and len(auth) == 2: + uname, pwd = auth if not uname and not pwd: - uname, pwd, url_no_usrpwd = split_uname_from_url(url_with_uname) + (uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname) else: url_no_usrpwd = url_with_uname if compat.PY3: @@ -282,33 +309,6 @@ def get_urlopen_args(url_with_uname, uname=None, pwd=None, verify_ssl=True): return req, kwargs -def get_urlopen_args_py2(uname, pwd, url_no_usrpwd, verify_ssl=True): - req = Request(url_no_usrpwd) - upstr = '{}:{}'.format(uname, pwd) - base64string = base64.encodestring(upstr).replace('\n', '') - req.add_header("Authorization", "Basic {}".format(base64string)) - # I hope pandas can support self signed certs too - kwargs = {} - if verify_ssl not in [None, True]: - kwargs['context'] = ssl._create_unverified_context() - return req, kwargs - - -def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True): - passman = HTTPPasswordMgrWithDefaultRealm() - passman.add_password(None, url_no_usrpwd, uname, pwd) - authhandler = HTTPBasicAuthHandler(passman) - if verify_ssl in [None, True]: - opener = build_opener(authhandler) - else: - context = ssl.create_default_context() - 
context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - opener = build_opener(authhandler, HTTPSHandler(context=context)) - install_opener(opener) - return url_no_usrpwd, {} - - def _infer_compression(filepath_or_buffer, compression): """ Get the compression method for filepath_or_buffer. If compression='infer', diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 3a3e994dce335..62a8cc670bd60 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -212,8 +212,7 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine, - username=kwds.get('username', None), - password=kwds.get('password', None), + auth=kwds.get('auth', None), verify_ssl=kwds.get('verify_ssl', None)) return io._parse_excel( @@ -264,8 +263,7 @@ def __init__(self, io, **kwds): if _is_url(self._io): verify_ssl = kwds.get('verify_ssl', None) ureq, kwargs = get_urlopen_args(self._io, - uname=kwds.get('username', None), - pwd=kwds.get('password', None), + auth=kwds.get('auth', None), verify_ssl=verify_ssl) io = _urlopen(ureq, **kwargs) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): diff --git a/pandas/io/html.py b/pandas/io/html.py index 3f4e2f2773983..f14050189de27 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -117,21 +117,20 @@ def _get_skiprows(skiprows): type(skiprows).__name__) -def _read(obj, username=None, password=None, verify_ssl=None): +def _read(obj, auth=None, verify_ssl=None): """Try to read from a url, file or string. Parameters ---------- obj : str, unicode, or file-like - username: username for http basic auth - password: password for http basic auth + auth: None or (username, password) for http basic auth verify_ssl: Default True. 
Set to False to disable cert verification Returns ------- raw_text : str """ if _is_url(obj): - ureq, kwargs = get_urlopen_args(obj, username, password, verify_ssl) + ureq, kwargs = get_urlopen_args(obj, auth, verify_ssl) with urlopen(ureq, **kwargs) as url: text = url.read() elif hasattr(obj, 'read'): @@ -191,14 +190,13 @@ class _HtmlFrameParser(object): functionality. """ - def __init__(self, io, match, attrs, encoding, username=None, - password=None, verify_ssl=None): + def __init__(self, io, match, attrs, encoding, auth=None, + verify_ssl=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding - self.username = username - self.password = password + self.auth = auth self.verify_ssl = verify_ssl def parse_tables(self): @@ -452,8 +450,7 @@ def _parse_tables(self, doc, match, attrs): return result def _setup_build_doc(self): - raw_text = _read(self.io, self.username, - self.password, self.verify_ssl) + raw_text = _read(self.io, self.auth, self.verify_ssl) if not raw_text: raise ValueError('No text parsed from document: %s' % self.io) return raw_text @@ -743,8 +740,7 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): p = parser(io, compiled_match, attrs, encoding, - username=kwargs.get('username', None), - password=kwargs.get('password', None), + auth=kwargs.get('auth', None), verify_ssl=kwargs.get('verify_ssl', None)) try: tables = p.parse_tables() @@ -768,7 +764,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True, username=None, password=None, + keep_default_na=True, auth=None, verify_ssl=False): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -870,11 +866,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. 
versionadded:: 0.19.0 - username : str, default None - username for HTTP(s) basic auth - - password : str, default None - password for HTTP(s) basic auth + auth: (str,str), default None. (username, password) for HTTP(s) basic auth verify_ssl : bool, default True If False, ssl certificate is not verified (allow self signed SSL certs) @@ -926,5 +918,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na, username=username, - password=password, verify_ssl=verify_ssl) + keep_default_na=keep_default_na, auth=auth, + verify_ssl=verify_ssl) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 823b24ed2bf4a..da627cf6a3b11 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -174,7 +174,7 @@ def write(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False, username=None, password=None, verify_ssl=None): + lines=False, auth=None, verify_ssl=None): """ Convert a JSON string to pandas object @@ -263,8 +263,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 - username: str, default None. Authentication username for HTTP(s) basic auth - passowrd: str, default None. Authentication password for HTTP(s) basic auth + auth: (str,str), default None. (username, password) for HTTP(s) basic auth verify_ssl: boolean, default None (True). 
If false, allow self siged SSL certificates @@ -327,8 +326,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, encoding=encoding, - username=username, - password=password, + auth=auth, verify_ssl=verify_ssl) if isinstance(filepath_or_buffer, compat.string_types): try: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 03997a918dd03..3b0572872eb30 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -391,12 +391,11 @@ def _read(filepath_or_buffer, kwds): kwds['encoding'] = encoding compression = kwds.get('compression') - username = kwds.get('username', None) - password = kwds.get('password', None) + auth = kwds.get('auth', None) verify_ssl = kwds.get('verify_ssl', None) compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression, username, password, + filepath_or_buffer, encoding, compression, auth, verify_ssl) kwds['compression'] = compression @@ -580,9 +579,8 @@ def parser_f(filepath_or_buffer, memory_map=False, float_precision=None, - # Basic auth (http/https) - username=None, - password=None, + # Basic auth (http/https) (username, password) + auth=None, # skip verify self signed SSL certificates verify_ssl=None): @@ -667,8 +665,7 @@ def parser_f(filepath_or_buffer, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, - username=username, - password=password, + auth=auth, verify_ssl=verify_ssl ) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 9964e41efa9bb..b527e3c5dc254 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,17 +190,6 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) - def test_split_url_extract_uname_pwd(self): - for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', - 'aaa', - 'bbb', - 
'https://ccc.com:1010/aaa.txt' - )]: - un, p, u = common.split_uname_from_url(url) - assert u == nurl - assert un == uname - assert p == pwd - class TestMMapWrapper(object): From 947331688227a338e673f7dc24d9e0b7eba8fc1a Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 02:01:41 -0700 Subject: [PATCH 03/14] simplify and unify py2 vs py3 --- pandas/io/common.py | 57 ++--- pandas/tests/test_common.py | 475 +++++++++++++++++++----------------- 2 files changed, 273 insertions(+), 259 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 72387993a740f..d4fdddbadfd94 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -51,11 +51,7 @@ if compat.PY3: - from urllib.request import (urlopen, pathname2url, build_opener, - install_opener, - HTTPPasswordMgrWithDefaultRealm, - HTTPBasicAuthHandler, - HTTPSHandler) + from urllib.request import urlopen, pathname2url, Request _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -260,40 +256,14 @@ def file_path_to_url(path): } -def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): - def split_auth_from_url(url_with_uname): - o = parse_url(url_with_uname) - usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) - url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) - return (o.username, o.password), url_no_usrpwd - - def get_urlopen_args_py2(uname, pwd, url_no_usrpwd, verify_ssl=True): - req = Request(url_no_usrpwd) - upstr = '{}:{}'.format(uname, pwd) - base64string = base64.encodestring(upstr).replace('\n', '') - req.add_header("Authorization", "Basic {}".format(base64string)) - # I hope pandas can support self signed certs too - kwargs = {} - if verify_ssl not in [None, True]: - kwargs['context'] = ssl._create_unverified_context() - return req, kwargs - - def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True): - # not using urllib.request Request for PY3 because - # this 
looks like better code from extensibility purpose - passman = HTTPPasswordMgrWithDefaultRealm() - passman.add_password(None, url_no_usrpwd, uname, pwd) - authhandler = HTTPBasicAuthHandler(passman) - if verify_ssl in [None, True]: - opener = build_opener(authhandler) - else: - context = ssl.create_default_context() - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - opener = build_opener(authhandler, HTTPSHandler(context=context)) - install_opener(opener) - return url_no_usrpwd, {} +def split_auth_from_url(url_with_uname): + o = parse_url(url_with_uname) + usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) + url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) + return (o.username, o.password), url_no_usrpwd + +def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): uname = pwd = None if auth and len(auth) == 2: uname, pwd = auth @@ -301,11 +271,16 @@ def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True): (uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname) else: url_no_usrpwd = url_with_uname + upstr = '{}:{}'.format(uname, pwd) if compat.PY3: - fn = get_urlopen_args_py3 + b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8') else: - fn = get_urlopen_args_py2 - req, kwargs = fn(uname, pwd, url_no_usrpwd, verify_ssl=verify_ssl) + b64str = base64.encodestring(upstr).replace('\n', '') + req = Request(url_no_usrpwd) + req.add_header("Authorization", "Basic {}".format(b64str)) + kwargs = {} + if verify_ssl not in [None, True]: + kwargs['context'] = ssl._create_unverified_context() return req, kwargs diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 57479be4d989f..9964e41efa9bb 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,223 +1,262 @@ -# -*- coding: utf-8 -*- - +""" + Tests for the pandas.io.common functionalities +""" +import mmap import pytest -import collections -from functools import partial - -import numpy as np 
+import os +from os.path import isabs -from pandas import Series, Timestamp -from pandas.compat import range, lmap -import pandas.core.common as com +import pandas as pd import pandas.util.testing as tm +from pandas.io import common +from pandas.compat import is_platform_windows, StringIO + +from pandas import read_csv, concat + +try: + from pathlib import Path +except ImportError: + pass + +try: + from py.path import local as LocalPath +except ImportError: + pass + + +class CustomFSPath(object): + """For testing fspath on unknown objects""" + def __init__(self, path): + self.path = path + + def __fspath__(self): + return self.path + + +HERE = os.path.dirname(__file__) + + +class TestCommonIOCapabilities(object): + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + def test_expand_user(self): + filename = '~/sometest' + expanded_name = common._expand_user(filename) + + assert expanded_name != filename + assert isabs(expanded_name) + assert os.path.expanduser(filename) == expanded_name + + def test_expand_user_normal_path(self): + filename = '/somefolder/sometest' + expanded_name = common._expand_user(filename) + + assert expanded_name == filename + assert os.path.expanduser(filename) == expanded_name + + def test_stringify_path_pathlib(self): + tm._skip_if_no_pathlib() + + rel_path = common._stringify_path(Path('.')) + assert rel_path == '.' 
+ redundant_path = common._stringify_path(Path('foo//bar')) + assert redundant_path == os.path.join('foo', 'bar') + + def test_stringify_path_localpath(self): + tm._skip_if_no_localpath() + + path = os.path.join('foo', 'bar') + abs_path = os.path.abspath(path) + lpath = LocalPath(path) + assert common._stringify_path(lpath) == abs_path + + def test_stringify_path_fspath(self): + p = CustomFSPath('foo/bar.csv') + result = common._stringify_path(p) + assert result == 'foo/bar.csv' + + def test_get_filepath_or_buffer_with_path(self): + filename = '~/sometest' + filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) + assert filepath_or_buffer != filename + assert isabs(filepath_or_buffer) + assert os.path.expanduser(filename) == filepath_or_buffer + + def test_get_filepath_or_buffer_with_buffer(self): + input_buffer = StringIO() + filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) + assert filepath_or_buffer == input_buffer + + def test_iterator(self): + reader = read_csv(StringIO(self.data1), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(self.data1)) + tm.assert_frame_equal(result, expected) + + # GH12153 + it = read_csv(StringIO(self.data1), chunksize=1) + first = next(it) + tm.assert_frame_equal(first, expected.iloc[[0]]) + tm.assert_frame_equal(concat(it), expected.iloc[1:]) + + @pytest.mark.parametrize('reader, module, path', [ + (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), + (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), + (pd.read_fwf, 'os', os.path.join(HERE, 'data', + 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', os.path.join(HERE, 'data', + 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', os.path.join(HERE, 
'sas', 'data', + 'test1.sas7bdat')), + (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', + 'tsframe_v012.json')), + (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', + 'frame.mp')), + (pd.read_pickle, 'os', os.path.join(HERE, 'data', + 'categorical_0_14_1.pickle')), + ]) + def test_read_fspath_all(self, reader, module, path): + pytest.importorskip(module) + + mypath = CustomFSPath(path) + result = reader(mypath) + expected = reader(path) + if path.endswith('.pickle'): + # categorical + tm.assert_categorical_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ + ('to_csv', {}, 'os'), + ('to_excel', {'engine': 'xlwt'}, 'xlwt'), + ('to_feather', {}, 'feather'), + ('to_html', {}, 'os'), + ('to_json', {}, 'os'), + ('to_latex', {}, 'os'), + ('to_msgpack', {}, 'os'), + ('to_pickle', {}, 'os'), + ('to_stata', {}, 'os'), + ]) + def test_write_fspath_all(self, writer_name, writer_kwargs, module): + p1 = tm.ensure_clean('string') + p2 = tm.ensure_clean('fspath') + df = pd.DataFrame({"A": [1, 2]}) + + with p1 as string, p2 as fspath: + pytest.importorskip(module) + mypath = CustomFSPath(fspath) + writer = getattr(df, writer_name) + + writer(string, **writer_kwargs) + with open(string, 'rb') as f: + expected = f.read() + + writer(mypath, **writer_kwargs) + with open(fspath, 'rb') as f: + result = f.read() + + assert result == expected + + def test_write_fspath_hdf5(self): + # Same test as write_fspath_all, except HDF5 files aren't + # necessarily byte-for-byte identical for a given dataframe, so we'll + # have to read and compare equality + pytest.importorskip('tables') + + df = pd.DataFrame({"A": [1, 2]}) + p1 = tm.ensure_clean('string') + p2 = tm.ensure_clean('fspath') + + with p1 as string, p2 as fspath: + mypath = CustomFSPath(fspath) + df.to_hdf(mypath, key='bar') + df.to_hdf(string, key='bar') + + result = pd.read_hdf(fspath, key='bar') + expected = 
pd.read_hdf(string, key='bar') + + tm.assert_frame_equal(result, expected) + + def test_split_url_extract_uname_pwd(self): + for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', + 'aaa', + 'bbb', + 'https://ccc.com:1010/aaa.txt' + )]: + un, p, u = common.split_uname_from_url(url) + assert u == nurl + assert un == uname + assert p == pwd + + +class TestMMapWrapper(object): + + def setup_method(self, method): + self.mmap_file = os.path.join(tm.get_data_path(), + 'test_mmap.csv') + + def test_constructor_bad_file(self): + non_file = StringIO('I am not a file') + non_file.fileno = lambda: -1 + + # the error raised is different on Windows + if is_platform_windows(): + msg = "The parameter is incorrect" + err = OSError + else: + msg = "[Errno 22]" + err = mmap.error + + tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) + + target = open(self.mmap_file, 'r') + target.close() + + msg = "I/O operation on closed file" + tm.assert_raises_regex( + ValueError, msg, common.MMapWrapper, target) + + def test_get_attr(self): + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + + attrs = dir(wrapper.mmap) + attrs = [attr for attr in attrs + if not attr.startswith('__')] + attrs.append('__next__') + + for attr in attrs: + assert hasattr(wrapper, attr) + + assert not hasattr(wrapper, 'foo') + + def test_next(self): + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + lines = target.readlines() + + for line in lines: + next_line = next(wrapper) + assert next_line.strip() == line.strip() + + pytest.raises(StopIteration, next, wrapper) -def test_mut_exclusive(): - msg = "mutually exclusive arguments: '[ab]' and '[ab]'" - with tm.assert_raises_regex(TypeError, msg): - com._mut_exclusive(a=1, b=2) - assert com._mut_exclusive(a=1, b=None) == 1 - assert com._mut_exclusive(major=None, major_axis=None) is None - assert com._mut_exclusive(a=None, b=2) == 2 - - -def test_get_callable_name(): - from 
functools import partial - getname = com._get_callable_name - - def fn(x): - return x - - lambda_ = lambda x: x - part1 = partial(fn) - part2 = partial(part1) - - class somecall(object): - - def __call__(self): - return x # noqa - - assert getname(fn) == 'fn' - assert getname(lambda_) - assert getname(part1) == 'fn' - assert getname(part2) == 'fn' - assert getname(somecall()) == 'somecall' - assert getname(1) is None - - -def test_any_none(): - assert (com._any_none(1, 2, 3, None)) - assert (not com._any_none(1, 2, 3, 4)) - - -def test_all_not_none(): - assert (com._all_not_none(1, 2, 3, 4)) - assert (not com._all_not_none(1, 2, 3, None)) - assert (not com._all_not_none(None, None, None, None)) - - -def test_iterpairs(): - data = [1, 2, 3, 4] - expected = [(1, 2), (2, 3), (3, 4)] - - result = list(com.iterpairs(data)) - - assert (result == expected) - - -def test_split_ranges(): - def _bin(x, width): - "return int(x) as a base2 string of given width" - return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1)) - - def test_locs(mask): - nfalse = sum(np.array(mask) == 0) - - remaining = 0 - for s, e in com.split_ranges(mask): - remaining += e - s - - assert 0 not in mask[s:e] - - # make sure the total items covered by the ranges are a complete cover - assert remaining + nfalse == len(mask) - - # exhaustively test all possible mask sequences of length 8 - ncols = 8 - for i in range(2 ** ncols): - cols = lmap(int, list(_bin(i, ncols))) # count up in base2 - mask = [cols[i] == 1 for i in range(len(cols))] - test_locs(mask) - - # base cases - test_locs([]) - test_locs([0]) - test_locs([1]) - - -def test_map_indices_py(): - data = [4, 3, 2, 1] - expected = {4: 0, 3: 1, 2: 2, 1: 3} - - result = com.map_indices_py(data) - - assert (result == expected) - - -def test_union(): - a = [1, 2, 3] - b = [4, 5, 6] - - union = sorted(com.union(a, b)) - - assert ((a + b) == union) - - -def test_difference(): - a = [1, 2, 3] - b = [1, 2, 3, 4, 5, 6] - - inter = 
sorted(com.difference(b, a)) - - assert ([4, 5, 6] == inter) - - -def test_intersection(): - a = [1, 2, 3] - b = [1, 2, 3, 4, 5, 6] - - inter = sorted(com.intersection(a, b)) - - assert (a == inter) - - -def test_groupby(): - values = ['foo', 'bar', 'baz', 'baz2', 'qux', 'foo3'] - expected = {'f': ['foo', 'foo3'], - 'b': ['bar', 'baz', 'baz2'], - 'q': ['qux']} - - grouped = com.groupby(values, lambda x: x[0]) - - for k, v in grouped: - assert v == expected[k] - - -def test_random_state(): - import numpy.random as npr - # Check with seed - state = com._random_state(5) - assert state.uniform() == npr.RandomState(5).uniform() - - # Check with random state object - state2 = npr.RandomState(10) - assert (com._random_state(state2).uniform() == - npr.RandomState(10).uniform()) - - # check with no arg random state - assert com._random_state() is np.random - - # Error for floats or strings - with pytest.raises(ValueError): - com._random_state('test') - - with pytest.raises(ValueError): - com._random_state(5.5) - - -def test_maybe_match_name(): - - matched = com._maybe_match_name( - Series([1], name='x'), Series( - [2], name='x')) - assert (matched == 'x') - - matched = com._maybe_match_name( - Series([1], name='x'), Series( - [2], name='y')) - assert (matched is None) - - matched = com._maybe_match_name(Series([1]), Series([2], name='x')) - assert (matched is None) - - matched = com._maybe_match_name(Series([1], name='x'), Series([2])) - assert (matched is None) - - matched = com._maybe_match_name(Series([1], name='x'), [2]) - assert (matched == 'x') - - matched = com._maybe_match_name([1], Series([2], name='y')) - assert (matched == 'y') - - -def test_dict_compat(): - data_datetime64 = {np.datetime64('1990-03-15'): 1, - np.datetime64('2015-03-15'): 2} - data_unchanged = {1: 2, 3: 4, 5: 6} - expected = {Timestamp('1990-3-15'): 1, Timestamp('2015-03-15'): 2} - assert (com._dict_compat(data_datetime64) == expected) - assert (com._dict_compat(expected) == expected) - assert 
(com._dict_compat(data_unchanged) == data_unchanged) - - -def test_standardize_mapping(): - # No uninitialized defaultdicts - with pytest.raises(TypeError): - com.standardize_mapping(collections.defaultdict) - - # No non-mapping subtypes, instance - with pytest.raises(TypeError): - com.standardize_mapping([]) - - # No non-mapping subtypes, class - with pytest.raises(TypeError): - com.standardize_mapping(list) - - fill = {'bad': 'data'} - assert (com.standardize_mapping(fill) == dict) - - # Convert instance to type - assert (com.standardize_mapping({}) == dict) - - dd = collections.defaultdict(list) - assert isinstance(com.standardize_mapping(dd), partial) + def test_unknown_engine(self): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + df.to_csv(path) + with tm.assert_raises_regex(ValueError, 'Unknown engine'): + read_csv(path, engine='pyt') From 9c7524d9a5f5c9c6d520cd2b5e12f9cd4a34e17a Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 02:29:25 -0700 Subject: [PATCH 04/14] added temporary test script for https/basic-auth/unsigned ssl cert testing --- pandas/tests/test_basic_auth_self_signed.py | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 pandas/tests/test_basic_auth_self_signed.py diff --git a/pandas/tests/test_basic_auth_self_signed.py b/pandas/tests/test_basic_auth_self_signed.py new file mode 100644 index 0000000000000..ba8e3db099def --- /dev/null +++ b/pandas/tests/test_basic_auth_self_signed.py @@ -0,0 +1,36 @@ +# DO NOT MERGE +# live working test that tests both scenarios: +# pd.read_csv('https://uname:pwd@fqdn:/fname.csv', verify_ssl=False) +# pd.read_csv('https://fqdn:/fname.csv', username='uname', password='pwd', verify_ssl=False) + +import pandas as pd + +uname='pandasusr' +pwd='pandaspwd' +url = 'https://{}pandastest.mooo.com:5000/' +verify_ssl=False + +def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): + furl = url + fname + kwargs = {} + if uname or pwd: + kwargs['auth']=(uname, pwd) + 
if verify_ssl is not None: + kwargs['verify_ssl']=verify_ssl + print('\n' +furl) + df = pd_read_fn(furl, **kwargs) + if type(df) is list: # html + df = df[0] + print(df.to_string(index=False)) + print(df.to_json()) + +fparams = [(pd.read_csv, 'aaa.csv'), + (pd.read_json, 'jdoc.json'), + (pd.read_excel, 'ex_doc.xlsx'), + (pd.read_html, 'html_file.html')] + +for pd_read_fn, fname in fparams: + u = url.format('{}:{}@'.format(uname, pwd)) + get_df( u, None, None, verify_ssl, pd_read_fn, fname) + u2 = url.format('') + get_df( u2, uname, pwd, verify_ssl, pd_read_fn, fname) From 598cf7bc53cda72d203c1dd78faeecbb63e7ed32 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 12:17:51 -0700 Subject: [PATCH 05/14] Updated comments. Removed test_basic_auth_self_signed --- pandas/io/common.py | 32 ++++++++++++++++++ pandas/tests/io/test_common.py | 19 +++++++++++ pandas/tests/test_basic_auth_self_signed.py | 36 --------------------- 3 files changed, 51 insertions(+), 36 deletions(-) delete mode 100644 pandas/tests/test_basic_auth_self_signed.py diff --git a/pandas/io/common.py b/pandas/io/common.py index d4fdddbadfd94..1d839bba71eeb 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -257,6 +257,21 @@ def file_path_to_url(path): def split_auth_from_url(url_with_uname): + """ + If a url contains username and password, it is extracted and returned + along with a url that does not contain it. 
+ + Parameters + ---------- + url_with_uname : a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //:@:/ + + Returns + ------- + (username, password), url_no_usrpwd : username or "", password or "", + url without username or password (if it contained it ) + """ o = parse_url(url_with_uname) usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) @@ -264,6 +279,23 @@ def split_auth_from_url(url_with_uname): def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): + """ + generate args to pass to urlopen - including basic auth and and support + for disabling verification of SSL certificates ( useful where + self-signed SSL certificates are acceptable security risk -eg: Testing ) + + Parameters + ---------- + url_with_uname : a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //:@:/ + auth : ( username/""/None, password/"", None) tuple + verify_ssl: If False, SSL certificate verification is disabled. + + Returns + ------- + Request, kwargs to pass to urlopen. kwargs may be {} or {'context': obj } + """ uname = pwd = None if auth and len(auth) == 2: uname, pwd = auth diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b527e3c5dc254..dd489cf37e015 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,6 +190,25 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) + def test_split_url_extract_uname_pwd(self): + """ + Test extraction of username, pwd from url, if contained. 
+ """ + for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', + 'aaa', + 'bbb', + 'https://ccc.com:1010/aaa.txt' + ), + ('https://ccc.com:1010/aaa.txt', + '', + '', + 'https://ccc.com:1010/aaa.txt' + )]: + un, pw, mod_url = common.split_uname_from_url(url) + assert mod_url == nurl + assert un == uname + assert pw == pwd + class TestMMapWrapper(object): diff --git a/pandas/tests/test_basic_auth_self_signed.py b/pandas/tests/test_basic_auth_self_signed.py deleted file mode 100644 index ba8e3db099def..0000000000000 --- a/pandas/tests/test_basic_auth_self_signed.py +++ /dev/null @@ -1,36 +0,0 @@ -# DO NOT MERGE -# live working test that tests both scenarios: -# pd.read_csv('https://uname:pwd@fqdn:/fname.csv', verify_ssl=False) -# pd.read_csv('https://fqdn:/fname.csv', username='uname', password='pwd', verify_ssl=False) - -import pandas as pd - -uname='pandasusr' -pwd='pandaspwd' -url = 'https://{}pandastest.mooo.com:5000/' -verify_ssl=False - -def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): - furl = url + fname - kwargs = {} - if uname or pwd: - kwargs['auth']=(uname, pwd) - if verify_ssl is not None: - kwargs['verify_ssl']=verify_ssl - print('\n' +furl) - df = pd_read_fn(furl, **kwargs) - if type(df) is list: # html - df = df[0] - print(df.to_string(index=False)) - print(df.to_json()) - -fparams = [(pd.read_csv, 'aaa.csv'), - (pd.read_json, 'jdoc.json'), - (pd.read_excel, 'ex_doc.xlsx'), - (pd.read_html, 'html_file.html')] - -for pd_read_fn, fname in fparams: - u = url.format('{}:{}@'.format(uname, pwd)) - get_df( u, None, None, verify_ssl, pd_read_fn, fname) - u2 = url.format('') - get_df( u2, uname, pwd, verify_ssl, pd_read_fn, fname) From 3b454dd538e64b97c8a9734681f4417cd4b96372 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 12:49:49 -0700 Subject: [PATCH 06/14] Added what's new --- doc/source/whatsnew/v0.21.0.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt 
b/doc/source/whatsnew/v0.21.0.txt index 039b24cc63217..eb4ecef232109 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -40,7 +40,8 @@ Other Enhancements - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) - :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) - +- :func:`read_csv` `read_html` `read_json` `read_html` now accept auth in url //:@:/, or ``auth`` tuple (username, password) parameter +- :func:`read_csv` `read_html` `read_json` `read_html` now accept ``verify_ssl`` False to disable https/ssl certificate verification (eg: self signed ssl certs in testing) .. _whatsnew_0210.api_breaking: Backwards incompatible API changes From d359b2d53ffa33b8643fa276c04641cc5e0e94be Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 17:22:31 -0700 Subject: [PATCH 07/14] Updates to comments, etc + some change in username password logic --- doc/source/whatsnew/v0.21.0.txt | 4 +- pandas/io/common.py | 86 ++++++++++++++++++++++++--------- pandas/io/html.py | 24 +++++++-- pandas/io/json/json.py | 13 +++-- pandas/io/parsers.py | 2 - pandas/tests/test_common.py | 29 ++++++----- 6 files changed, 113 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index eb4ecef232109..7baf87da18039 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -40,8 +40,8 @@ Other Enhancements - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. 
(:issue:`15972`) - :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) -- :func:`read_csv` `read_html` `read_json` `read_html` now accept auth in url //:@:/, or ``auth`` tuple (username, password) parameter -- :func:`read_csv` `read_html` `read_json` `read_html` now accept ``verify_ssl`` False to disable https/ssl certificate verification (eg: self signed ssl certs in testing) +- :func:`read_csv`, :func:`read_html`, :func:`read_json`, :func:`read_html` now accept auth in url //:@:/, or ``auth`` tuple (username, password) parameter +- :func:`read_csv`, :func:`read_html`, :func:`read_json`, :func:`read_html` now accept ``verify_ssl`` False to disable https/ssl certificate verification (eg: self signed ssl certs in testing) (:issue:`16716`) .. _whatsnew_0210.api_breaking: Backwards incompatible API changes diff --git a/pandas/io/common.py b/pandas/io/common.py index 1d839bba71eeb..f14fb43da2831 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -190,12 +190,27 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer - supports 'https://username:password@fqdn.com:port/aaa.csv' + now supports 'https://:@:/' + + .. versionadded:: 0.21.0 + encoding : the encoding to use to decode py3 bytes, default is 'utf-8' - compression: - auth: (str,str), default None. (username, password) for HTTP(s) basic auth - verify_ssl: Default True. If False, allow self signed and invalid SSL - certificates for https + + compression : string, default None + + .. versionadded:: 0.18.1 + + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. 
versionadded:: 0.21.0 + Returns ------- @@ -263,19 +278,34 @@ def split_auth_from_url(url_with_uname): Parameters ---------- - url_with_uname : a url that may or may not contain username and password + url_with_uname : string + a url that may or may not contain username and password see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt //:@:/ + + .. versionadded:: 0.21.0 Returns ------- - (username, password), url_no_usrpwd : username or "", password or "", - url without username or password (if it contained it ) + (username, password), url_no_usrpwd : tuple, string Default ('', '') url + A tuple with (username, pwd) pair and + url without username or password (if it contained it ) + + Raises + ------ + ValueError for empty url """ + if not url_with_uname: + msg = "Empty url: {_type}" + raise ValueError(msg.format(_type=type(url_with_uname))) o = parse_url(url_with_uname) - usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) - url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) - return (o.username, o.password), url_no_usrpwd + uname = o.username if o.username else '' + pwd = o.password if o.password else '' + url_no_usrpwd = url_with_uname + if uname or pwd: + usrch = '{}:{}@{}'.format(o.username, o.password, o.hostname) + url_no_usrpwd = url_with_uname.replace(usrch, o.hostname) + return (uname, pwd), url_no_usrpwd def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): @@ -286,30 +316,42 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): Parameters ---------- - url_with_uname : a url that may or may not contain username and password + url_with_uname : string + a url that may or may not contain username and password see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt //:@:/ - auth : ( username/""/None, password/"", None) tuple - verify_ssl: If False, SSL certificate verification is disabled. + + .. 
versionadded:: 0.21.0 + + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 Returns ------- Request, kwargs to pass to urlopen. kwargs may be {} or {'context': obj } """ uname = pwd = None + url_no_usrpwd = url_with_uname if auth and len(auth) == 2: uname, pwd = auth if not uname and not pwd: (uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname) - else: - url_no_usrpwd = url_with_uname - upstr = '{}:{}'.format(uname, pwd) - if compat.PY3: - b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8') - else: - b64str = base64.encodestring(upstr).replace('\n', '') req = Request(url_no_usrpwd) - req.add_header("Authorization", "Basic {}".format(b64str)) + if uname or pwd: + upstr = '{}:{}'.format(uname, pwd) + if compat.PY3: + b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8') + else: + b64str = base64.encodestring(upstr).replace('\n', '') + req.add_header("Authorization", "Basic {}".format(b64str)) kwargs = {} if verify_ssl not in [None, True]: kwargs['context'] = ssl._create_unverified_context() diff --git a/pandas/io/html.py b/pandas/io/html.py index f14050189de27..7e37d856779db 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -123,8 +123,16 @@ def _read(obj, auth=None, verify_ssl=None): Parameters ---------- obj : str, unicode, or file-like - auth: None or (username, password) for http basic auth - verify_ssl: Default True. Set to False to disable cert verification + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. 
versionadded:: 0.21.0 Returns ------- raw_text : str @@ -866,10 +874,16 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 - auth: (str,str), default None. (username, password) for HTTP(s) basic auth + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https - verify_ssl : bool, default True - If False, ssl certificate is not verified (allow self signed SSL certs) + .. versionadded:: 0.21.0 Returns ------- diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index da627cf6a3b11..3aa69dd43109a 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -263,9 +263,16 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 - auth: (str,str), default None. (username, password) for HTTP(s) basic auth - verify_ssl: boolean, default None (True). - If false, allow self siged SSL certificates + auth : tuple, default None + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + + .. versionadded:: 0.21.0 + + verify_ssl : boolean, Default True + If False, allow self signed and invalid SSL certificates for https + + .. versionadded:: 0.21.0 Returns ------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3b0572872eb30..5a85a2bcefa61 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -579,10 +579,8 @@ def parser_f(filepath_or_buffer, memory_map=False, float_precision=None, - # Basic auth (http/https) (username, password) auth=None, - # skip verify self signed SSL certificates verify_ssl=None): # Alias sep -> delimiter. 
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 9964e41efa9bb..dc1c28a7c837f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -189,17 +189,24 @@ def test_write_fspath_hdf5(self): expected = pd.read_hdf(string, key='bar') tm.assert_frame_equal(result, expected) - - def test_split_url_extract_uname_pwd(self): - for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', - 'aaa', - 'bbb', - 'https://ccc.com:1010/aaa.txt' - )]: - un, p, u = common.split_uname_from_url(url) - assert u == nurl - assert un == uname - assert p == pwd + + @pytest.mark.parametrize('url, uname, pwd, nurl', [ + ('https://a1:b1@cc.com:101/f.csv', + 'aaa', + 'bbb', + 'https://cc.com:101/f.csv' + ), + ('https://ccc.com:1010/aaa.txt', + '', + '', + 'https://ccc.com:1010/aaa.txt' + ), + ]) + def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): + un, pw, ur = common.split_uname_from_url(url) + assert ur == nurl + assert un == uname + assert pw == pwd class TestMMapWrapper(object): From eb03fd30384c94c6cfc37877c9c2b2570d7b81a5 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 17:55:25 -0700 Subject: [PATCH 08/14] fix to test case - had checked in wrong one --- pandas/tests/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index dc1c28a7c837f..31a12740eb9bf 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -192,8 +192,8 @@ def test_write_fspath_hdf5(self): @pytest.mark.parametrize('url, uname, pwd, nurl', [ ('https://a1:b1@cc.com:101/f.csv', - 'aaa', - 'bbb', + 'a1', + 'b1', 'https://cc.com:101/f.csv' ), ('https://ccc.com:1010/aaa.txt', @@ -203,7 +203,7 @@ def test_write_fspath_hdf5(self): ), ]) def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): - un, pw, ur = common.split_uname_from_url(url) + (un, pw), ur = common.split_auth_from_url(url) assert ur == nurl assert un == 
uname assert pw == pwd From cbe3f494fb4d5aec1e71b78cc62e89486322489a Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Thu, 13 Jul 2017 19:11:04 -0700 Subject: [PATCH 09/14] Fixing test_common.py which was mistakenly clobbered --- pandas/tests/io/test_common.py | 36 +-- pandas/tests/test_common.py | 484 +++++++++++++++------------------ 2 files changed, 237 insertions(+), 283 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index dd489cf37e015..135385f9d6d7d 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,24 +190,24 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) - def test_split_url_extract_uname_pwd(self): - """ - Test extraction of username, pwd from url, if contained. - """ - for url, uname, pwd, nurl in [('https://aaa:bbb@ccc.com:1010/aaa.txt', - 'aaa', - 'bbb', - 'https://ccc.com:1010/aaa.txt' - ), - ('https://ccc.com:1010/aaa.txt', - '', - '', - 'https://ccc.com:1010/aaa.txt' - )]: - un, pw, mod_url = common.split_uname_from_url(url) - assert mod_url == nurl - assert un == uname - assert pw == pwd + + @pytest.mark.parametrize('url, uname, pwd, nurl', [ + ('https://a1:b1@cc.com:101/f.csv', + 'a1', + 'b1', + 'https://cc.com:101/f.csv' + ), + ('https://ccc.com:1010/aaa.txt', + '', + '', + 'https://ccc.com:1010/aaa.txt' + ), + ]) + def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): + (un, pw), ur = common.split_auth_from_url(url) + assert ur == nurl + assert un == uname + assert pw == pwd class TestMMapWrapper(object): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 31a12740eb9bf..5357a4d81f174 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,269 +1,223 @@ -""" - Tests for the pandas.io.common functionalities -""" -import mmap +# -*- coding: utf-8 -*- + import pytest -import os -from os.path import isabs +import collections +from functools import partial + +import numpy as np 
-import pandas as pd +from pandas import Series, Timestamp +from pandas.compat import range, lmap +import pandas.core.common as com import pandas.util.testing as tm -from pandas.io import common -from pandas.compat import is_platform_windows, StringIO - -from pandas import read_csv, concat - -try: - from pathlib import Path -except ImportError: - pass - -try: - from py.path import local as LocalPath -except ImportError: - pass - - -class CustomFSPath(object): - """For testing fspath on unknown objects""" - def __init__(self, path): - self.path = path - - def __fspath__(self): - return self.path - - -HERE = os.path.dirname(__file__) - - -class TestCommonIOCapabilities(object): - data1 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - def test_expand_user(self): - filename = '~/sometest' - expanded_name = common._expand_user(filename) - - assert expanded_name != filename - assert isabs(expanded_name) - assert os.path.expanduser(filename) == expanded_name - - def test_expand_user_normal_path(self): - filename = '/somefolder/sometest' - expanded_name = common._expand_user(filename) - - assert expanded_name == filename - assert os.path.expanduser(filename) == expanded_name - - def test_stringify_path_pathlib(self): - tm._skip_if_no_pathlib() - - rel_path = common._stringify_path(Path('.')) - assert rel_path == '.' 
- redundant_path = common._stringify_path(Path('foo//bar')) - assert redundant_path == os.path.join('foo', 'bar') - - def test_stringify_path_localpath(self): - tm._skip_if_no_localpath() - - path = os.path.join('foo', 'bar') - abs_path = os.path.abspath(path) - lpath = LocalPath(path) - assert common._stringify_path(lpath) == abs_path - - def test_stringify_path_fspath(self): - p = CustomFSPath('foo/bar.csv') - result = common._stringify_path(p) - assert result == 'foo/bar.csv' - - def test_get_filepath_or_buffer_with_path(self): - filename = '~/sometest' - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) - assert filepath_or_buffer != filename - assert isabs(filepath_or_buffer) - assert os.path.expanduser(filename) == filepath_or_buffer - - def test_get_filepath_or_buffer_with_buffer(self): - input_buffer = StringIO() - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) - assert filepath_or_buffer == input_buffer - - def test_iterator(self): - reader = read_csv(StringIO(self.data1), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(self.data1)) - tm.assert_frame_equal(result, expected) - - # GH12153 - it = read_csv(StringIO(self.data1), chunksize=1) - first = next(it) - tm.assert_frame_equal(first, expected.iloc[[0]]) - tm.assert_frame_equal(concat(it), expected.iloc[1:]) - - @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - (pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 
'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), - ]) - def test_read_fspath_all(self, reader, module, path): - pytest.importorskip(module) - - mypath = CustomFSPath(path) - result = reader(mypath) - expected = reader(path) - if path.endswith('.pickle'): - # categorical - tm.assert_categorical_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ - ('to_csv', {}, 'os'), - ('to_excel', {'engine': 'xlwt'}, 'xlwt'), - ('to_feather', {}, 'feather'), - ('to_html', {}, 'os'), - ('to_json', {}, 'os'), - ('to_latex', {}, 'os'), - ('to_msgpack', {}, 'os'), - ('to_pickle', {}, 'os'), - ('to_stata', {}, 'os'), - ]) - def test_write_fspath_all(self, writer_name, writer_kwargs, module): - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') - df = pd.DataFrame({"A": [1, 2]}) - - with p1 as string, p2 as fspath: - pytest.importorskip(module) - mypath = CustomFSPath(fspath) - writer = getattr(df, writer_name) - - writer(string, **writer_kwargs) - with open(string, 'rb') as f: - expected = f.read() - - writer(mypath, **writer_kwargs) - with open(fspath, 'rb') as f: - result = f.read() - - assert result == expected - - def test_write_fspath_hdf5(self): - # Same test as write_fspath_all, except HDF5 files aren't - # necessarily byte-for-byte identical for a given dataframe, so we'll - # have to read and compare equality - pytest.importorskip('tables') - - df = pd.DataFrame({"A": [1, 2]}) - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') - - with p1 as string, p2 as fspath: - mypath = CustomFSPath(fspath) - df.to_hdf(mypath, key='bar') - df.to_hdf(string, key='bar') - - result = pd.read_hdf(fspath, key='bar') - expected = 
pd.read_hdf(string, key='bar') - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize('url, uname, pwd, nurl', [ - ('https://a1:b1@cc.com:101/f.csv', - 'a1', - 'b1', - 'https://cc.com:101/f.csv' - ), - ('https://ccc.com:1010/aaa.txt', - '', - '', - 'https://ccc.com:1010/aaa.txt' - ), - ]) - def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): - (un, pw), ur = common.split_auth_from_url(url) - assert ur == nurl - assert un == uname - assert pw == pwd - - -class TestMMapWrapper(object): - - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') - - def test_constructor_bad_file(self): - non_file = StringIO('I am not a file') - non_file.fileno = lambda: -1 - - # the error raised is different on Windows - if is_platform_windows(): - msg = "The parameter is incorrect" - err = OSError - else: - msg = "[Errno 22]" - err = mmap.error - - tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - - target = open(self.mmap_file, 'r') - target.close() - - msg = "I/O operation on closed file" - tm.assert_raises_regex( - ValueError, msg, common.MMapWrapper, target) - - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: - wrapper = common.MMapWrapper(target) - - attrs = dir(wrapper.mmap) - attrs = [attr for attr in attrs - if not attr.startswith('__')] - attrs.append('__next__') - - for attr in attrs: - assert hasattr(wrapper, attr) - - assert not hasattr(wrapper, 'foo') - - def test_next(self): - with open(self.mmap_file, 'r') as target: - wrapper = common.MMapWrapper(target) - lines = target.readlines() - - for line in lines: - next_line = next(wrapper) - assert next_line.strip() == line.strip() - - pytest.raises(StopIteration, next, wrapper) - - def test_unknown_engine(self): - with tm.ensure_clean() as path: - df = tm.makeDataFrame() - df.to_csv(path) - with tm.assert_raises_regex(ValueError, 'Unknown engine'): - read_csv(path, engine='pyt') + +def test_mut_exclusive(): + 
msg = "mutually exclusive arguments: '[ab]' and '[ab]'" + with tm.assert_raises_regex(TypeError, msg): + com._mut_exclusive(a=1, b=2) + assert com._mut_exclusive(a=1, b=None) == 1 + assert com._mut_exclusive(major=None, major_axis=None) is None + assert com._mut_exclusive(a=None, b=2) == 2 + + +def test_get_callable_name(): + from functools import partial + getname = com._get_callable_name + + def fn(x): + return x + + lambda_ = lambda x: x + part1 = partial(fn) + part2 = partial(part1) + + class somecall(object): + + def __call__(self): + return x # noqa + + assert getname(fn) == 'fn' + assert getname(lambda_) + assert getname(part1) == 'fn' + assert getname(part2) == 'fn' + assert getname(somecall()) == 'somecall' + assert getname(1) is None + + +def test_any_none(): + assert (com._any_none(1, 2, 3, None)) + assert (not com._any_none(1, 2, 3, 4)) + + +def test_all_not_none(): + assert (com._all_not_none(1, 2, 3, 4)) + assert (not com._all_not_none(1, 2, 3, None)) + assert (not com._all_not_none(None, None, None, None)) + + +def test_iterpairs(): + data = [1, 2, 3, 4] + expected = [(1, 2), (2, 3), (3, 4)] + + result = list(com.iterpairs(data)) + + assert (result == expected) + + +def test_split_ranges(): + def _bin(x, width): + "return int(x) as a base2 string of given width" + return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1)) + + def test_locs(mask): + nfalse = sum(np.array(mask) == 0) + + remaining = 0 + for s, e in com.split_ranges(mask): + remaining += e - s + + assert 0 not in mask[s:e] + + # make sure the total items covered by the ranges are a complete cover + assert remaining + nfalse == len(mask) + + # exhaustively test all possible mask sequences of length 8 + ncols = 8 + for i in range(2 ** ncols): + cols = lmap(int, list(_bin(i, ncols))) # count up in base2 + mask = [cols[i] == 1 for i in range(len(cols))] + test_locs(mask) + + # base cases + test_locs([]) + test_locs([0]) + test_locs([1]) + + +def test_map_indices_py(): + data = [4, 
3, 2, 1] + expected = {4: 0, 3: 1, 2: 2, 1: 3} + + result = com.map_indices_py(data) + + assert (result == expected) + + +def test_union(): + a = [1, 2, 3] + b = [4, 5, 6] + + union = sorted(com.union(a, b)) + + assert ((a + b) == union) + + +def test_difference(): + a = [1, 2, 3] + b = [1, 2, 3, 4, 5, 6] + + inter = sorted(com.difference(b, a)) + + assert ([4, 5, 6] == inter) + + +def test_intersection(): + a = [1, 2, 3] + b = [1, 2, 3, 4, 5, 6] + + inter = sorted(com.intersection(a, b)) + + assert (a == inter) + + +def test_groupby(): + values = ['foo', 'bar', 'baz', 'baz2', 'qux', 'foo3'] + expected = {'f': ['foo', 'foo3'], + 'b': ['bar', 'baz', 'baz2'], + 'q': ['qux']} + + grouped = com.groupby(values, lambda x: x[0]) + + for k, v in grouped: + assert v == expected[k] + + +def test_random_state(): + import numpy.random as npr + # Check with seed + state = com._random_state(5) + assert state.uniform() == npr.RandomState(5).uniform() + + # Check with random state object + state2 = npr.RandomState(10) + assert (com._random_state(state2).uniform() == + npr.RandomState(10).uniform()) + + # check with no arg random state + assert com._random_state() is np.random + + # Error for floats or strings + with pytest.raises(ValueError): + com._random_state('test') + + with pytest.raises(ValueError): + com._random_state(5.5) + + +def test_maybe_match_name(): + + matched = com._maybe_match_name( + Series([1], name='x'), Series( + [2], name='x')) + assert (matched == 'x') + + matched = com._maybe_match_name( + Series([1], name='x'), Series( + [2], name='y')) + assert (matched is None) + + matched = com._maybe_match_name(Series([1]), Series([2], name='x')) + assert (matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), Series([2])) + assert (matched is None) + + matched = com._maybe_match_name(Series([1], name='x'), [2]) + assert (matched == 'x') + + matched = com._maybe_match_name([1], Series([2], name='y')) + assert (matched == 'y') + + +def 
test_dict_compat(): + data_datetime64 = {np.datetime64('1990-03-15'): 1, + np.datetime64('2015-03-15'): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp('1990-3-15'): 1, Timestamp('2015-03-15'): 2} + assert (com._dict_compat(data_datetime64) == expected) + assert (com._dict_compat(expected) == expected) + assert (com._dict_compat(data_unchanged) == data_unchanged) + + +def test_standardize_mapping(): + # No uninitialized defaultdicts + with pytest.raises(TypeError): + com.standardize_mapping(collections.defaultdict) + + # No non-mapping subtypes, instance + with pytest.raises(TypeError): + com.standardize_mapping([]) + + # No non-mapping subtypes, class + with pytest.raises(TypeError): + com.standardize_mapping(list) + + fill = {'bad': 'data'} + assert (com.standardize_mapping(fill) == dict) + + # Convert instance to type + assert (com.standardize_mapping({}) == dict) + + dd = collections.defaultdict(list) + assert isinstance(com.standardize_mapping(dd), partial) \ No newline at end of file From 437e0a2c4ec91cbb221441f96fc21be4c754c54d Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Fri, 14 Jul 2017 02:06:24 -0700 Subject: [PATCH 10/14] Fixing whitespace to meet style guidelines --- pandas/io/common.py | 28 ++++++++++++++-------------- pandas/io/html.py | 12 ++++++------ pandas/io/json/json.py | 6 +++--- pandas/tests/io/test_common.py | 5 ++--- pandas/tests/test_common.py | 2 +- 5 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index f14fb43da2831..87cc0272499e6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -191,19 +191,19 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer now supports 'https://:@:/' - + .. versionadded:: 0.21.0 - + encoding : the encoding to use to decode py3 bytes, default is 'utf-8' compression : string, default None - + .. 
versionadded:: 0.18.1 - - auth : tuple, default None - A tuple of string with (username, password) string for + + auth : tuple, default None + A tuple of string with (username, password) string for HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - + .. versionadded:: 0.21.0 verify_ssl : boolean, Default True @@ -282,20 +282,20 @@ def split_auth_from_url(url_with_uname): a url that may or may not contain username and password see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt //:@:/ - + .. versionadded:: 0.21.0 Returns ------- (username, password), url_no_usrpwd : tuple, string Default ('', '') url - A tuple with (username, pwd) pair and + A tuple with (username, pwd) pair and url without username or password (if it contained it ) Raises ------ ValueError for empty url """ - if not url_with_uname: + if not url_with_uname: msg = "Empty url: {_type}" raise ValueError(msg.format(_type=type(url_with_uname))) o = parse_url(url_with_uname) @@ -320,13 +320,13 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): a url that may or may not contain username and password see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt //:@:/ - + .. versionadded:: 0.21.0 - auth : tuple, default None - A tuple of string with (username, password) string for + auth : tuple, default None + A tuple of string with (username, password) string for HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - + .. versionadded:: 0.21.0 verify_ssl : boolean, Default True diff --git a/pandas/io/html.py b/pandas/io/html.py index 7e37d856779db..62bca65c70427 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -123,10 +123,10 @@ def _read(obj, auth=None, verify_ssl=None): Parameters ---------- obj : str, unicode, or file-like - auth : tuple, default None - A tuple of string with (username, password) string for + auth : tuple, default None + A tuple of string with (username, password) string for HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - + .. 
versionadded:: 0.21.0 verify_ssl : boolean, Default True @@ -874,10 +874,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 - auth : tuple, default None - A tuple of string with (username, password) string for + auth : tuple, default None + A tuple of string with (username, password) string for HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - + .. versionadded:: 0.21.0 verify_ssl : boolean, Default True diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 3aa69dd43109a..b403eb45ae2cd 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -263,10 +263,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, .. versionadded:: 0.19.0 - auth : tuple, default None - A tuple of string with (username, password) string for + auth : tuple, default None + A tuple of string with (username, password) string for HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - + .. versionadded:: 0.21.0 verify_ssl : boolean, Default True diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 135385f9d6d7d..82fc4d6271f00 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -190,18 +190,17 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('url, uname, pwd, nurl', [ ('https://a1:b1@cc.com:101/f.csv', 'a1', 'b1', 'https://cc.com:101/f.csv' - ), + ), ('https://ccc.com:1010/aaa.txt', '', '', 'https://ccc.com:1010/aaa.txt' - ), + ), ]) def test_split_url_extract_uname_pwd(self, url, uname, pwd, nurl): (un, pw), ur = common.split_auth_from_url(url) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 5357a4d81f174..57479be4d989f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -220,4 +220,4 @@ def test_standardize_mapping(): assert (com.standardize_mapping({}) == dict) dd = collections.defaultdict(list) - assert 
isinstance(com.standardize_mapping(dd), partial) \ No newline at end of file + assert isinstance(com.standardize_mapping(dd), partial) From 7b034b88ee24758700afb0908376cee6ddacbd29 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Mon, 17 Jul 2017 22:56:36 -0700 Subject: [PATCH 11/14] Added live working test case for http auth and ssl override. Formatting changes --- pandas/io/common.py | 46 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 87cc0272499e6..30071fb4ad052 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -189,27 +189,27 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, Parameters ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), - or buffer - now supports 'https://:@:/' + or buffer now supports url with username and password + eg: 'https://:@:/' - .. versionadded:: 0.21.0 + .. versionadded:: 0.21.0 encoding : the encoding to use to decode py3 bytes, default is 'utf-8' compression : string, default None - .. versionadded:: 0.18.1 + .. versionadded:: 0.18.1 auth : tuple, default None - A tuple of string with (username, password) string for - HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') .. versionadded:: 0.21.0 verify_ssl : boolean, Default True - If False, allow self signed and invalid SSL certificates for https + If False, allow self signed and invalid SSL certificates for https - .. versionadded:: 0.21.0 + .. 
versionadded:: 0.21.0 Returns @@ -279,17 +279,17 @@ def split_auth_from_url(url_with_uname): Parameters ---------- url_with_uname : string - a url that may or may not contain username and password - see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt - //:@:/ + a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //:@:/ - .. versionadded:: 0.21.0 + .. versionadded:: 0.21.0 Returns ------- (username, password), url_no_usrpwd : tuple, string Default ('', '') url - A tuple with (username, pwd) pair and - url without username or password (if it contained it ) + A tuple with (username, pwd) pair and + url without username or password (if it contained it ) Raises ------ @@ -317,22 +317,22 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): Parameters ---------- url_with_uname : string - a url that may or may not contain username and password - see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt - //:@:/ + a url that may or may not contain username and password + see section 3.1 RFC 1738 https://www.ietf.org/rfc/rfc1738.txt + //:@:/ - .. versionadded:: 0.21.0 + .. versionadded:: 0.21.0 auth : tuple, default None - A tuple of string with (username, password) string for - HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') + A tuple of string with (username, password) string for + HTTP(s) basic auth: eg auth= ('roberto', 'panda$4life') - .. versionadded:: 0.21.0 + .. versionadded:: 0.21.0 verify_ssl : boolean, Default True - If False, allow self signed and invalid SSL certificates for https + If False, allow self signed and invalid SSL certificates for https - .. versionadded:: 0.21.0 + .. versionadded:: 0.21.0 Returns ------- From 0a4607a36ab5d47b1a8969c2bbb1b6323752d340 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Tue, 18 Jul 2017 21:18:44 -0700 Subject: [PATCH 12/14] Really added live working test case for http auth and ssl override (i hope). 
Strict require both uname/pwd or neither --- doc/source/whatsnew/v0.21.0.txt | 5 +- pandas/io/common.py | 16 +++++- pandas/tests/io/test_http_auth.py | 89 +++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/io/test_http_auth.py diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7baf87da18039..9eef84aa69033 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -40,8 +40,9 @@ Other Enhancements - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. (:issue:`15972`) - :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) -- :func:`read_csv`, :func:`read_html`, :func:`read_json`, :func:`read_html` now accept auth in url //:@:/, or ``auth`` tuple (username, password) parameter -- :func:`read_csv`, :func:`read_html`, :func:`read_json`, :func:`read_html` now accept ``verify_ssl`` False to disable https/ssl certificate verification (eg: self signed ssl certs in testing) (:issue:`16716`) +- It is now possible to read data (i.e. CSV, JSON, HTML) from a HTTP/HTTPS URL that requires Basic Authentication (:issue:`16716`) +- It is now possible to skip check of validity of SSL certificate (eg: Testing using self-signed SSL certificates) (:issue:`16716`) + .. 
_whatsnew_0210.api_breaking: Backwards incompatible API changes diff --git a/pandas/io/common.py b/pandas/io/common.py index 30071fb4ad052..a353b984564c4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -6,6 +6,7 @@ import mmap import ssl import base64 +import warnings from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type @@ -271,6 +272,11 @@ def file_path_to_url(path): } +class InsecureRequestWarning(Warning): + "Warned when making an unverified HTTPS request. Borrowed from requests" + pass + + def split_auth_from_url(url_with_uname): """ If a url contains username and password, it is extracted and returned @@ -337,6 +343,10 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): Returns ------- Request, kwargs to pass to urlopen. kwargs may be {} or {'context': obj } + + Raises + ------ + ValueError if only one of username or password is provided. """ uname = pwd = None url_no_usrpwd = url_with_uname @@ -345,16 +355,20 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): if not uname and not pwd: (uname, pwd), url_no_usrpwd = split_auth_from_url(url_with_uname) req = Request(url_no_usrpwd) - if uname or pwd: + if uname and pwd: upstr = '{}:{}'.format(uname, pwd) if compat.PY3: b64str = base64.b64encode(bytes(upstr, 'ascii')).decode('utf-8') else: b64str = base64.encodestring(upstr).replace('\n', '') req.add_header("Authorization", "Basic {}".format(b64str)) + elif uname or pwd: + raise ValueError('Only username or password provided without providing the other') kwargs = {} if verify_ssl not in [None, True]: kwargs['context'] = ssl._create_unverified_context() + msg = 'SSL certificate verification is being disabled for HTTPS. 
Possible security risk' + warnings.warn(msg, InsecureRequestWarning) return req, kwargs diff --git a/pandas/tests/io/test_http_auth.py b/pandas/tests/io/test_http_auth.py new file mode 100644 index 0000000000000..79fb7c018af31 --- /dev/null +++ b/pandas/tests/io/test_http_auth.py @@ -0,0 +1,89 @@ +import pytest +import pandas as pd + + +def gen_http_auth_ssl_test_cases(): + """ + Generate list of test case to test for : http/https, username/pwd in url + or as parameters, self signed ssl certs or trusted ssl certs, no auth + or basic auth + """ + def gen_level1_tc(): + test_cases = [] + # The following host doesn't seem to handle urllib but handles + # python requests package. This is because: + # 'urlopen' sets header 'Host' : ':' - acceptable RFC7230 + # 'requests' sets header 'Host' : '' + # so pandas fails on following hosting server (uses some 'apex' server) + # but pandas succeeds on nginx even if port is non-default. + for host, verify_ssl in [('pandas-unittest.site11.com', False, ), + ('handsome-equator.000webhostapp.com', True) + ]: + for sub_path, is_auth in [('no_auth/', False), + ('basic_auth/', True)]: + pre_ports = [('http', ''), + ('https', '')] + for pre, port in pre_ports: + test_cases.append( + [host, verify_ssl, pre, port, sub_path, is_auth]) + return test_cases + + def gen_base_url(pre, auth_prefix, host, port, su_pa): + return '{}://{}{}{}/{}'.format(pre, auth_prefix, host, port, su_pa) + tc2 = [] + uname = 'pandasusr' + pwd = 'pandaspwd' + for host, verify_ssl, pre, port, sp, is_auth in gen_level1_tc(): + u = uname if is_auth else None + p = pwd if is_auth else None + u_no_uname = gen_base_url(pre, '', host, port, sp) + u_with_uname = None + if is_auth: + auth_prefix = '{}:{}@'.format(u, p) if is_auth else '' + u_with_uname = gen_base_url(pre, auth_prefix, host, port, sp) + tc2.append([u_no_uname, u, p, verify_ssl]) + if u_with_uname and u_with_uname != u_no_uname: + tc2.append([u_with_uname, None, None, verify_ssl]) + else: + 
tc2.append([u_no_uname, None, None, verify_ssl]) + return tc2 + + +@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + gen_http_auth_ssl_test_cases()) +def test_http_auth_ssl(url, uname, pwd, verify_ssl): + + def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): + furl = url + fname + kwargs = {} + if uname or pwd: + kwargs['auth'] = (uname, pwd) + if verify_ssl is not None: + kwargs['verify_ssl'] = verify_ssl + msg = '{0: <90} -- auth:[{1: <10}/{2: <10}] v:[{3: <5}]'.format( + url, str(uname), str(pwd), str(verify_ssl)) + tcsv = 'animal,bird\ndog,pigeon\ncat,emu\n' + j = '{"animal":{"0":"dog","1":"cat"},"bird":{"0":"pigeon","1":"emu"}}' + try: + df = pd_read_fn(furl, **kwargs) + if type(df) is list: # html + df = df[0] + smatch = str(df.to_csv(index=False)) == tcsv + jmatch = str(df.to_json()) == j + res = 'Json : {} -- String: {}'.format(jmatch, smatch) + if not jmatch or not smatch: + raise Exception(' ** ERROR:' + res) + else: + res += ' OK' + print(msg + ' ' + res) + except Exception as ex: + print(msg + ' ' + str(ex)) + raise ex + return True + + for pd_read_fn, fname in [(pd.read_csv, 'aaa.csv'), + (pd.read_json, 'jdoc.json'), + (pd.read_excel, 'ex_doc.xlsx'), + (pd.read_html, 'html_file.html') + ]: + assert get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname) From 209dd58a5c088d149f98b35921934245886aa63d Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Wed, 19 Jul 2017 19:56:07 -0700 Subject: [PATCH 13/14] Lot more test cases for HTTP. However, one fails when run with test_html.py. Common logic update. 
whatsnew fixed --- doc/source/whatsnew/v0.21.0.txt | 129 ++++++++++++++++++++++++------ pandas/io/common.py | 8 +- pandas/tests/io/test_http_auth.py | 120 ++++++++++++++++++++------- 3 files changed, 198 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9eef84aa69033..eae60828a9e5e 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,34 +25,101 @@ New features - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) + +.. _whatsnew_0210.enhancements.infer_objects: + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.infer_objects` and :meth:`Series.infer_objects` +methods have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method. See the documentation :ref:`here ` +for more details. (:issue:`11221`) + +This method only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). + +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + .. _whatsnew_0210.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ - The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. 
If a merge is found to not be an example of specified merge type, an exception of type ``MergeError`` will be raised. For more, see :ref:`here ` (:issue:`16270`) -- ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) -- ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) -- ``Series.rename_axis()`` and ``DataFrame.rename_axis()`` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) -- :func:`to_pickle` has gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`Series.to_dict` and :func:`DataFrame.to_dict` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) +- :func:`RangeIndex.append` now returns a ``RangeIndex`` object when possible (:issue:`16212`) +- :func:`Series.rename_axis` and :func:`DataFrame.rename_axis` with ``inplace=True`` now return ``None`` while renaming the axis inplace. (:issue:`15704`) +- :func:`Series.to_pickle` and :func:`DataFrame.to_pickle` have gained a ``protocol`` parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ - :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an ``inplace`` argument. (:issue:`15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when ``margins=True``. 
(:issue:`15972`) -- :func:`Dataframe.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) -- It is now possible to read data (i.e. CSV, JSON, HTML) from a HTTP/HTTPS URL that requires Basic Authentication (:issue:`16716`) -- It is now possible to skip check of validity of SSL certificate (eg: Testing using self-signed SSL certificates) (:issue:`16716`) +- :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`) +- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`) +- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`) + +.. _whatsnew_0210.enhancements.read_csv: + +``read_csv`` allow basic auth and skip SSL verification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is possible to read data (i.e. CSV, JSON, HTML) from a URL that is password-protected (:issue:`16716`) +The :meth:`DataFrame.read_csv`, :meth:`DataFrame.read_html`, :meth:`DataFrame.read_json`, +:meth:`DataFrame.read_excel` can now perform Basic Authentication over HTTP/HTTPS +now accept an optional parameter of ``auth`` of type tuple containing username and password +It also fixes an issue where a url containing username and password in the standard form now works: +https://:@:/ +Use of username and password over HTTP (without ssl) is a security threat and not recommended. + +.. ipython:: python + url = 'http://:/my.csv' + username = 'darth' + password = 'cand3stroypassword' + df = pd.read_csv(url, auth=(username, password)) + + # or + df = pd.read_csv('https://darth:cand3stroypassword@myhost.com:1010/my.csv') + +It is now also possible to bypass verification of SSL certificate for HTTPS call. 
+The :meth:`DataFrame.read_csv`, :meth:`DataFrame.read_html`, :meth:`DataFrame.read_json`, +:meth:`DataFrame.read_excel` can now accept an optional parameter ``verify_ssl=False`` to bypass SSL verification. +Doing this creates a security risk and is not recommended. It will raise a warning `pandas.io.common.InsecureRequestWarning` +This functionality is added to primarily help in testability scenarios or within private networks where obtaining SSL certificates is burdensome. + +.. ipython:: python + url = 'https://my-test-server.internal/my.csv' + df = pd.read_csv(url, verify_ssl=False) # succeed with pandas.io.common.InsecureRequestWarning + + df = pd.read_csv(url) # Fails by default .. _whatsnew_0210.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_0210.api_breaking.pandas_eval: + Improved error handling during item assignment in pd.eval ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. _whatsnew_0210.api_breaking.pandas_eval: - :func:`eval` will now raise a ``ValueError`` when item assignment malfunctions, or inplace operations are specified, but there is no item assignment in the expression (:issue:`16732`) @@ -94,9 +161,14 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in ... ValueError: Cannot operate inplace if there is no assignment +.. _whatsnew_0210.api: + +Other API Changes +^^^^^^^^^^^^^^^^^ + - Support has been dropped for Python 3.4 (:issue:`15251`) - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. 
(:issue:`16022`) -- Accessing a non-existent attribute on a closed :class:`HDFStore` will now +- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) @@ -104,12 +176,6 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) - Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - -.. _whatsnew_0210.api: - -Other API Changes -^^^^^^^^^^^^^^^^^ - - Moved definition of ``MergeError`` to the ``pandas.errors`` module. @@ -119,6 +185,7 @@ Deprecations ~~~~~~~~~~~~ - :func:`read_excel()` has deprecated ``sheetname`` in favor of ``sheet_name`` for consistency with ``.to_excel()`` (:issue:`10559`). +- ``pd.options.html.border`` has been deprecated in favor of ``pd.options.display.html.border`` (:issue:`15793`). .. 
_whatsnew_0210.prior_deprecations: @@ -126,10 +193,12 @@ Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`read_excel()` has dropped the ``has_index_names`` parameter (:issue:`10967`) +- The ``pd.options.display.height`` configuration has been dropped (:issue:`3663`) +- The ``pd.options.display.line_width`` configuration has been dropped (:issue:`2881`) - The ``pd.options.display.mpl_style`` configuration has been dropped (:issue:`12190`) - ``Index`` has dropped the ``.sym_diff()`` method in favor of ``.symmetric_difference()`` (:issue:`12591`) - ``Categorical`` has dropped the ``.order()`` and ``.sort()`` methods in favor of ``.sort_values()`` (:issue:`12882`) -- :func:`eval` and :method:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) +- :func:`eval` and :func:`DataFrame.eval` have changed the default of ``inplace`` from ``None`` to ``False`` (:issue:`11149`) - The function ``get_offset_name`` has been dropped in favor of the ``.freqstr`` attribute for an offset (:issue:`11834`) @@ -138,6 +207,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`) .. _whatsnew_0210.bug_fixes: @@ -145,7 +215,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Conversion ^^^^^^^^^^ @@ -157,34 +226,41 @@ Indexing - When called with a null slice (e.g. ``df.iloc[:]``), the ``.iloc`` and ``.loc`` indexers return a shallow copy of the original object. Previously they returned the original object. (:issue:`13873`). - When called on an unsorted ``MultiIndex``, the ``loc`` indexer now will raise ``UnsortedIndexError`` only if proper slicing is used on non-sorted levels (:issue:`16734`). 
- Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). +- Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). +- Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). +- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) +- Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) I/O ^^^ - Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`) +- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) Plotting ^^^^^^^^ - +- Bug in plotting methods using ``secondary_y`` and ``fontsize`` not setting secondary axis font size (:issue:`12565`) Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``DataFrame.resample().size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) -- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) -- Bug in ``.rolling.quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) - +- Bug in ``DataFrame.resample(...).size()`` where an empty ``DataFrame`` did not return a ``Series`` (:issue:`14962`) +- Bug in :func:`infer_freq` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) +- Bug in ``.rolling(...).quantile()`` which incorrectly used different defaults than :func:`Series.quantile()` and :func:`DataFrame.quantile()` (:issue:`9413`, :issue:`16211`) +- Bug in ``groupby.transform()`` that would coerce boolean dtypes back to float (:issue:`16875`) Sparse ^^^^^^ - +- Bug in ``SparseSeries`` raises 
 ``AttributeError`` when a dictionary is passed in as data (:issue:`16777`) Reshaping ^^^^^^^^^ - +- Joining/Merging with a non-unique ``PeriodIndex`` raised a ``TypeError`` (:issue:`16871`) +- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) +- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Numeric @@ -194,9 +270,10 @@ Numeric Categorical ^^^^^^^^^^^ -- Bug in ``:func:Series.isin()`` when called with a categorical (:issue`16639`) +- Bug in :func:`Series.isin` when called with a categorical (:issue:`16639`) Other ^^^^^ - Bug in :func:`eval` where the ``inplace`` parameter was being incorrectly handled (:issue:`16732`) +- Bug in ``.isin()`` in which checking membership in empty ``Series`` objects raised an error (:issue:`16991`) diff --git a/pandas/io/common.py b/pandas/io/common.py index a353b984564c4..0c06cc209252f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -363,11 +363,13 @@ def get_urlopen_args(url_with_uname, auth=None, verify_ssl=True): b64str = base64.encodestring(upstr).replace('\n', '') req.add_header("Authorization", "Basic {}".format(b64str)) elif uname or pwd: - raise ValueError('Only username or password provided without providing the other') + msg = 'Only username or password provided without the other' + raise ValueError(msg) kwargs = {} - if verify_ssl not in [None, True]: + if verify_ssl is False and url_no_usrpwd.lower().startswith('https://'): kwargs['context'] = ssl._create_unverified_context() - msg = 'SSL certificate verification is being disabled for HTTPS. Possible security risk' + msg = 'SSL certificate verification is being disabled for HTTPS.'
+ \ + ' Possible security risk' warnings.warn(msg, InsecureRequestWarning) return req, kwargs diff --git a/pandas/tests/io/test_http_auth.py b/pandas/tests/io/test_http_auth.py index 79fb7c018af31..6e74964679b02 100644 --- a/pandas/tests/io/test_http_auth.py +++ b/pandas/tests/io/test_http_auth.py @@ -1,8 +1,21 @@ import pytest +from pandas.io.common import InsecureRequestWarning import pandas as pd +import pandas.util.testing as tm +from urllib2 import HTTPError +uname = 'pandasusr' +pwd = 'pandaspwd' +no_auth_path = 'no_auth/' +basic_auth_path = 'basic_auth/' +valid_ssl_url = 'handsome-equator.000webhostapp.com' +invalid_ssl_url = 'pandas-unittest.site11.com' -def gen_http_auth_ssl_test_cases(): + +def gen_http_auth_ssl_test_cases(uname, + pwd, + is_auth, + sub_path): """ Generate list of test case to test for : http/https, username/pwd in url or as parameters, self signed ssl certs or trusted ssl certs, no auth @@ -15,24 +28,20 @@ def gen_level1_tc(): # 'urlopen' sets header 'Host' : ':' - acceptable RFC7230 # 'requests' sets header 'Host' : '' # so pandas fails on following hosting server (uses some 'apex' server) - # but pandas succeeds on nginx even if port is non-default. - for host, verify_ssl in [('pandas-unittest.site11.com', False, ), - ('handsome-equator.000webhostapp.com', True) + # but pandas succeeds on nginx even if port is non-default. 
+ for host, verify_ssl in [(invalid_ssl_url, False), + (valid_ssl_url, True) ]: - for sub_path, is_auth in [('no_auth/', False), - ('basic_auth/', True)]: - pre_ports = [('http', ''), - ('https', '')] - for pre, port in pre_ports: - test_cases.append( - [host, verify_ssl, pre, port, sub_path, is_auth]) + pre_ports = [('http', ''), + ('https', '')] + for pre, port in pre_ports: + test_cases.append( + [host, verify_ssl, pre, port, sub_path, is_auth]) return test_cases def gen_base_url(pre, auth_prefix, host, port, su_pa): return '{}://{}{}{}/{}'.format(pre, auth_prefix, host, port, su_pa) tc2 = [] - uname = 'pandasusr' - pwd = 'pandaspwd' for host, verify_ssl, pre, port, sp, is_auth in gen_level1_tc(): u = uname if is_auth else None p = pwd if is_auth else None @@ -49,9 +58,59 @@ def gen_base_url(pre, auth_prefix, host, port, su_pa): return tc2 +valid_no_auth = gen_http_auth_ssl_test_cases(uname='', + pwd='', + is_auth=False, + sub_path=no_auth_path) + +valid_auth = gen_http_auth_ssl_test_cases(uname=uname, + pwd=pwd, + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow +@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + valid_no_auth + valid_auth) +def test_http_valid_auth(url, uname, pwd, verify_ssl): + check_http_auth(url, uname, pwd, verify_ssl) + + +wrong_auth = gen_http_auth_ssl_test_cases(uname='fakepwd', + pwd='fakepwd', + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow @pytest.mark.parametrize('url, uname, pwd, verify_ssl', - gen_http_auth_ssl_test_cases()) -def test_http_auth_ssl(url, uname, pwd, verify_ssl): + wrong_auth) +def test_http_invalid_auth(url, uname, pwd, verify_ssl): + with pytest.raises(HTTPError): + check_http_auth(url, uname, pwd, verify_ssl) + + +blank_uname = gen_http_auth_ssl_test_cases(uname='', + pwd='fakepwd', + is_auth=True, + sub_path=basic_auth_path) + +blank_pwd = gen_http_auth_ssl_test_cases(uname='fakepwd', + pwd='', + is_auth=True, + sub_path=basic_auth_path) + + +@pytest.mark.slow 
+@pytest.mark.parametrize('url, uname, pwd, verify_ssl', + blank_uname + blank_pwd) +def test_http_require_uname_and_pwd(url, uname, pwd, verify_ssl): + with pytest.raises(ValueError): + check_http_auth(url, uname, pwd, verify_ssl) + + +@tm.network +def check_http_auth(url, uname, pwd, verify_ssl): def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): furl = url + fname @@ -61,24 +120,24 @@ def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): if verify_ssl is not None: kwargs['verify_ssl'] = verify_ssl msg = '{0: <90} -- auth:[{1: <10}/{2: <10}] v:[{3: <5}]'.format( - url, str(uname), str(pwd), str(verify_ssl)) + furl, str(uname), str(pwd), str(verify_ssl)) tcsv = 'animal,bird\ndog,pigeon\ncat,emu\n' j = '{"animal":{"0":"dog","1":"cat"},"bird":{"0":"pigeon","1":"emu"}}' - try: + if verify_ssl or furl.lower().startswith('http://'): df = pd_read_fn(furl, **kwargs) - if type(df) is list: # html - df = df[0] - smatch = str(df.to_csv(index=False)) == tcsv - jmatch = str(df.to_json()) == j - res = 'Json : {} -- String: {}'.format(jmatch, smatch) - if not jmatch or not smatch: - raise Exception(' ** ERROR:' + res) - else: - res += ' OK' - print(msg + ' ' + res) - except Exception as ex: - print(msg + ' ' + str(ex)) - raise ex + else: + with tm.assert_produces_warning(InsecureRequestWarning): + df = pd_read_fn(furl, **kwargs) + if type(df) is list: # html + df = df[0] + smatch = str(df.to_csv(index=False)) == tcsv + jmatch = str(df.to_json()) == j + res = 'Json : {} -- String: {}'.format(jmatch, smatch) + if not jmatch or not smatch: + raise Exception(' ** ERROR:' + res) + else: + res += ' OK' + print(msg + ' ' + res) return True for pd_read_fn, fname in [(pd.read_csv, 'aaa.csv'), @@ -87,3 +146,4 @@ def get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname): (pd.read_html, 'html_file.html') ]: assert get_df(url, uname, pwd, verify_ssl, pd_read_fn, fname) + return From f520f7b9050186fe84301508daec27e310010049 Mon Sep 17 00:00:00 2001 From: Sky NSS Date: Wed, 19 
Jul 2017 20:44:47 -0700 Subject: [PATCH 14/14] Updated test case to work with py3. However still fails when run after 'test_html.py' with error AssertionError: Did not see expected warning of class 'InsecureRequestWarning' --- pandas/tests/io/test_http_auth.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_http_auth.py b/pandas/tests/io/test_http_auth.py index 6e74964679b02..26c9e0cf63ce5 100644 --- a/pandas/tests/io/test_http_auth.py +++ b/pandas/tests/io/test_http_auth.py @@ -2,7 +2,10 @@ from pandas.io.common import InsecureRequestWarning import pandas as pd import pandas.util.testing as tm -from urllib2 import HTTPError +try: + from urllib2 import HTTPError +except ImportError: + from urllib.error import HTTPError uname = 'pandasusr' pwd = 'pandaspwd'