Skip to content

Commit 8b5f337

Browse files
author
Sky NSS
committed
Added http(s) basic auth and allow self signed ssl certs
1 parent 9d13227 commit 8b5f337

File tree

6 files changed

+155
-23
lines changed

6 files changed

+155
-23
lines changed

pandas/io/common.py

+68-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import csv
55
import codecs
66
import mmap
7+
import ssl
8+
import base64
79
from contextlib import contextmanager, closing
810

911
from pandas.compat import StringIO, BytesIO, string_types, text_type
@@ -49,7 +51,11 @@
4951

5052

5153
if compat.PY3:
52-
from urllib.request import urlopen, pathname2url
54+
from urllib.request import (urlopen, pathname2url, build_opener,
55+
install_opener,
56+
HTTPPasswordMgrWithDefaultRealm,
57+
HTTPBasicAuthHandler,
58+
HTTPSHandler)
5359
_urlopen = urlopen
5460
from urllib.parse import urlparse as parse_url
5561
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,6 +64,7 @@
5864
from http.client import HTTPException # noqa
5965
else:
6066
from urllib2 import urlopen as _urlopen
67+
from urllib2 import Request
6168
from urllib import urlencode, pathname2url # noqa
6269
from urlparse import urlparse as parse_url
6370
from urlparse import uses_relative, uses_netloc, uses_params, urljoin
@@ -177,7 +184,8 @@ def _stringify_path(filepath_or_buffer):
177184

178185

179186
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
180-
compression=None):
187+
compression=None, username=None,
188+
password=None, verify_ssl=None):
181189
"""
182190
If the filepath_or_buffer is a url, translate and return the buffer.
183191
Otherwise passthrough.
@@ -186,7 +194,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
186194
----------
187195
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
188196
or buffer
197+
supports 'https://username:password@fqdn.com:port/aaa.csv'
189198
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
199+
compression:
200+
username: Authentication username (for https basic auth)
201+
password: Authentication password (for https basic auth)
202+
verify_ssl: Default True. If False, allow self signed and invalid SSL
203+
certificates for https
190204
191205
Returns
192206
-------
@@ -195,7 +209,11 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
195209
filepath_or_buffer = _stringify_path(filepath_or_buffer)
196210

197211
if _is_url(filepath_or_buffer):
198-
req = _urlopen(filepath_or_buffer)
212+
ureq, kwargs = get_urlopen_args(filepath_or_buffer,
213+
uname=username,
214+
pwd=password,
215+
verify_ssl=verify_ssl)
216+
req = _urlopen(ureq, **kwargs)
199217
content_encoding = req.headers.get('Content-Encoding', None)
200218
if content_encoding == 'gzip':
201219
# Override compression based on Content-Encoding header
@@ -244,6 +262,53 @@ def file_path_to_url(path):
244262
}
245263

246264

265+
def split_uname_from_url(url_with_uname):
    """
    Split embedded basic-auth credentials out of a URL.

    Parameters
    ----------
    url_with_uname : str
        A URL that may carry ``user:password@`` in front of the host,
        e.g. ``'https://user:pwd@host.com:1010/file.csv'``.

    Returns
    -------
    username : str or None
        User name found in the URL, ``None`` if absent.
    password : str or None
        Password found in the URL, ``None`` if absent.
    url_no_usrpwd : str
        The URL with the ``userinfo@`` part removed; every other character
        is left exactly as it was passed in.
    """
    o = parse_url(url_with_uname)
    # Work on the verbatim netloc instead of rebuilding it from
    # o.username / o.password / o.hostname: ``hostname`` is lower-cased by
    # urlparse and a missing password would be formatted as 'None', so a
    # rebuilt search string can silently fail to match and leave the
    # credentials in the returned URL.
    _, sep, hostport = o.netloc.rpartition('@')
    if not sep:
        # No userinfo component at all: nothing to strip.
        return o.username, o.password, url_with_uname
    # The netloc is the first place an '@'-bearing substring can occur
    # (a scheme cannot contain '@'), so replacing once is sufficient.
    url_no_usrpwd = url_with_uname.replace(o.netloc, hostport, 1)
    return o.username, o.password, url_no_usrpwd
270+
271+
272+
def get_urlopen_args(url_with_uname, uname=None, pwd=None, verify_ssl=True):
    """
    Prepare the positional argument and keyword arguments for ``urlopen``.

    Parameters
    ----------
    url_with_uname : str
        Target URL; may embed ``user:password@`` before the host.
    uname : str, optional
        Basic-auth user name. When neither ``uname`` nor ``pwd`` is given,
        credentials are taken from the URL itself.
    pwd : str, optional
        Basic-auth password.
    verify_ssl : bool or None, default True
        Forwarded unchanged to the version-specific helper.

    Returns
    -------
    req, kwargs
        The pair produced by ``get_urlopen_args_py3`` on Python 3 or
        ``get_urlopen_args_py2`` otherwise; callers pass them on as
        ``urlopen(req, **kwargs)``.
    """
    if uname or pwd:
        # Explicit credentials win; the URL is used exactly as given.
        target_url = url_with_uname
    else:
        uname, pwd, target_url = split_uname_from_url(url_with_uname)

    builder = get_urlopen_args_py3 if compat.PY3 else get_urlopen_args_py2
    request, extra_kwargs = builder(uname, pwd, target_url,
                                    verify_ssl=verify_ssl)
    return request, extra_kwargs
283+
284+
285+
def get_urlopen_args_py2(uname, pwd, url_no_usrpwd, verify_ssl=True):
    """
    Build the ``urlopen`` request and keyword arguments (Python 2 flavour).

    Parameters
    ----------
    uname : str or None
        Basic-auth user name.
    pwd : str or None
        Basic-auth password.
    url_no_usrpwd : str
        URL to open, without embedded credentials.
    verify_ssl : bool or None, default True
        ``None`` and ``True`` keep normal certificate verification; any
        other value requests an unverified SSL context.

    Returns
    -------
    req : Request
        Request for ``url_no_usrpwd``. It carries a basic-auth
        ``Authorization`` header only when a user name or password was
        supplied.
    kwargs : dict
        Extra keyword arguments for ``urlopen``; contains ``'context'``
        only when verification is disabled.
    """
    req = Request(url_no_usrpwd)
    # Only authenticate when credentials exist. Unconditionally formatting
    # '{}:{}' would send base64('None:None') with every anonymous request.
    if uname or pwd:
        # A missing half is the empty string, not the literal text 'None'.
        upstr = '{}:{}'.format(uname or '', pwd or '')
        if not isinstance(upstr, bytes):
            # Text string: b64encode needs bytes.
            upstr = upstr.encode('utf-8')
        # b64encode never inserts newlines, unlike encodestring.
        base64string = base64.b64encode(upstr).decode('ascii')
        req.add_header("Authorization", "Basic {}".format(base64string))
    kwargs = {}
    if verify_ssl not in [None, True]:
        # Caller explicitly opted out of certificate verification
        # (self-signed / invalid certificates are accepted).
        kwargs['context'] = ssl._create_unverified_context()
    return req, kwargs
295+
296+
297+
def get_urlopen_args_py3(uname, pwd, url_no_usrpwd, verify_ssl=True):
    """
    Build the ``urlopen`` request and keyword arguments (Python 3 flavour).

    Nothing is installed globally: credentials travel on the returned
    request and the SSL settings travel in the returned keyword arguments,
    so unrelated ``urlopen`` calls in the same process are unaffected.

    Parameters
    ----------
    uname : str or None
        Basic-auth user name.
    pwd : str or None
        Basic-auth password.
    url_no_usrpwd : str
        URL to open, without embedded credentials.
    verify_ssl : bool or None, default True
        ``None`` and ``True`` keep normal certificate verification; any
        other value disables certificate and host-name checks.

    Returns
    -------
    req : urllib.request.Request
        Request for ``url_no_usrpwd``. It carries a basic-auth
        ``Authorization`` header only when a user name or password was
        supplied.
    kwargs : dict
        Extra keyword arguments for ``urlopen``; contains ``'context'``
        only when verification is disabled.
    """
    # Imported locally because the module-level Python 3 import list does
    # not bring ``Request`` into scope.
    from urllib.request import Request

    req = Request(url_no_usrpwd)
    if uname or pwd:
        # A missing half is the empty string, not the literal text 'None'.
        userpass = '{}:{}'.format(uname or '', pwd or '')
        token = base64.b64encode(userpass.encode('utf-8')).decode('ascii')
        req.add_header("Authorization", "Basic {}".format(token))

    kwargs = {}
    if verify_ssl not in [None, True]:
        # Per-call context instead of install_opener(): a globally
        # installed opener would keep skipping verification (and keep the
        # credentials) for every later urlopen() in the process.
        context = ssl.create_default_context()
        # check_hostname must be cleared before verify_mode can be set to
        # CERT_NONE.
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
        kwargs['context'] = context
    return req, kwargs
310+
311+
247312
def _infer_compression(filepath_or_buffer, compression):
248313
"""
249314
Get the compression method for filepath_or_buffer. If compression='infer',

pandas/io/excel.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pandas.errors import EmptyDataError
2121
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
2222
get_filepath_or_buffer, _NA_VALUES,
23-
_stringify_path)
23+
_stringify_path, get_urlopen_args)
2424
from pandas.core.indexes.period import Period
2525
import pandas._libs.json as json
2626
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -200,7 +200,6 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
200200
convert_float=True, converters=None, dtype=None,
201201
true_values=None, false_values=None, engine=None,
202202
squeeze=False, **kwds):
203-
204203
# Can't use _deprecate_kwarg since sheetname=None has a special meaning
205204
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
206205
warnings.warn("The `sheetname` keyword is deprecated, use "
@@ -211,7 +210,11 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
211210
"Use just `sheet_name`")
212211

213212
if not isinstance(io, ExcelFile):
214-
io = ExcelFile(io, engine=engine)
213+
io = ExcelFile(io,
214+
engine=engine,
215+
username=kwds.get('username', None),
216+
password=kwds.get('password', None),
217+
verify_ssl=kwds.get('verify_ssl', None))
215218

216219
return io._parse_excel(
217220
sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
@@ -259,7 +262,12 @@ def __init__(self, io, **kwds):
259262
# If io is a url, want to keep the data as bytes so can't pass
260263
# to get_filepath_or_buffer()
261264
if _is_url(self._io):
262-
io = _urlopen(self._io)
265+
verify_ssl = kwds.get('verify_ssl', None)
266+
ureq, kwargs = get_urlopen_args(self._io,
267+
uname=kwds.get('username', None),
268+
pwd=kwds.get('password', None),
269+
verify_ssl=verify_ssl)
270+
io = _urlopen(ureq, **kwargs)
263271
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
264272
io, _, _ = get_filepath_or_buffer(self._io)
265273

pandas/io/html.py

+35-11
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
from pandas.core.dtypes.common import is_list_like
1616
from pandas.errors import EmptyDataError
1717
from pandas.io.common import (_is_url, urlopen,
18-
parse_url, _validate_header_arg)
18+
parse_url, _validate_header_arg,
19+
get_urlopen_args)
1920
from pandas.io.parsers import TextParser
2021
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
2122
raise_with_traceback, binary_type)
@@ -116,19 +117,22 @@ def _get_skiprows(skiprows):
116117
type(skiprows).__name__)
117118

118119

119-
def _read(obj):
120+
def _read(obj, username=None, password=None, verify_ssl=None):
120121
"""Try to read from a url, file or string.
121122
122123
Parameters
123124
----------
124125
obj : str, unicode, or file-like
125-
126+
username: username for http basic auth
127+
password: password for http basic auth
128+
verify_ssl: Default True. Set to False to disable cert verification
126129
Returns
127130
-------
128131
raw_text : str
129132
"""
130133
if _is_url(obj):
131-
with urlopen(obj) as url:
134+
ureq, kwargs = get_urlopen_args(obj, username, password, verify_ssl)
135+
with urlopen(ureq, **kwargs) as url:
132136
text = url.read()
133137
elif hasattr(obj, 'read'):
134138
text = obj.read()
@@ -187,11 +191,15 @@ class _HtmlFrameParser(object):
187191
functionality.
188192
"""
189193

190-
def __init__(self, io, match, attrs, encoding):
194+
def __init__(self, io, match, attrs, encoding, username=None,
             password=None, verify_ssl=None):
    """
    Store the parser inputs on the instance.

    Parameters
    ----------
    io : object
        Stored as ``self.io``.
    match : object
        Stored as ``self.match``.
    attrs : object
        Stored as ``self.attrs``.
    encoding : object
        Stored as ``self.encoding``.
    username : optional
        Stored as ``self.username``.
    password : optional
        Stored as ``self.password``.
    verify_ssl : optional
        Stored as ``self.verify_ssl``.
    """
    # What to parse and how to select tables from it.
    self.io, self.match, self.attrs = io, match, attrs
    self.encoding = encoding
    # Connection options kept for later use by the instance.
    self.username, self.password = username, password
    self.verify_ssl = verify_ssl
195203

196204
def parse_tables(self):
197205
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -444,7 +452,8 @@ def _parse_tables(self, doc, match, attrs):
444452
return result
445453

446454
def _setup_build_doc(self):
    """
    Read the raw document text for ``self.io``.

    Returns
    -------
    The non-empty value returned by ``_read``.

    Raises
    ------
    ValueError
        If ``_read`` returns an empty (falsy) result.
    """
    document_text = _read(self.io, self.username, self.password,
                          self.verify_ssl)
    if document_text:
        return document_text
    raise ValueError('No text parsed from document: %s' % self.io)
@@ -731,8 +740,12 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
731740
retained = None
732741
for flav in flavor:
733742
parser = _parser_dispatch(flav)
734-
p = parser(io, compiled_match, attrs, encoding)
735-
743+
p = parser(io, compiled_match,
744+
attrs,
745+
encoding,
746+
username=kwargs.get('username', None),
747+
password=kwargs.get('password', None),
748+
verify_ssl=kwargs.get('verify_ssl', None))
736749
try:
737750
tables = p.parse_tables()
738751
except Exception as caught:
@@ -755,7 +768,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
755768
skiprows=None, attrs=None, parse_dates=False,
756769
tupleize_cols=False, thousands=',', encoding=None,
757770
decimal='.', converters=None, na_values=None,
758-
keep_default_na=True):
771+
keep_default_na=True, username=None, password=None,
772+
verify_ssl=False):
759773
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
760774
761775
Parameters
@@ -856,7 +870,16 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
856870
857871
.. versionadded:: 0.19.0
858872
859-
Returns
873+
username : str, default None
874+
username for HTTP(s) basic auth
875+
876+
password : str, default None
877+
password for HTTP(s) basic auth
878+
879+
verify_ssl : bool, default True
880+
If False, ssl certificate is not verified (allow self signed SSL certs)
881+
882+
Returns
860883
-------
861884
dfs : list of DataFrames
862885
@@ -903,4 +926,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
903926
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
904927
thousands=thousands, attrs=attrs, encoding=encoding,
905928
decimal=decimal, converters=converters, na_values=na_values,
906-
keep_default_na=keep_default_na)
929+
keep_default_na=keep_default_na, username=username,
930+
password=password, verify_ssl=verify_ssl)

pandas/io/json/json.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def write(self):
174174
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
175175
convert_axes=True, convert_dates=True, keep_default_dates=True,
176176
numpy=False, precise_float=False, date_unit=None, encoding=None,
177-
lines=False):
177+
lines=False, username=None, password=None, verify_ssl=None):
178178
"""
179179
Convert a JSON string to pandas object
180180
@@ -263,6 +263,11 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
263263
264264
.. versionadded:: 0.19.0
265265
266+
username: str, default None. Authentication username for HTTP(s) basic auth
267+
password: str, default None. Authentication password for HTTP(s) basic auth
268+
verify_ssl: boolean, default None (True).
269+
If False, allow self-signed SSL certificates
270+
266271
Returns
267272
-------
268273
result : Series or DataFrame, depending on the value of `typ`.
@@ -321,7 +326,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
321326
"""
322327

323328
filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf,
324-
encoding=encoding)
329+
encoding=encoding,
330+
username=username,
331+
password=password,
332+
verify_ssl=verify_ssl)
325333
if isinstance(filepath_or_buffer, compat.string_types):
326334
try:
327335
exists = os.path.exists(filepath_or_buffer)

pandas/io/parsers.py

+19-3
Original file line numberDiff line numberDiff line change
@@ -391,9 +391,13 @@ def _read(filepath_or_buffer, kwds):
391391
kwds['encoding'] = encoding
392392

393393
compression = kwds.get('compression')
394+
username = kwds.get('username', None)
395+
password = kwds.get('password', None)
396+
verify_ssl = kwds.get('verify_ssl', None)
394397
compression = _infer_compression(filepath_or_buffer, compression)
395398
filepath_or_buffer, _, compression = get_filepath_or_buffer(
396-
filepath_or_buffer, encoding, compression)
399+
filepath_or_buffer, encoding, compression, username, password,
400+
verify_ssl)
397401
kwds['compression'] = compression
398402

399403
if kwds.get('date_parser', None) is not None:
@@ -574,7 +578,14 @@ def parser_f(filepath_or_buffer,
574578
low_memory=_c_parser_defaults['low_memory'],
575579
buffer_lines=None,
576580
memory_map=False,
577-
float_precision=None):
581+
float_precision=None,
582+
583+
# Basic auth (http/https)
584+
username=None,
585+
password=None,
586+
587+
# skip verify self signed SSL certificates
588+
verify_ssl=None):
578589

579590
# Alias sep -> delimiter.
580591
if delimiter is None:
@@ -654,7 +665,12 @@ def parser_f(filepath_or_buffer,
654665
mangle_dupe_cols=mangle_dupe_cols,
655666
tupleize_cols=tupleize_cols,
656667
infer_datetime_format=infer_datetime_format,
657-
skip_blank_lines=skip_blank_lines)
668+
skip_blank_lines=skip_blank_lines,
669+
670+
username=username,
671+
password=password,
672+
verify_ssl=verify_ssl
673+
)
658674

659675
return _read(filepath_or_buffer, kwds)
660676

pandas/tests/io/test_common.py

+11
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,17 @@ def test_write_fspath_hdf5(self):
190190

191191
tm.assert_frame_equal(result, expected)
192192

193+
def test_split_url_extract_uname_pwd(self):
    # Each case: (input url, expected user name, expected password,
    # expected url with the credentials removed).
    cases = [
        ('https://aaa:bbb@ccc.com:1010/aaa.txt',
         'aaa',
         'bbb',
         'https://ccc.com:1010/aaa.txt'),
        # A URL without credentials must come back untouched.
        ('https://ccc.com:1010/aaa.txt',
         None,
         None,
         'https://ccc.com:1010/aaa.txt'),
    ]
    for url, uname, pwd, nurl in cases:
        un, p, u = common.split_uname_from_url(url)
        assert u == nurl
        assert un == uname
        assert p == pwd
203+
193204

194205
class TestMMapWrapper(object):
195206

0 commit comments

Comments
 (0)