Skip to content

Commit cb66c6c

Browse files
author
Sky NSS
committed
Rebase with pandas-master. Changing version to v0.22
1 parent c3c04e2 commit cb66c6c

File tree

6 files changed

+240
-18
lines changed

6 files changed

+240
-18
lines changed

doc/source/whatsnew/v0.22.0.txt

+50
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,56 @@ levels <merging.merge_on_columns_and_levels>` documentation section.
6363
left.merge(right, on=['key1', 'key2'])
6464

6565

66+
.. _whatsnew_0220.enhancements.read_csv:
67+
68+
``read_csv`` uses `python-requests` (if installed) to support basic auth and much more
69+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
70+
71+
When ``http_params`` is supplied and the `python-requests` library is installed, http(s) URLs are fetched with it. Otherwise pandas continues to use urllib.
72+
The :meth:`DataFrame.read_csv`, :meth:`DataFrame.read_html`, :meth:`DataFrame.read_json`,
73+
:meth:`DataFrame.read_excel` now allow optional param of ``http_params`` to pass in
74+
parameters for basic auth, disable ssl strict check or even a requests.Session() object
75+
76+
77+
.. ipython:: python
78+
import pandas as pd
79+
80+
# http_params is optional parameter. If it is non-empty, it attempts to use python-requests library
81+
df = pd.read_csv('https://uname:pwd@aa.com/bb.csv', http_params= {'auth': None} ) # now url can contain username and pwd
82+
# Note - all basic auth scenarios require python-requests library
83+
84+
# Basic Auth
85+
df = pd.read_csv('https://aa.com/bb.csv', http_params={ 'auth': ('john', 'pwd') } ) # credentials are supplied through http_params instead of the url
86+
87+
# Basic Auth And disable verification of SSL certificate eg: testing
88+
up = { 'auth': ('john', 'pwd') , 'verify' : False}
89+
df = pd.read_csv('https://aa.com/bb.csv', http_params=up ) # credentials and the verify flag are supplied through http_params
90+
91+
# Optionally, A requests.Session() can also be passed into http_params
92+
import requests
93+
s = requests.Session()
94+
s.auth = MyAuthProvider('secret-key') # custom auth provider supported by requests
95+
df = pd.read_csv(url, http_params=s)
96+
97+
# For advanced users, this may provide extensibility. However, testing on pandas side is limited to basic scenarios
98+
# here is an example of advanced scenario
99+
s = requests.Session()
100+
s.auth = ('darth', 'l0rd') # if user wants to perform basic auth. Skip if the url itself contains username and pwd
101+
s.timeout = (3.05, 27) # if user wants to modify timeout
102+
s.verify = False # if user wants to disable ssl cert verification
103+
s.headers.update( {'User-Agent': 'Custom user agent'} ) # extensible to set any custom header needed
104+
s.proxies = { 'http': 'http://a.com:100'} # if user has proxies
105+
s.cert = '/path/client.cert' # if custom cert is needed
106+
df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
107+
108+
def print_http_status(r, *args, **kwargs):
109+
print(r.status_code)
110+
print(r.headers['Content-Length'])
111+
s = requests.Session()
112+
s.hooks = dict(response=print_http_status)
113+
df = pd.read_csv( 'https://aa.com/bbb.csv', http_params=s)
114+
115+
66116
.. _whatsnew_0220.enhancements.other:
67117

68118
Other Enhancements

pandas/io/common.py

+115-3
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
])
2929

3030

31+
try:
32+
import requests
33+
_REQUESTS_INSTALLED = True
34+
except ImportError:
35+
_REQUESTS_INSTALLED = False
36+
37+
3138
if compat.PY3:
3239
from urllib.request import urlopen, pathname2url
3340
_urlopen = urlopen
@@ -168,8 +175,87 @@ def _stringify_path(filepath_or_buffer):
168175
return filepath_or_buffer
169176

170177

178+
def _is_handled_by_requests(o):
    """
    Tell whether ``o`` is a URL that python-requests should fetch.

    ``o`` qualifies when ``_is_url`` accepts it and its parsed scheme is
    ``http`` or ``https``. When ``_is_url`` rejects ``o``, that falsy
    result is returned unchanged.
    """
    url_check = _is_url(o)
    if not url_check:
        return url_check
    scheme = parse_url(o).scheme
    return scheme in ['http', 'https']
180+
181+
182+
def gen_session(http_params):
    """
    Build the python-requests session used to fetch a URL.

    Parameters
    ----------
    http_params : dict, requests.Session or None
        * A ``requests.Session`` (or subclass) is used as-is.
        * A dict may carry ``'auth'`` (anything requests accepts as
          ``Session.auth``) and ``'verify'`` (only an explicit ``False``
          is applied, disabling SSL certificate verification).
        * Anything else yields a fresh default session.

    Returns
    -------
    requests.Session
        The caller's session when one was passed in, otherwise a new one.
        In both cases ``stream`` is set to True and the ``Accept-Encoding``
        header is cleared, so a caller-supplied session is modified in
        place.
    """
    # isinstance (rather than an exact type match) so that Session
    # subclasses, e.g. sessions carrying custom auth, are honoured instead
    # of being silently swapped for a fresh unauthenticated session.
    if isinstance(http_params, requests.sessions.Session):
        session = http_params
    else:
        session = requests.Session()
        if isinstance(http_params, dict):
            auth = http_params.get('auth', None)
            if auth:
                session.auth = auth
            # Only an explicit False turns certificate verification off.
            if http_params.get('verify', True) is False:
                session.verify = False

    session.stream = True
    # Setting accept-encoding to None for backwards compatibility with
    # urlopen: urlopen does not decompress automatically, requests does.
    # Ideally we would allow gzip downloads.
    session.headers.update({'Accept-Encoding': None})
    return session
202+
203+
204+
def fetch_url(url, http_params=None, skip_requests=False):
    """
    Fetch ``url`` and return the response object together with its body.

    python-requests is used only when ``http_params`` is non-empty, the
    library is installed, ``skip_requests`` is False and the URL scheme is
    http(s). In every other case the URL is opened with urllib.
    Note that when the requests library is used, the ``Accept-Encoding``
    header is cleared for backwards compatibility with urlopen.

    Parameters
    ----------
    url : str
        Could be:
            'http://cnn.com'
            'file:///home/sky/aaa.csv'

    http_params : dict or requests.Session(), default None
        A python dict containing:
            'auth': tuple (str, str) eg (username, password)
            'auth': Any other auth object accepted by requests
            'verify': boolean, default True
                If False, allow self signed and invalid SSL cert for https
        or
        A python requests.Session object if http(s) path to enable basic auth
        and many other scenarios that requests allows

        .. versionadded:: 0.22.0

    skip_requests : boolean, default False
        for testing - disable `requests` library. Internal use only

        .. versionadded:: 0.22.0

    Returns
    -------
    resp : response object
        The ``requests`` response, or the object returned by urlopen. Its
        body has already been consumed; callers read ``resp.headers``.
    content_bytes : bytes
        The full response body.

    Raises
    ------
    ValueError
        If http_params is specified but python-requests is not installed
        (or was disabled through ``skip_requests``).

    Any error raised by ``resp.raise_for_status()`` for a failed http(s)
    request propagates to the caller.
    """
    if not http_params:
        # Without http_params there is nothing requests-specific to apply,
        # so keep the historical urllib code path.
        skip_requests = True

    use_requests = (not skip_requests and
                    _REQUESTS_INSTALLED and
                    _is_handled_by_requests(url))

    if use_requests:
        session = gen_session(http_params)
        try:
            resp = session.get(url)
            resp.raise_for_status()
            # NOTE(review): ``resp.content`` is decoded by requests; if a
            # server sends ``Content-Encoding: gzip`` despite the cleared
            # Accept-Encoding header, callers that gunzip based on that
            # header may decompress twice - confirm.
            content_bytes = resp.content
        finally:
            # Release the connection pool of a session created here; a
            # session supplied by the caller stays open for reuse.
            if session is not http_params:
                session.close()
        return resp, content_bytes

    if http_params and (skip_requests or not _REQUESTS_INSTALLED):
        msg = ('To utilize http_params, python-requests library is '
               'required but not detected')
        raise ValueError(msg)

    resp = _urlopen(url)
    try:
        content_bytes = resp.read()
    finally:
        # The body is fully read, so the underlying handle can be released;
        # ``resp.headers`` remains available to callers afterwards.
        resp.close()
    return resp, content_bytes
254+
255+
171256
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
172-
compression=None):
257+
compression=None, http_params=None,
258+
skip_requests=False):
173259
"""
174260
If the filepath_or_buffer is a url, translate and return the buffer.
175261
Otherwise passthrough.
@@ -180,19 +266,45 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
180266
or buffer
181267
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
182268
269+
compression : str, default None
270+
indicate the compression such as 'gzip'.
271+
272+
http_params : dict or requests.Session(), default None
273+
A python dict containing:
274+
'auth': tuple (str, str) eg (username, password)
275+
'auth': Any other auth object accepted by requests
276+
'verify': boolean, default True
277+
If False, allow self signed and invalid SSL cert for https
278+
or
279+
A python requests.Session object if http(s) path to enable basic auth
280+
and many other scenarios that requests allows
281+
282+
.. versionadded:: 0.22.0
283+
284+
skip_requests : boolean, default False
285+
for testing - disable `requests` library Internal use only
286+
287+
.. versionadded:: 0.22.0
288+
183289
Returns
184290
-------
185291
a filepath_or_buffer, the encoding, the compression
292+
293+
Raises
294+
------
295+
ValueError if http_params specified without installed python-requests pkg
186296
"""
187297
filepath_or_buffer = _stringify_path(filepath_or_buffer)
188298

189299
if _is_url(filepath_or_buffer):
190-
req = _urlopen(filepath_or_buffer)
300+
req, content_bytes = fetch_url(filepath_or_buffer,
301+
http_params,
302+
skip_requests)
303+
reader = BytesIO(content_bytes)
191304
content_encoding = req.headers.get('Content-Encoding', None)
192305
if content_encoding == 'gzip':
193306
# Override compression based on Content-Encoding header
194307
compression = 'gzip'
195-
reader = BytesIO(req.read())
196308
return reader, encoding, compression
197309

198310
if _is_s3_url(filepath_or_buffer):

pandas/io/excel.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@
1515
is_integer, is_float,
1616
is_bool, is_list_like)
1717

18+
from pandas.compat import BytesIO
1819
from pandas.core.frame import DataFrame
1920
from pandas.io.parsers import TextParser
2021
from pandas.errors import EmptyDataError
21-
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
22+
from pandas.io.common import (_is_url, fetch_url, _validate_header_arg,
2223
get_filepath_or_buffer, _NA_VALUES,
2324
_stringify_path)
2425
from pandas.core.indexes.period import Period
@@ -263,7 +264,9 @@ def __init__(self, io, **kwds):
263264
# If io is a url, want to keep the data as bytes so can't pass
264265
# to get_filepath_or_buffer()
265266
if _is_url(self._io):
266-
io = _urlopen(self._io)
267+
rs = kwds.get('http_params', None)
268+
req, content = fetch_url(self._io, http_params=rs)
269+
io = BytesIO(content)
267270
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
268271
io, _, _ = get_filepath_or_buffer(self._io)
269272

pandas/io/html.py

+46-9
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from pandas.core.dtypes.common import is_list_like
1616
from pandas.errors import EmptyDataError
17-
from pandas.io.common import (_is_url, urlopen,
17+
from pandas.io.common import (_is_url, fetch_url,
1818
parse_url, _validate_header_arg)
1919
from pandas.io.parsers import TextParser
2020
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
@@ -116,20 +116,31 @@ def _get_skiprows(skiprows):
116116
type(skiprows).__name__)
117117

118118

119-
def _read(obj):
119+
def _read(obj, http_params=None):
120120
"""Try to read from a url, file or string.
121121
122122
Parameters
123123
----------
124124
obj : str, unicode, or file-like
125125
126+
http_params : dict or requests.Session(), default None
127+
A python dict containing:
128+
'auth': tuple (str, str) eg (username, password)
129+
'auth': Any other auth object accepted by requests
130+
'verify': boolean, default True
131+
If False, allow self signed and invalid SSL certs for https
132+
or
133+
A python requests.Session object if http(s) path to enable basic auth
134+
and many other scenarios that requests allows
135+
136+
.. versionadded:: 0.22.0
137+
126138
Returns
127139
-------
128140
raw_text : str
129141
"""
130142
if _is_url(obj):
131-
with urlopen(obj) as url:
132-
text = url.read()
143+
req, text = fetch_url(obj, http_params)
133144
elif hasattr(obj, 'read'):
134145
text = obj.read()
135146
elif isinstance(obj, char_types):
@@ -172,6 +183,24 @@ class _HtmlFrameParser(object):
172183
A dictionary of valid table attributes to use to search for table
173184
elements.
174185
186+
encoding : str or None, optional
187+
The encoding used to decode the web page. Defaults to ``None``. ``None``
188+
preserves the previous encoding behavior, which depends on the
189+
underlying parser library (e.g., the parser library will try to use
190+
the encoding provided by the document).
191+
192+
http_params : dict or requests.Session(), default None
193+
A python dict containing:
194+
'auth': tuple (str, str) eg (username, password)
195+
'auth': Any other auth object accepted by requests
196+
'verify': boolean, default True
197+
If False, allow self signed and invalid SSL cert for https
198+
or
199+
A python requests.Session object if http(s) path to enable basic auth
200+
and many other scenarios that requests allows
201+
202+
.. versionadded:: 0.22.0
203+
175204
Notes
176205
-----
177206
To subclass this class effectively you must override the following methods:
@@ -187,11 +216,12 @@ class _HtmlFrameParser(object):
187216
functionality.
188217
"""
189218

190-
def __init__(self, io, match, attrs, encoding):
219+
def __init__(self, io, match, attrs, encoding, http_params=None):
191220
self.io = io
192221
self.match = match
193222
self.attrs = attrs
194223
self.encoding = encoding
224+
self.http_params = http_params
195225

196226
def parse_tables(self):
197227
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -444,7 +474,7 @@ def _parse_tables(self, doc, match, attrs):
444474
return result
445475

446476
def _setup_build_doc(self):
447-
raw_text = _read(self.io)
477+
raw_text = _read(self.io, self.http_params)
448478
if not raw_text:
449479
raise ValueError('No text parsed from document: {doc}'
450480
.format(doc=self.io))
@@ -737,7 +767,8 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
737767
retained = None
738768
for flav in flavor:
739769
parser = _parser_dispatch(flav)
740-
p = parser(io, compiled_match, attrs, encoding)
770+
p = parser(io, compiled_match, attrs, encoding,
771+
http_params=kwargs.get('http_params', None))
741772

742773
try:
743774
tables = p.parse_tables()
@@ -773,7 +804,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
773804
skiprows=None, attrs=None, parse_dates=False,
774805
tupleize_cols=None, thousands=',', encoding=None,
775806
decimal='.', converters=None, na_values=None,
776-
keep_default_na=True):
807+
keep_default_na=True, http_params=None):
777808
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
778809
779810
Parameters
@@ -877,6 +908,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
877908
878909
.. versionadded:: 0.19.0
879910
911+
http_params : dict or requests.Session(), default None
912+
A python dict ('auth', 'verify') or a python requests.Session object if http(s) path to enable basic auth
913+
and many other scenarios that requests allows
914+
915+
.. versionadded:: 0.22.0
916+
880917
Returns
881918
-------
882919
dfs : list of DataFrames
@@ -924,4 +961,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
924961
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
925962
thousands=thousands, attrs=attrs, encoding=encoding,
926963
decimal=decimal, converters=converters, na_values=na_values,
927-
keep_default_na=keep_default_na)
964+
keep_default_na=keep_default_na, http_params=http_params)

pandas/io/json/json.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,8 @@ def write(self):
182182
def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
183183
convert_axes=True, convert_dates=True, keep_default_dates=True,
184184
numpy=False, precise_float=False, date_unit=None, encoding=None,
185-
lines=False, chunksize=None, compression='infer'):
185+
lines=False, chunksize=None, compression='infer',
186+
http_params=None):
186187
"""
187188
Convert a JSON string to pandas object
188189
@@ -290,6 +291,18 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
290291
291292
.. versionadded:: 0.21.0
292293
294+
http_params : dict or requests.Session(), default None
295+
A python dict containing:
296+
'auth': tuple (str, str) eg (username, password)
297+
'auth': Any other auth object accepted by requests
298+
'verify': boolean, Default True
299+
If False, allow self signed and invalid SSL certs for https
300+
or
301+
A python requests.Session object if http(s) path to enable basic auth
302+
and many other scenarios that requests allows
303+
304+
.. versionadded:: 0.22.0
305+
293306
Returns
294307
-------
295308
result : Series or DataFrame, depending on the value of `typ`.
@@ -350,6 +363,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
350363
compression = _infer_compression(path_or_buf, compression)
351364
filepath_or_buffer, _, compression = get_filepath_or_buffer(
352365
path_or_buf, encoding=encoding, compression=compression,
366+
http_params=http_params
353367
)
354368

355369
json_reader = JsonReader(

0 commit comments

Comments
 (0)