Skip to content

Commit 84ac451

Browse files
author
Sky NSS
committed
Use python-requests if available. Unit tests not tested without requests library
Use url_params instead of requests session. Removed option of prepared requests to simplify. test case updated to handle absence of requests library updated comments and exception Minor fix to test case
1 parent f9a552d commit 84ac451

File tree

7 files changed

+449
-23
lines changed

7 files changed

+449
-23
lines changed

doc/source/whatsnew/v0.21.0.txt

+56
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,62 @@ Other Enhancements
7979
- :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
8080
- :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
8181

82+
.. _whatsnew_0210.enhancements.read_csv:
83+
84+
``read_csv`` use `python-requests` (if installed) to support basic auth and much more
85+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
86+
87+
If the `python-requests` library is installed, it is tried first for http(s) URLs; otherwise pandas continues to use urllib.
88+
The :meth:`DataFrame.read_csv`, :meth:`DataFrame.read_html`, :meth:`DataFrame.read_json`,
89+
:meth:`DataFrame.read_excel` now accept an optional ``url_params`` argument: either a dict with ``auth`` and/or ``verify`` keys, or a ``requests.Session()``
90+
91+
92+
.. ipython:: python
93+
import pandas as pd
94+
from requests import Request, Session
95+
96+
# url_params is an optional parameter
97+
df = pd.read_csv('https://uname:pwd@aa.com/bb.csv') # now the url can contain username and pwd
98+
99+
# custom auth can be implemented
100+
s = Session()
101+
s.auth = MyAuthProvider('secret-key') # custom auth provider supported by requests
102+
df = pd.read_csv(url, url_params=s)
103+
104+
# optional advanced scenarios: basic auth, timeout, disable ssl certificate verification, proxy, etc
105+
s = Session()
106+
s.auth = ('darth', 'l0rd') # basic auth; skip this if the url itself contains username and pwd
107+
s.timeout = (3.05, 27) # if user wants to modify timeout
108+
s.verify = False # if user wants to disable ssl cert verification
109+
s.headers.update( {'User-Agent': 'Custom user agent'} ) # extensible to set any custom header needed
110+
s.proxies = { 'http': 'http://a.com:100'} # if user has proxies
111+
s.cert = '/path/client.cert' # if custom cert is needed
112+
df = pd.read_csv( 'https://aa.com/bbb.csv', url_params=s)
113+
114+
# support verbs other than 'GET' such as 'POST' using requests.PreparedRequest
115+
r = Request('POST', 'http://joker:pwd@nlp_service.api/email_sentiment_extract?out=json')
116+
prepped = r.prepare()
117+
prepped.body = 'from: [email protected]\nto: [email protected]\nsubject:Complaint letter\n\nbody: I am feeling :(' # multiple lines
118+
df = pd.read_json( prepped) # minor update pandas code to detect type(Request) and submit it using requests session in lieu of URL.
119+
"""
120+
[{
121+
'from': '[email protected]',
122+
123+
'email_type': 'complaint',
124+
'sentiment': 'unhappy',
125+
}]
126+
"""
127+
128+
# Event hooks callback (eg log http status codes or other callback related functions)
129+
def print_http_status(r, *args, **kwargs):
130+
print(r.status_code)
131+
print(r.headers['Content-Length'])
132+
s = Session()
133+
s.hooks = dict(response=print_http_status)
134+
df = pd.read_csv( 'https://aa.com/bbb.csv', url_params=s)
135+
136+
137+
82138
.. _whatsnew_0210.api_breaking:
83139

84140
Backwards incompatible API changes

pandas/io/common.py

+117-3
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@
4747
except:
4848
_PY_PATH_INSTALLED = False
4949

50+
try:
51+
import requests
52+
_PY_REQUESTS_INSTALLED = True
53+
except ImportError:
54+
_PY_REQUESTS_INSTALLED = False
55+
5056

5157
if compat.PY3:
5258
from urllib.request import urlopen, pathname2url
@@ -93,6 +99,11 @@ def __next__(self):
9399
BaseIterator.next = lambda self: self.__next__()
94100

95101

102+
def is_requests_pkg_avail():
103+
"""Return the module-level flag recording whether ``import requests`` succeeded when this module was loaded."""
104+
return _PY_REQUESTS_INSTALLED
105+
106+
96107
def _is_url(url):
97108
"""Check to see if a URL has a valid protocol.
98109
@@ -176,8 +187,85 @@ def _stringify_path(filepath_or_buffer):
176187
return filepath_or_buffer
177188

178189

190+
def _is_handled_by_requests(o):
    """Tell whether ``o`` is a url whose scheme is http or https."""
    url_check = _is_url(o)
    if not url_check:
        # Not a url at all: hand back the falsy result unchanged.
        return url_check
    scheme = parse_url(o).scheme
    return scheme in ['http', 'https']
192+
193+
194+
def gen_session(url_params):
    """
    Generate python-requests session from url_params.

    Parameters
    ----------
    url_params : dict, requests.Session or None
        If a ``requests.Session`` instance (subclasses included), that
        session is reused and configured in place.
        If a dict, a new session is created and the optional keys
        ``'auth'`` and ``'verify'`` are applied to it.
        Any other value results in a new session with default settings.

    Returns
    -------
    s : requests.Session
        Session with ``stream`` enabled and the ``Accept-Encoding`` header
        set to None.
    """
    # isinstance (rather than an exact type match) so that user supplied
    # Session subclasses are honoured instead of being silently replaced.
    if isinstance(url_params, requests.sessions.Session):
        s = url_params
    else:
        s = requests.Session()
    s.stream = True
    # Setting accept-encoding to None for backwards compatibility with
    # urlopen. Ideally we want to allow gzip download, but
    # urlopen doesn't decompress automatically whereas requests does.
    s.headers.update({'Accept-Encoding': None})
    if isinstance(url_params, dict):
        auth = url_params.get('auth', None)
        if auth and not s.auth:
            s.auth = auth
        # Only an explicit ``False`` disables certificate verification.
        if url_params.get('verify', True) is False and s.verify is not False:
            s.verify = False
    return s
214+
215+
216+
def fetch_url(url, url_params=None, skip_requests=False):
    """
    Fetch ``url`` and return the response object along with its raw body.

    If url is an http(s) url, first try python-requests else try urllib.
    Note if requests library is used, auto gunzip is
    disabled for backwards compatibility of code with urlopen

    Parameters
    ----------
    url : str
        Could be:
            'http://cnn.com'
            'file:///home/sky/aaa.csv'

    url_params : dict or requests.Session(), default None
        A python dict containing:
            'auth': tuple (str, str) eg (uname, pwd)
            'auth': Any other auth object accepted by requests
            'verify': boolean, Default True
                If False, allow self signed and invalid SSL cert for https
        or
        A python requests.Session object if http(s) path to enable basic auth
        and many other scenarios that requests allows

        Only used when the url is fetched through requests; for any other
        url it is ignored as long as requests is installed.

        .. versionadded:: 0.21.0

    skip_requests : boolean, default False
        for testing - disable `requests` library Internal use only

        .. versionadded:: 0.21.0

    Returns
    -------
    resp : response object
        The ``requests`` response, or the object returned by ``_urlopen``
        (already closed; its ``headers`` remain available).
    content_bytes : bytes
        The complete response body.

    Raises
    ------
    ValueError if url_params specified without installed python-requests pkg
    """
    if is_requests_pkg_avail() and \
            _is_handled_by_requests(url) and \
            (not skip_requests):
        s = gen_session(url_params)
        try:
            resp = s.get(url)
            resp.raise_for_status()
            content_bytes = resp.content
        finally:
            # Release the connection pool of a session created just for
            # this call; a session supplied by the caller stays open.
            if s is not url_params:
                s.close()
        return resp, content_bytes

    if url_params and (skip_requests or not is_requests_pkg_avail()):
        msg = ('To utilize url_params, python-requests library is '
               'required but not detected')
        raise ValueError(msg)
    resp = _urlopen(url)
    try:
        content_bytes = resp.read()
    finally:
        # The body has been read in full, so the handle can be released.
        resp.close()
    return resp, content_bytes
265+
179266
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
180-
compression=None):
267+
compression=None, url_params=None,
268+
skip_requests=False):
181269
"""
182270
If the filepath_or_buffer is a url, translate and return the buffer.
183271
Otherwise passthrough.
@@ -188,19 +276,45 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
188276
or buffer
189277
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
190278
279+
compression : str, default None
280+
indicate the compression such as 'gzip'.
281+
282+
url_params : dict or requests.Session(), default None
283+
A python dict containing:
284+
'auth': tuple (str, str) eg (unae, pwd)
285+
'auth': Any other auth object accepted by requests
286+
'verify': boolean, Default True
287+
If False, allow self signed and invalid SSL cert for https
288+
or
289+
A python requests.Session object if http(s) path to enable basic auth
290+
and many other scenarios that requests allows
291+
292+
.. versionadded:: 0.21.0
293+
294+
skip_requests : boolean, default False
295+
for testing - disable `requests` library Internal use only
296+
297+
.. versionadded:: 0.21.0
298+
191299
Returns
192300
-------
193301
a filepath_or_buffer, the encoding, the compression
302+
303+
Raises
304+
------
305+
ValueError if url_params specified without installed python-requests pkg
194306
"""
195307
filepath_or_buffer = _stringify_path(filepath_or_buffer)
196308

197309
if _is_url(filepath_or_buffer):
198-
req = _urlopen(filepath_or_buffer)
310+
req, content_bytes = fetch_url(filepath_or_buffer,
311+
url_params,
312+
skip_requests)
313+
reader = BytesIO(content_bytes)
199314
content_encoding = req.headers.get('Content-Encoding', None)
200315
if content_encoding == 'gzip':
201316
# Override compression based on Content-Encoding header
202317
compression = 'gzip'
203-
reader = BytesIO(req.read())
204318
return reader, encoding, compression
205319

206320
if _is_s3_url(filepath_or_buffer):

pandas/io/excel.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
is_integer, is_float,
1616
is_bool, is_list_like)
1717

18+
from pandas.compat import BytesIO
1819
from pandas.core.frame import DataFrame
1920
from pandas.io.parsers import TextParser
2021
from pandas.errors import EmptyDataError
21-
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
22-
get_filepath_or_buffer, _NA_VALUES,
23-
_stringify_path)
22+
from pandas.io.common import (_is_url, fetch_url,
23+
_validate_header_arg, get_filepath_or_buffer,
24+
_NA_VALUES, _stringify_path)
2425
from pandas.core.indexes.period import Period
2526
import pandas._libs.json as json
2627
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -211,7 +212,8 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
211212
"Use just `sheet_name`")
212213

213214
if not isinstance(io, ExcelFile):
214-
io = ExcelFile(io, engine=engine)
215+
io = ExcelFile(io, engine=engine,
216+
url_params=kwds.get('url_params', None))
215217

216218
return io._parse_excel(
217219
sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
@@ -258,8 +260,10 @@ def __init__(self, io, **kwds):
258260

259261
# If io is a url, want to keep the data as bytes so can't pass
260262
# to get_filepath_or_buffer()
261-
if _is_url(self._io):
262-
io = _urlopen(self._io)
263+
if _is_url(self.io):
264+
rs = kwds.get('url_params', None)
265+
req, content = fetch_url(self.io, url_params=rs)
266+
io = BytesIO(content)
263267
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
264268
io, _, _ = get_filepath_or_buffer(self._io)
265269

pandas/io/html.py

+28-9
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from pandas.core.dtypes.common import is_list_like
1616
from pandas.errors import EmptyDataError
17-
from pandas.io.common import (_is_url, urlopen,
17+
from pandas.io.common import (_is_url, fetch_url,
1818
parse_url, _validate_header_arg)
1919
from pandas.io.parsers import TextParser
2020
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
@@ -116,20 +116,31 @@ def _get_skiprows(skiprows):
116116
type(skiprows).__name__)
117117

118118

119-
def _read(obj):
119+
def _read(obj, url_params=None):
120120
"""Try to read from a url, file or string.
121121
122122
Parameters
123123
----------
124124
obj : str, unicode, or file-like
125125
126+
url_params : dict or requests.Session(), default None
127+
A python dict containing:
128+
'auth': tuple (str, str) eg (unae, pwd)
129+
'auth': Any other auth object accepted by requests
130+
'verify': boolean, Default True
131+
If False, allow self signed and invalid SSL certs for https
132+
or
133+
A python requests.Session object if http(s) path to enable basic auth
134+
and many other scenarios that requests allows
135+
136+
.. versionadded:: 0.21.0
137+
126138
Returns
127139
-------
128140
raw_text : str
129141
"""
130142
if _is_url(obj):
131-
with urlopen(obj) as url:
132-
text = url.read()
143+
req, text = fetch_url(obj, url_params)
133144
elif hasattr(obj, 'read'):
134145
text = obj.read()
135146
elif isinstance(obj, char_types):
@@ -187,11 +198,12 @@ class _HtmlFrameParser(object):
187198
functionality.
188199
"""
189200

190-
def __init__(self, io, match, attrs, encoding):
201+
def __init__(self, io, match, attrs, encoding, url_params=None):
191202
self.io = io
192203
self.match = match
193204
self.attrs = attrs
194205
self.encoding = encoding
206+
self.url_params = url_params
195207

196208
def parse_tables(self):
197209
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -444,7 +456,7 @@ def _parse_tables(self, doc, match, attrs):
444456
return result
445457

446458
def _setup_build_doc(self):
447-
raw_text = _read(self.io)
459+
raw_text = _read(self.io, self.url_params)
448460
if not raw_text:
449461
raise ValueError('No text parsed from document: %s' % self.io)
450462
return raw_text
@@ -731,7 +743,8 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
731743
retained = None
732744
for flav in flavor:
733745
parser = _parser_dispatch(flav)
734-
p = parser(io, compiled_match, attrs, encoding)
746+
p = parser(io, compiled_match, attrs, encoding,
747+
url_params=kwargs.get('url_params', None))
735748

736749
try:
737750
tables = p.parse_tables()
@@ -755,7 +768,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
755768
skiprows=None, attrs=None, parse_dates=False,
756769
tupleize_cols=False, thousands=',', encoding=None,
757770
decimal='.', converters=None, na_values=None,
758-
keep_default_na=True):
771+
keep_default_na=True, url_params=None):
759772
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
760773
761774
Parameters
@@ -856,6 +869,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
856869
857870
.. versionadded:: 0.19.0
858871
872+
url_params : requests.Session(), default None
873+
A python requests.Session object if http(s) path to enable basic auth
874+
and many other scenarios that requests allows
875+
876+
.. versionadded:: 0.21.0
877+
859878
Returns
860879
-------
861880
dfs : list of DataFrames
@@ -903,4 +922,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
903922
parse_dates=parse_dates, tupleize_cols=tupleize_cols,
904923
thousands=thousands, attrs=attrs, encoding=encoding,
905924
decimal=decimal, converters=converters, na_values=na_values,
906-
keep_default_na=keep_default_na)
925+
keep_default_na=keep_default_na, url_params=url_params)

0 commit comments

Comments
 (0)