use requests when it is installed #28874


Closed · wants to merge 5 commits
Changes from 2 commits
42 changes: 35 additions & 7 deletions pandas/io/common.py
@@ -183,13 +183,46 @@ def is_gcs_url(url) -> bool:
    return parse_url(url).scheme in ["gcs", "gs"]


+def _urlopen(*args, **kwargs):
+    compression = None
+    content_encoding = None
+    try:
+        import requests
Contributor: need to use the pandas.compat._optional.import_optional_dependency


+        url = args[0]
+        session = kwargs.pop("session", None)
+        if session:
+            if not isinstance(session, requests.sessions.Session):
+                raise ValueError(
+                    "Expected a requests.sessions.Session object, "
+                    "got {!r}".format(session)
+                )
+            r = session.get(url)
+        else:
+            r = requests.get(url)
+        r.raise_for_status()
+        content = r.content
+        r.close()
+    except ImportError:
Member: can you use pandas.compat._optional.import_optional_dependency here

Author: Not sure if I got it right.
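
A minimal sketch of what both reviewers are suggesting, assuming the raise_on_missing keyword that import_optional_dependency accepted in the pandas 0.25 era (the _urlopen_sketch name is made up for illustration, not part of this diff):

from io import BytesIO

from pandas.compat._optional import import_optional_dependency


def _urlopen_sketch(url, session=None):
    # With raise_on_missing=False this returns the requests module when it
    # is available and None when it is not, instead of raising ImportError.
    requests = import_optional_dependency("requests", raise_on_missing=False)
    compression = None
    if requests is not None:
        r = session.get(url) if session is not None else requests.get(url)
        r.raise_for_status()
        content = r.content  # requests already decodes gzip transport encoding
        r.close()
    else:
        import urllib.request

        r = urllib.request.urlopen(url)
        content = r.read()
        if r.headers.get("Content-Encoding") == "gzip":
            compression = "gzip"
    return BytesIO(content), compression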

+        r = urlopen(*args, **kwargs)
+        content = r.read()
+        content_encoding = r.headers.get("Content-Encoding", None)
+    if content_encoding == "gzip":
Contributor: I think this also needs to be under the except ImportError? From what I can tell, requests' .content will automatically decode gzip: https://requests.readthedocs.io/en/master/user/quickstart/#binary-response-content
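
The claim is easy to verify; a quick check (httpbin.org is only an example endpoint):

import requests

r = requests.get("https://httpbin.org/gzip")
print(r.headers.get("Content-Encoding"))  # "gzip"
print(r.content[:1])  # b"{" (decoded JSON, not the gzip magic b"\x1f\x8b")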

+        # Override compression based on Content-Encoding header.
+        compression = "gzip"
+    reader = BytesIO(content)
+    return reader, compression


def urlopen(*args, **kwargs):
    """
    Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    the stdlib.
    """
    import urllib.request

+    # Discard the session argument; only the requests path understands it.
+    _ = kwargs.pop("session", None)
+
    return urllib.request.urlopen(*args, **kwargs)
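
Note the asymmetry the diff introduces: _urlopen returns a (BytesIO, compression) pair, while urlopen still returns a file-like response, so call sites cannot treat the two interchangeably. A usage sketch of the new helper (URL and header are placeholders):

import requests

from pandas.io.common import _urlopen

session = requests.Session()
session.headers["Authorization"] = "Bearer <token>"  # hypothetical auth header
reader, compression = _urlopen("https://example.com/data.csv", session=session)
# compression is None on the requests path; it can only become "gzip" on the
# urllib fallback, which inspects the Content-Encoding header itself.
data = reader.read()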


@@ -198,6 +231,7 @@ def get_filepath_or_buffer(
    encoding: Optional[str] = None,
    compression: Optional[str] = None,
    mode: Optional[str] = None,
+    session=None,
):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
@@ -221,13 +255,7 @@
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
-        req = urlopen(filepath_or_buffer)
-        content_encoding = req.headers.get("Content-Encoding", None)
-        if content_encoding == "gzip":
-            # Override compression based on Content-Encoding header
-            compression = "gzip"
-        reader = BytesIO(req.read())
-        req.close()
+        reader, compression = _urlopen(filepath_or_buffer, session=session)
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
6 changes: 3 additions & 3 deletions pandas/io/excel/_base.py
@@ -20,7 +20,7 @@
    _stringify_path,
    _validate_header_arg,
    get_filepath_or_buffer,
-    urlopen,
+    _urlopen,
Member: can you use the non-private version

Author: Done.

)
from pandas.io.excel._util import (
    _fill_mi_header,
@@ -336,10 +336,10 @@ def read_excel(


class _BaseExcelReader(metaclass=abc.ABCMeta):
-    def __init__(self, filepath_or_buffer):
+    def __init__(self, filepath_or_buffer, session=None):
Author: I'm a bit lost on how to pass the session object to this class; right now it defaults to None because it does not get the value from read_excel.
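
One way to answer the question above is to thread the keyword through ExcelFile, which is what constructs the reader; a rough sketch under that assumption (simplified names, not part of this diff):

from pandas.io.excel._xlrd import _XlrdReader


class ExcelFile:
    # simplified: the real class keeps a registry of engine -> reader class
    _engines = {"xlrd": _XlrdReader}

    def __init__(self, io, engine=None, session=None):
        engine = engine or "xlrd"
        # forward session so _BaseExcelReader.__init__ actually receives it
        self._reader = self._engines[engine](io, session=session)


def read_excel(io, engine=None, session=None, **kwds):
    # hypothetical: read_excel would pass session along when it wraps io
    if not isinstance(io, ExcelFile):
        io = ExcelFile(io, engine=engine, session=session)
    ...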

        # If filepath_or_buffer is a url, load the data into a BytesIO
        if _is_url(filepath_or_buffer):
-            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
+            filepath_or_buffer, _ = _urlopen(filepath_or_buffer, session=session)
        elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

18 changes: 10 additions & 8 deletions pandas/io/html.py
@@ -17,7 +17,7 @@

from pandas import Series

-from pandas.io.common import _is_url, _validate_header_arg, urlopen
+from pandas.io.common import _is_url, _urlopen, _validate_header_arg
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser

@@ -109,7 +109,7 @@ def _get_skiprows(skiprows):
)


-def _read(obj):
+def _read(obj, session=None):
"""
Try to read from a url, file or string.

Expand All @@ -122,8 +122,7 @@ def _read(obj):
raw_text : str
"""
    if _is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
+        text, _ = _urlopen(obj, session=session)
    elif hasattr(obj, "read"):
        text = obj.read()
    elif isinstance(obj, (str, bytes)):
@@ -199,12 +198,13 @@ class _HtmlFrameParser:
    functionality.
    """

-    def __init__(self, io, match, attrs, encoding, displayed_only):
+    def __init__(self, io, match, attrs, encoding, displayed_only, session=None):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
+        self.session = session

    def parse_tables(self):
        """
@@ -588,7 +588,7 @@ def _parse_tfoot_tr(self, table):
return table.select("tfoot tr")

def _setup_build_doc(self):
raw_text = _read(self.io)
raw_text = _read(self.io, self.session)
if not raw_text:
raise ValueError("No text parsed from document: {doc}".format(doc=self.io))
return raw_text
@@ -714,7 +714,7 @@ def _build_doc(self):

        try:
            if _is_url(self.io):
-                with urlopen(self.io) as f:
+                with _urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
@@ -891,9 +891,10 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):

    # hack around python 3 deleting the exception variable
    retained = None
+    session = kwargs.get("session", None)
    for flav in flavor:
        parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only, session)

        try:
            tables = p.parse_tables()
@@ -943,6 +944,7 @@ def read_html(
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
+    session=None,
):
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
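With the keyword plumbed through, a configured session would let read_html fetch pages that need cookies or authentication; for example (URL and credentials are placeholders):

import pandas as pd
import requests

session = requests.Session()
session.auth = ("user", "secret")  # hypothetical protected endpoint
tables = pd.read_html("https://example.com/report.html", session=session)
print(len(tables))
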
3 changes: 2 additions & 1 deletion pandas/io/json/_json.py
@@ -366,6 +366,7 @@ def read_json(
    lines=False,
    chunksize=None,
    compression="infer",
+    session=None,
):
"""
Convert a JSON string to pandas object.
@@ -582,7 +583,7 @@

    compression = _infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
-        path_or_buf, encoding=encoding, compression=compression
+        path_or_buf, encoding=encoding, compression=compression, session=session
    )

    json_reader = JsonReader(
8 changes: 7 additions & 1 deletion pandas/io/parsers.py
@@ -342,6 +342,9 @@
    values. The options are `None` for the ordinary converter,
    `high` for the high-precision converter, and `round_trip` for the
    round-trip converter.
+session : requests.Session
+    Object with a requests session configuration for remote files
+    (requires the requests library).
Contributor: add a version added tag

Author: Not sure what that means.
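
For reference, a "version added tag" is the numpydoc/Sphinx directive recording the first release that ships a parameter; in this docstring it would look roughly like the following (the release number is a guess):

session : requests.Session
    Object with a requests session configuration for remote files
    (requires the requests library).

    .. versionadded:: 1.0.0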


Returns
-------
@@ -423,6 +426,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    encoding = re.sub("_", "-", encoding).lower()
    kwds["encoding"] = encoding

+    session = kwds.get("session", None)
    compression = kwds.get("compression", "infer")
    compression = _infer_compression(filepath_or_buffer, compression)

@@ -431,7 +435,7 @@
    # though mypy handling of conditional imports is difficult.
    # See https://github.com/python/mypy/issues/1297
    fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression
+        filepath_or_buffer, encoding, compression, session=session
    )
    kwds["compression"] = compression

@@ -588,6 +592,7 @@ def parser_f(
    low_memory=_c_parser_defaults["low_memory"],
    memory_map=False,
    float_precision=None,
+    session=None,
):

    # gh-23761
@@ -674,6 +679,7 @@ def parser_f(
        mangle_dupe_cols=mangle_dupe_cols,
        infer_datetime_format=infer_datetime_format,
        skip_blank_lines=skip_blank_lines,
+        session=session,
    )

    return _read(filepath_or_buffer, kwds)
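
End to end, the behavior this PR is after would look like the following from the user's side (URL and token are placeholders):

import pandas as pd
import requests

session = requests.Session()
session.headers.update({"Authorization": "Bearer <token>"})

# with requests installed, the session's configuration applies to the download;
# without it, pandas falls back to urllib and the keyword is quietly dropped
df = pd.read_csv("https://example.com/private/data.csv", session=session)
print(df.head())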