From 2031cfe5c5cc45dce3a19c91afad1947616f9614 Mon Sep 17 00:00:00 2001 From: Filipe Fernandes Date: Wed, 9 Oct 2019 14:27:27 -0400 Subject: [PATCH 1/5] use requests when it is installed --- pandas/io/common.py | 40 +++++++++++++++++++++++++++++++++------- pandas/io/excel/_base.py | 6 +++--- pandas/io/html.py | 18 ++++++++++-------- pandas/io/json/_json.py | 3 ++- pandas/io/parsers.py | 8 +++++++- 5 files changed, 55 insertions(+), 20 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 2ca2007e2925f..a9551a4e36018 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -183,6 +183,37 @@ def is_gcs_url(url) -> bool: return parse_url(url).scheme in ["gcs", "gs"] +def _urlopen(*args, **kwargs): + compression = None + content_encoding = None + try: + import requests + + url = args[0] + session = kwargs.pop("session", None) + if session: + if not isinstance(session, requests.sessions.Session): + raise ValueError( + "Expected a requests.sessions.Session object, " + "got {!r}".format(session) + ) + r = session.get(url) + else: + r = requests.get(url) + r.raise_for_status() + content = r.content + r.close() + except ImportError: + r = urlopen(*args, **kwargs) + content = r.read() + content_encoding = r.headers.get("Content-Encoding", None) + if content_encoding == "gzip": + # Override compression based on Content-Encoding header. + compression = "gzip" + reader = BytesIO(content) + return reader, compression + + def urlopen(*args, **kwargs): """ Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of @@ -198,6 +229,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, + session=None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. 
@@ -221,13 +253,7 @@ def get_filepath_or_buffer( filepath_or_buffer = _stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): - req = urlopen(filepath_or_buffer) - content_encoding = req.headers.get("Content-Encoding", None) - if content_encoding == "gzip": - # Override compression based on Content-Encoding header - compression = "gzip" - reader = BytesIO(req.read()) - req.close() + reader, compression = _urlopen(filepath_or_buffer, session=session) return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 039a0560af627..67de860a9877c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -20,7 +20,7 @@ _stringify_path, _validate_header_arg, get_filepath_or_buffer, - urlopen, + _urlopen, ) from pandas.io.excel._util import ( _fill_mi_header, @@ -336,10 +336,10 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, session=None): # If filepath_or_buffer is a url, load the data into a BytesIO if _is_url(filepath_or_buffer): - filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + filepath_or_buffer, _ = _urlopen(filepath_or_buffer, session=session) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) diff --git a/pandas/io/html.py b/pandas/io/html.py index 490c574463b9b..4ba79452201d1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,7 +17,7 @@ from pandas import Series -from pandas.io.common import _is_url, _validate_header_arg, urlopen +from pandas.io.common import _is_url, _urlopen, _validate_header_arg from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -109,7 +109,7 @@ def _get_skiprows(skiprows): ) -def _read(obj): +def _read(obj, session=None): """ Try to read from a url, file or string. @@ -122,8 +122,7 @@ def _read(obj): raw_text : str """ if _is_url(obj): - with urlopen(obj) as url: - text = url.read() + text, _ = _urlopen(obj, session=session) elif hasattr(obj, "read"): text = obj.read() elif isinstance(obj, (str, bytes)): @@ -199,12 +198,13 @@ class _HtmlFrameParser: functionality. 
""" - def __init__(self, io, match, attrs, encoding, displayed_only): + def __init__(self, io, match, attrs, encoding, displayed_only, session=None): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only + self.session = session def parse_tables(self): """ @@ -588,7 +588,7 @@ def _parse_tfoot_tr(self, table): return table.select("tfoot tr") def _setup_build_doc(self): - raw_text = _read(self.io) + raw_text = _read(self.io, self.session) if not raw_text: raise ValueError("No text parsed from document: {doc}".format(doc=self.io)) return raw_text @@ -714,7 +714,7 @@ def _build_doc(self): try: if _is_url(self.io): - with urlopen(self.io) as f: + with _urlopen(self.io) as f: r = parse(f, parser=parser) else: # try to parse the input in the simplest way @@ -891,9 +891,10 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): # hack around python 3 deleting the exception variable retained = None + session = kwargs.get("session", None) for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only) + p = parser(io, compiled_match, attrs, encoding, displayed_only, session) try: tables = p.parse_tables() @@ -943,6 +944,7 @@ def read_html( na_values=None, keep_default_na=True, displayed_only=True, + session=None, ): r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 73f4985e201f1..4821f369f1bd3 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -366,6 +366,7 @@ def read_json( lines=False, chunksize=None, compression="infer", + session=None, ): """ Convert a JSON string to pandas object. @@ -582,7 +583,7 @@ def read_json( compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression + path_or_buf, encoding=encoding, compression=compression, session=session ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3678e32943b2e..d008277263d7c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -342,6 +342,9 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. +session : requests.Session + object with the a requests session configuration for remote file. + (requires the requests library) Returns ------- @@ -423,6 +426,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding + session = kwds.get("session", None) compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) @@ -431,7 +435,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297
     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression
+        filepath_or_buffer, encoding, compression, session=session
     )
     kwds["compression"] = compression

@@ -588,6 +592,7 @@ def parser_f(
         low_memory=_c_parser_defaults["low_memory"],
         memory_map=False,
         float_precision=None,
+        session=None,
     ):

         # gh-23761
@@ -674,6 +679,7 @@ def parser_f(
             mangle_dupe_cols=mangle_dupe_cols,
             infer_datetime_format=infer_datetime_format,
             skip_blank_lines=skip_blank_lines,
+            session=session,
         )

         return _read(filepath_or_buffer, kwds)

From 95e3b756c5b59f70b15847421aa01b896fb8c5a7 Mon Sep 17 00:00:00 2001
From: Filipe Fernandes
Date: Wed, 9 Oct 2019 15:05:30 -0400
Subject: [PATCH 2/5] pop session out before calling urllib.request.urlopen

---
 pandas/io/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index a9551a4e36018..5834712c3b37d 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -221,6 +221,8 @@ def urlopen(*args, **kwargs):
     """
     import urllib.request

+    _ = kwargs.pop("session", None)
+
     return urllib.request.urlopen(*args, **kwargs)

From ac39c2e815956eb96eb21feee836bb1d5e1f7ae2 Mon Sep 17 00:00:00 2001
From: Filipe Fernandes
Date: Thu, 10 Oct 2019 11:58:59 -0400
Subject: [PATCH 3/5] no session obj for now

---
 pandas/io/common.py | 33 ++++++---------------------------
 pandas/io/excel/_base.py | 7 +++----
 pandas/io/html.py | 17 +++++++----------
 pandas/io/json/_json.py | 3 +--
 pandas/io/parsers.py | 8 +-------
 5 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 5834712c3b37d..bbe939a5aa457 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -183,28 +183,20 @@ def is_gcs_url(url) -> bool:
     return parse_url(url).scheme in ["gcs", "gs"]

-def _urlopen(*args, **kwargs):
+def urlopen(*args, **kwargs):
     compression = None
     content_encoding = None
     try:
         import requests

-        url = args[0]
-        session = kwargs.pop("session", None)
-        if session:
-            if not isinstance(session, requests.sessions.Session):
-                raise ValueError(
-                    "Expected a requests.sessions.Session object, "
-                    "got {!r}".format(session)
-                )
-            r = session.get(url)
-        else:
-            r = requests.get(url)
+        r = requests.get(*args, **kwargs)
         r.raise_for_status()
         content = r.content
         r.close()
     except ImportError:
-        r = urlopen(*args, **kwargs)
+        import urllib.request
+
+        r = urllib.request.urlopen(*args, **kwargs)
         content = r.read()
         content_encoding = r.headers.get("Content-Encoding", None)
         if content_encoding == "gzip":
@@ -214,24 +206,11 @@ def _urlopen(*args, **kwargs):
             compression = "gzip"
     reader = BytesIO(content)
     return reader, compression

-def urlopen(*args, **kwargs):
-    """
-    Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
-    the stdlib.
-    """
-    import urllib.request
-
-    _ = kwargs.pop("session", None)
-
-    return urllib.request.urlopen(*args, **kwargs)
-
 def get_filepath_or_buffer(
     filepath_or_buffer: FilePathOrBuffer,
     encoding: Optional[str] = None,
     compression: Optional[str] = None,
     mode: Optional[str] = None,
-    session=None,
 ):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
@@ -255,7 +234,7 @@ def get_filepath_or_buffer( filepath_or_buffer = _stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): - reader, compression = _urlopen(filepath_or_buffer, session=session) + reader, compression = urlopen(filepath_or_buffer) return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 67de860a9877c..ae4c94dcde833 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,7 +1,6 @@ import abc from collections import OrderedDict from datetime import date, datetime, timedelta -from io import BytesIO import os from textwrap import fill @@ -20,7 +19,7 @@ _stringify_path, _validate_header_arg, get_filepath_or_buffer, - _urlopen, + urlopen, ) from pandas.io.excel._util import ( _fill_mi_header, @@ -336,10 +335,10 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, session=None): + def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if _is_url(filepath_or_buffer): - filepath_or_buffer, _ = _urlopen(filepath_or_buffer, session=session) + filepath_or_buffer, _ = urlopen(filepath_or_buffer) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) diff --git a/pandas/io/html.py b/pandas/io/html.py index 4ba79452201d1..6bb5e5436dc5a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -17,7 +17,7 @@ from pandas import Series -from pandas.io.common import _is_url, _urlopen, _validate_header_arg +from pandas.io.common import _is_url, _validate_header_arg, urlopen from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -109,7 +109,7 @@ def _get_skiprows(skiprows): ) -def _read(obj, session=None): +def _read(obj): """ Try to read from a url, file or string. @@ -122,7 +122,7 @@ def _read(obj, session=None): raw_text : str """ if _is_url(obj): - text, _ = _urlopen(obj, session=session) + text, _ = urlopen(obj) elif hasattr(obj, "read"): text = obj.read() elif isinstance(obj, (str, bytes)): @@ -198,13 +198,12 @@ class _HtmlFrameParser: functionality. 
""" - def __init__(self, io, match, attrs, encoding, displayed_only, session=None): + def __init__(self, io, match, attrs, encoding, displayed_only): self.io = io self.match = match self.attrs = attrs self.encoding = encoding self.displayed_only = displayed_only - self.session = session def parse_tables(self): """ @@ -588,7 +587,7 @@ def _parse_tfoot_tr(self, table): return table.select("tfoot tr") def _setup_build_doc(self): - raw_text = _read(self.io, self.session) + raw_text = _read(self.io) if not raw_text: raise ValueError("No text parsed from document: {doc}".format(doc=self.io)) return raw_text @@ -714,7 +713,7 @@ def _build_doc(self): try: if _is_url(self.io): - with _urlopen(self.io) as f: + with urlopen(self.io) as f: r = parse(f, parser=parser) else: # try to parse the input in the simplest way @@ -891,10 +890,9 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): # hack around python 3 deleting the exception variable retained = None - session = kwargs.get("session", None) for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding, displayed_only, session) + p = parser(io, compiled_match, attrs, encoding, displayed_only) try: tables = p.parse_tables() @@ -944,7 +942,6 @@ def read_html( na_values=None, keep_default_na=True, displayed_only=True, - session=None, ): r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4821f369f1bd3..73f4985e201f1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -366,7 +366,6 @@ def read_json( lines=False, chunksize=None, compression="infer", - session=None, ): """ Convert a JSON string to pandas object. @@ -583,7 +582,7 @@ def read_json( compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, session=session + path_or_buf, encoding=encoding, compression=compression ) json_reader = JsonReader( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d008277263d7c..3678e32943b2e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -342,9 +342,6 @@ values. The options are `None` for the ordinary converter, `high` for the high-precision converter, and `round_trip` for the round-trip converter. -session : requests.Session - object with the a requests session configuration for remote file. - (requires the requests library) Returns ------- @@ -426,7 +423,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding - session = kwds.get("session", None) compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) @@ -435,7 +431,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297
     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression, session=session
+        filepath_or_buffer, encoding, compression
     )
     kwds["compression"] = compression

@@ -592,7 +588,6 @@ def parser_f(
         low_memory=_c_parser_defaults["low_memory"],
         memory_map=False,
         float_precision=None,
-        session=None,
     ):

         # gh-23761
@@ -679,7 +674,6 @@ def parser_f(
             mangle_dupe_cols=mangle_dupe_cols,
             infer_datetime_format=infer_datetime_format,
             skip_blank_lines=skip_blank_lines,
-            session=session,
         )

         return _read(filepath_or_buffer, kwds)

From 03959aa24e5449335038567e5daca62450714e25 Mon Sep 17 00:00:00 2001
From: Filipe Fernandes
Date: Thu, 10 Oct 2019 12:07:17 -0400
Subject: [PATCH 4/5] use import_optional_dependency

---
 pandas/io/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index bbe939a5aa457..cbf385328429a 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -32,6 +32,7 @@
 import zipfile

 from pandas.compat import _get_lzma_file, _import_lzma
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (  # noqa
     AbstractMethodError,
     DtypeWarning,
@@ -187,8 +188,7 @@ def urlopen(*args, **kwargs):
     compression = None
     content_encoding = None
     try:
-        import requests
-
+        requests = import_optional_dependency("requests")
         r = requests.get(*args, **kwargs)
         r.raise_for_status()
         content = r.content

From 02a236506b655d6199d05a3a6770f21286f6e350 Mon Sep 17 00:00:00 2001
From: Filipe Fernandes
Date: Thu, 10 Oct 2019 14:53:05 -0400
Subject: [PATCH 5/5] document min requests version

---
 doc/source/getting_started/install.rst | 1 +
 pandas/compat/_optional.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index fc99b458fa0af..b3402345f8c1a 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -263,6 +263,7 @@
 pymysql 0.7.11 MySQL engine for sqlalchemy
 pyreadstat SPSS files (.sav) reading
 pytables 3.4.2 HDF5 reading / writing
 qtpy Clipboard I/O
+requests 2.10.0 Improves reading data from URLs
 s3fs 0.0.8 Amazon S3 access
 xarray 0.8.2 pandas-like API for N-dimensional data
 xclip Clipboard I/O on linux
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index cd4e1b7e8aa4d..7756953aadbdf 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -18,6 +18,7 @@
     "pandas_gbq": "0.8.0",
     "pyarrow": "0.9.0",
     "pytables": "3.4.2",
+    "requests": "2.10.0",
     "s3fs": "0.0.8",
     "scipy": "0.19.0",
     "sqlalchemy": "1.1.4",
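
Note on the final design: after PATCH 3/5 and PATCH 4/5, pandas.io.common.urlopen prefers requests when it is installed and falls back to urllib.request otherwise. Only the urllib branch inspects the Content-Encoding header, because requests already decodes gzip bodies transparently, and both branches return a (BytesIO, compression) pair rather than a response object, which is why call sites such as the lxml _build_doc path must unpack the tuple instead of using the result as a context manager. A minimal standalone sketch of that fallback pattern follows; the helper name fetch_url_bytes is hypothetical, and the body only mirrors the shape of the diffs above rather than reproducing the exact patched code:

    from io import BytesIO


    def fetch_url_bytes(url):
        # Hypothetical helper mirroring pandas.io.common.urlopen after
        # PATCH 3/5: prefer requests, fall back to the stdlib.
        compression = None
        try:
            import requests  # the series itself uses import_optional_dependency("requests")

            r = requests.get(url)
            r.raise_for_status()  # fail on 4xx/5xx instead of parsing an error page
            content = r.content  # requests transparently decodes gzip bodies
            r.close()
        except ImportError:
            import urllib.request

            r = urllib.request.urlopen(url)
            content = r.read()
            # urllib does not decode gzip, so report it for the caller to handle.
            if r.headers.get("Content-Encoding", None) == "gzip":
                compression = "gzip"
        return BytesIO(content), compression


    # Usage: the reader behaves like the binary file object the pandas readers expect.
    reader, compression = fetch_url_bytes("https://example.com/data.csv")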