From 52582ec8d4c75218db811e2c39e17cfc21c228da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 29 Aug 2019 16:13:19 -0700 Subject: [PATCH 1/4] PERF: trim import time ~5% --- pandas/io/common.py | 16 +++++++++++++--- pandas/io/excel/_base.py | 2 +- pandas/tests/io/excel/test_readers.py | 3 +-- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/test_html.py | 3 ++- pandas/util/testing.py | 22 +++++++++++++++++----- 6 files changed, 35 insertions(+), 13 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 30228d660e816..baaaaee65931b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,7 +4,6 @@ import codecs import csv import gzip -from http.client import HTTPException # noqa from io import BufferedIOBase, BytesIO import mmap import os @@ -22,7 +21,6 @@ Type, Union, ) -from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, urljoin, @@ -31,7 +29,6 @@ uses_params, uses_relative, ) -from urllib.request import pathname2url, urlopen import zipfile from pandas.compat import _get_lzma_file, _import_lzma @@ -188,6 +185,16 @@ def is_gcs_url(url) -> bool: return False +def urlopen(*args, **kwargs): + """ + Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of + the stdlib. + """ + from urllib.request import urlopen as _urlopen + + return _urlopen(*args, **kwargs) + + def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, @@ -261,6 +268,9 @@ def file_path_to_url(path: str) -> str: ------- a valid FILE URL """ + # lazify expensive import (~30ms) + from urllib.request import pathname2url + return urljoin("file:", pathname2url(path)) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 997edf49d9e8f..949eff45c0e92 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,7 +4,6 @@ from io import BytesIO import os from textwrap import fill -from urllib.request import urlopen from pandas._config import config @@ -21,6 +20,7 @@ _stringify_path, _validate_header_arg, get_filepath_or_buffer, + urlopen, ) from pandas.io.excel._util import ( _fill_mi_header, diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a39cface0e015..5326f2df68972 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -3,6 +3,7 @@ from datetime import datetime, time from functools import partial import os +from urllib.error import URLError import warnings import numpy as np @@ -14,8 +15,6 @@ from pandas import DataFrame, Index, MultiIndex, Series import pandas.util.testing as tm -from pandas.io.common import URLError - @contextlib.contextmanager def ignore_xlrd_time_clock_warning(): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e04535df56663..4a79211a34819 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -11,6 +11,7 @@ import os import platform from tempfile import TemporaryFile +from urllib.error import URLError import numpy as np import pytest @@ -21,7 +22,6 @@ from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas.util.testing as tm -from pandas.io.common import URLError from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 615e2735cd288..183d217eb09d6 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -4,6 +4,7 @@ import os import re import threading +from urllib.error import URLError import numpy as np from numpy.random import rand @@ -17,7 +18,7 @@ import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network -from pandas.io.common import URLError, file_path_to_url +from pandas.io.common import file_path_to_url import pandas.io.html from pandas.io.html import read_html diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 0d543f891a5f6..c54dab046f57e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -4,7 +4,6 @@ from datetime import datetime from functools import wraps import gzip -import http.client import os import re from shutil import rmtree @@ -2275,11 +2274,17 @@ def dec(f): # But some tests (test_data yahoo) contact incredibly flakey # servers. -# and conditionally raise on these exception types -_network_error_classes = (IOError, http.client.HTTPException, TimeoutError) +# and conditionally raise on exception types in _get_default_network_errors -def can_connect(url, error_classes=_network_error_classes): +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): """Try to connect to the given url. True if succeeds, False if IOError raised @@ -2294,6 +2299,10 @@ def can_connect(url, error_classes=_network_error_classes): Return True if no IOError (unable to connect) or URLError (bad url) was raised """ + + if error_classes is None: + error_classes = _get_default_network_errors() + try: with urlopen(url): pass @@ -2309,7 +2318,7 @@ def network( url="http://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, - error_classes=_network_error_classes, + error_classes=None, skip_errnos=_network_errno_vals, _skip_on_messages=_network_error_messages, ): @@ -2397,6 +2406,9 @@ def network( """ from pytest import skip + if error_classes is None: + error_classes = _get_default_network_errors() + t.network = True @wraps(t) From 97eb06ba821dd1fb26f05433890127c79f3020f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 08:11:53 -0700 Subject: [PATCH 2/4] update per comments --- ci/code_checks.sh | 2 +- pandas/io/common.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 333136ddfddd9..90e1a2aa33e97 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -203,7 +203,7 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then import sys import pandas -blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis', +blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2' 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} mods = blacklist & set(m.split('.')[0] for m in sys.modules) diff --git a/pandas/io/common.py b/pandas/io/common.py index baaaaee65931b..ac8dee8467370 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -190,9 +190,9 @@ def urlopen(*args, **kwargs): Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of the stdlib. """ - from urllib.request import urlopen as _urlopen + import urllib.request - return _urlopen(*args, **kwargs) + return urllib.request.urlopen(*args, **kwargs) def get_filepath_or_buffer( From 7baec3e57d2211e797b94555931b1379a65d7dea Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 30 Aug 2019 15:32:58 -0700 Subject: [PATCH 3/4] update check, disallow urllib.request --- ci/code_checks.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 90e1a2aa33e97..acc00e8f8662c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -205,8 +205,9 @@ import pandas blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2' 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', - 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} -mods = blacklist & set(m.split('.')[0] for m in sys.modules) + 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} +import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules) +mods = blacklist & import_mods if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) From dd7be9dea9eb4256fc9c4af71fed9291ad309131 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2019 15:50:26 -0700 Subject: [PATCH 4/4] comment --- ci/code_checks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index acc00e8f8662c..2b564be12da3e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -206,6 +206,9 @@ import pandas blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2' 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} + +# GH#28227 for some of these check for top-level modules, while others are +# more specific (e.g. urllib.request) import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules) mods = blacklist & import_mods if mods: