Skip to content

Commit 290f013

Browse files
hugues-affJukkaL
authored andcommitted
FindModuleCache: optionally leverage BuildSourceSet (#12616)
Given a large codebase with folder hierarchy of the form ``` foo/ company/ __init__.py foo/ bar/ company/ __init__.py bar/ baz/ company/ __init__.py baz/ ... ``` with >100 toplevel folders, the time spent in load_graph is dominated by find_module because this operation is itself O(n) where n is the number of input files, which ends up being O(n**2) because it is called for every import statement in the codebase and the way find_module work, it will always scan through each and every one of those toplevel directories for each and every import statement of company.* Introduce a fast path that leverages the fact that for imports within the code being typechecked, we already have a mapping of module import path to file path in BuildSourceSet Gated behind a command line flag (--fast-module-lookup) to assuage concerns about subtle issues in module lookup being introduced by this fast path.
1 parent aa7c21a commit 290f013

File tree

8 files changed

+283
-32
lines changed

8 files changed

+283
-32
lines changed

docs/source/command_line.rst

+23
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,29 @@ imports.
212212
By default, mypy will suppress any error messages generated within :pep:`561`
213213
compliant packages. Adding this flag will disable this behavior.
214214

215+
.. option:: --fast-module-lookup
216+
217+
The default logic used to scan through search paths to resolve imports has a
218+
quadratic worse-case behavior in some cases, which is for instance triggered
219+
by a large number of folders sharing a top-level namespace as in:
220+
221+
foo/
222+
company/
223+
foo/
224+
a.py
225+
bar/
226+
company/
227+
bar/
228+
b.py
229+
baz/
230+
company/
231+
baz/
232+
c.py
233+
...
234+
235+
If you are in this situation, you can enable an experimental fast path by
236+
setting the :option:`--fast-module-lookup` option.
237+
215238

216239
.. _platform-configuration:
217240

docs/source/running_mypy.rst

+1
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,7 @@ same directory on the search path, only the stub file is used.
516516
(However, if the files are in different directories, the one found
517517
in the earlier directory is used.)
518518

519+
519520
Other advice and best practices
520521
*******************************
521522

mypy/build.py

+4-30
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
from mypy.report import Reports # Avoid unconditional slow import
4444
from mypy.fixup import fixup_module
4545
from mypy.modulefinder import (
46-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
47-
ModuleNotFoundReason
46+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
47+
ModuleSearchResult, ModuleNotFoundReason
4848
)
4949
from mypy.nodes import Expression
5050
from mypy.options import Options
@@ -107,33 +107,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
107107
self.errors: List[str] = [] # Filled in by build if desired
108108

109109

110-
class BuildSourceSet:
111-
"""Efficiently test a file's membership in the set of build sources."""
112-
113-
def __init__(self, sources: List[BuildSource]) -> None:
114-
self.source_text_present = False
115-
self.source_modules: Set[str] = set()
116-
self.source_paths: Set[str] = set()
117-
118-
for source in sources:
119-
if source.text is not None:
120-
self.source_text_present = True
121-
elif source.path:
122-
self.source_paths.add(source.path)
123-
else:
124-
self.source_modules.add(source.module)
125-
126-
def is_source(self, file: MypyFile) -> bool:
127-
if file.path and file.path in self.source_paths:
128-
return True
129-
elif file._fullname in self.source_modules:
130-
return True
131-
elif self.source_text_present:
132-
return True
133-
else:
134-
return False
135-
136-
137110
def build(sources: List[BuildSource],
138111
options: Options,
139112
alt_lib_path: Optional[str] = None,
@@ -630,7 +603,8 @@ def __init__(self, data_dir: str,
630603
or options.use_fine_grained_cache)
631604
and not has_reporters)
632605
self.fscache = fscache
633-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
606+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
607+
source_set=self.source_set)
634608
self.metastore = create_metastore(options)
635609

636610
# a mapping from source files to their corresponding shadow files

mypy/main.py

+4
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,10 @@ def add_invertible_flag(flag: str,
881881
'--explicit-package-bases', default=False,
882882
help="Use current directory and MYPYPATH to determine module names of files passed",
883883
group=code_group)
884+
add_invertible_flag(
885+
'--fast-module-lookup', default=False,
886+
help=argparse.SUPPRESS,
887+
group=code_group)
884888
code_group.add_argument(
885889
"--exclude",
886890
action="append",

mypy/modulefinder.py

+112-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from typing_extensions import Final, TypeAlias as _TypeAlias
2424

2525
from mypy.fscache import FileSystemCache
26+
from mypy.nodes import MypyFile
2627
from mypy.options import Options
2728
from mypy.stubinfo import is_legacy_bundled_package
2829
from mypy import pyinfo
@@ -126,6 +127,33 @@ def __repr__(self) -> str:
126127
self.base_dir)
127128

128129

130+
class BuildSourceSet:
131+
"""Helper to efficiently test a file's membership in a set of build sources."""
132+
133+
def __init__(self, sources: List[BuildSource]) -> None:
134+
self.source_text_present = False
135+
self.source_modules = {} # type: Dict[str, str]
136+
self.source_paths = set() # type: Set[str]
137+
138+
for source in sources:
139+
if source.text is not None:
140+
self.source_text_present = True
141+
if source.path:
142+
self.source_paths.add(source.path)
143+
if source.module:
144+
self.source_modules[source.module] = source.path or ''
145+
146+
def is_source(self, file: MypyFile) -> bool:
147+
if file.path and file.path in self.source_paths:
148+
return True
149+
elif file._fullname in self.source_modules:
150+
return True
151+
elif self.source_text_present:
152+
return True
153+
else:
154+
return False
155+
156+
129157
class FindModuleCache:
130158
"""Module finder with integrated cache.
131159
@@ -141,8 +169,10 @@ def __init__(self,
141169
search_paths: SearchPaths,
142170
fscache: Optional[FileSystemCache],
143171
options: Optional[Options],
144-
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
172+
stdlib_py_versions: Optional[StdlibVersions] = None,
173+
source_set: Optional[BuildSourceSet] = None) -> None:
145174
self.search_paths = search_paths
175+
self.source_set = source_set
146176
self.fscache = fscache or FileSystemCache()
147177
# Cache for get_toplevel_possibilities:
148178
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -164,6 +194,53 @@ def clear(self) -> None:
164194
self.initial_components.clear()
165195
self.ns_ancestors.clear()
166196

197+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
198+
"""Fast path to find modules by looking through the input sources
199+
200+
This is only used when --fast-module-lookup is passed on the command line."""
201+
if not self.source_set:
202+
return None
203+
204+
p = self.source_set.source_modules.get(id, None)
205+
if p and self.fscache.isfile(p):
206+
# We need to make sure we still have __init__.py all the way up
207+
# otherwise we might have false positives compared to slow path
208+
# in case of deletion of init files, which is covered by some tests.
209+
# TODO: are there some combination of flags in which this check should be skipped?
210+
d = os.path.dirname(p)
211+
for _ in range(id.count('.')):
212+
if not any(self.fscache.isfile(os.path.join(d, '__init__' + x))
213+
for x in PYTHON_EXTENSIONS):
214+
return None
215+
d = os.path.dirname(d)
216+
return p
217+
218+
idx = id.rfind('.')
219+
if idx != -1:
220+
# When we're looking for foo.bar.baz and can't find a matching module
221+
# in the source set, look up for a foo.bar module.
222+
parent = self.find_module_via_source_set(id[:idx])
223+
if parent is None or not isinstance(parent, str):
224+
return None
225+
226+
basename, ext = os.path.splitext(parent)
227+
if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS)
228+
and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))):
229+
# If we do find such a *module* (and crucially, we don't want a package,
230+
# hence the filtering out of __init__ files, and checking for the presence
231+
# of a folder with a matching name), then we can be pretty confident that
232+
# 'baz' will either be a top-level variable in foo.bar, or will not exist.
233+
#
234+
# Either way, spelunking in other search paths for another 'foo.bar.baz'
235+
# module should be avoided because:
236+
# 1. in the unlikely event that one were found, it's highly likely that
237+
# it would be unrelated to the source being typechecked and therefore
238+
# more likely to lead to erroneous results
239+
# 2. as described in _find_module, in some cases the search itself could
240+
# potentially waste significant amounts of time
241+
return ModuleNotFoundReason.NOT_FOUND
242+
return None
243+
167244
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
168245
"""Find which elements of a lib_path have the directory a module needs to exist.
169246
@@ -229,7 +306,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
229306
elif top_level in self.stdlib_py_versions:
230307
use_typeshed = self._typeshed_has_version(top_level)
231308
self.results[id] = self._find_module(id, use_typeshed)
232-
if (not fast_path
309+
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
233310
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
234311
and self._can_find_module_in_parent_dir(id)):
235312
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
@@ -295,6 +372,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
295372
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
296373
fscache = self.fscache
297374

375+
# Fast path for any modules in the current source set.
376+
# This is particularly important when there are a large number of search
377+
# paths which share the first (few) component(s) due to the use of namespace
378+
# packages, for instance:
379+
# foo/
380+
# company/
381+
# __init__.py
382+
# foo/
383+
# bar/
384+
# company/
385+
# __init__.py
386+
# bar/
387+
# baz/
388+
# company/
389+
# __init__.py
390+
# baz/
391+
#
392+
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
393+
# and computes [foo, bar, baz, ...] as the module search path.
394+
#
395+
# This would result in O(n) search for every import of company.*, leading to
396+
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
397+
# at least once, and usually many more times than that, in each and every file
398+
# being parsed.
399+
#
400+
# Thankfully, such cases are efficiently handled by looking up the module path
401+
# via BuildSourceSet.
402+
p = (self.find_module_via_source_set(id)
403+
if (self.options is not None and self.options.fast_module_lookup)
404+
else None)
405+
if p:
406+
return p
407+
298408
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
299409
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
300410
# that only once and cache it for when we look for modules like 'foo.bar.blah'

mypy/options.py

+2
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ def __init__(self) -> None:
293293
self.cache_map: Dict[str, Tuple[str, str]] = {}
294294
# Don't properly free objects on exit, just kill the current process.
295295
self.fast_exit = True
296+
# fast path for finding modules from source set
297+
self.fast_module_lookup = False
296298
# Used to transform source code before parsing if not None
297299
# TODO: Make the type precise (AnyStr -> AnyStr)
298300
self.transform_source: Optional[Callable[[Any], Any]] = None

mypy/test/testcheck.py

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
'check-multiple-inheritance.test',
4141
'check-super.test',
4242
'check-modules.test',
43+
'check-modules-fast.test',
4344
'check-typevar-values.test',
4445
'check-unsupported.test',
4546
'check-unreachable-code.test',

0 commit comments

Comments
 (0)