Skip to content

Commit ba35026

Browse files
authored
[mypyc] Switch to table-driven imports for smaller IR (python#14917)
Add CPyImport_ImportFromMany() which imports a module and a tuple of names, placing them in the globals dict in one go. Previously, each name would imported and placed in globals manually in IR, leading to some pretty verbose code. The other option to collect all from imports and perform them all at once in the helper would remove even more ops, however, it has some major downsides: - It wouldn't be able to be done in IRBuild directly, instead being handled in the prebuild visitor and codegen... which all sounds really involved. - It would cause from imports to be performed eagerly, potentially causing circular imports (especially in functions whose imports are probably there to avoid a circular import!). The latter is the nail in the coffin for this idea. --- Change how imports (not from imports!) are processed so they can be table-driven (tuple-driven, really) and compact. Here's how it works: Import nodes are divided in groups (in the prebuild visitor). Each group consists of consecutive Import nodes: import mod <| group 1 import mod2 | def foo() -> None: import mod3 <- group 2 (*) import mod4 <- group 3 Every time we encounter the first import of a group, build IR to call CPyImport_ImportMany() that will perform all of the group's imports in one go. (*) Imports in functions or classes are still transformed into the original, verbose IR as speed is more important than codesize. Previously, each module would imported and placed in globals manually in IR, leading to some pretty verbose code. The other option to collect all imports and perform them all at once in the helper would remove even more ops, however, it's problematic for the same reasons from the previous commit (spoiler: it's not safe). Implementation notes: - I had to add support for loading the address of a static directly, so I shoehorned in LoadStatic support for LoadAddress. - Unfortunately by replacing multiple import nodes with a single function call at the IR level, if any import within a group fails, the traceback line number is static and will be probably wrong (pointing to the first import node in my original impl.). To fix this, I had to make CPyImport_ImportMany() add the traceback entry itself on failure (instead of letting codegen handle it automatically). This is admittedly ugly. Overall, this doesn't speed up initialization. The only real speed impact is that back to back imports in a tight loop seems to be 10-20% slower. I believe that's acceptable given the code size reduction. --- **Other changes:** - Don't declare internal static for non-compiled modules It won't be read anywhere as the internal statics are only used to avoid runaway recursion with import cycles in our module init functions. - Wrap long RArray initializers and long annotations in codegen Table-driven imports can load some rather large RArrays and tuple literals so this was needed to keep the generated C readable. - Add LLBuilder helper for setting up a RArray Resolves mypyc/mypyc#591.
1 parent aee983e commit ba35026

19 files changed

+954
-622
lines changed

mypyc/codegen/emit.py

+57-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
from __future__ import annotations
44

5+
import pprint
56
import sys
7+
import textwrap
68
from typing import Callable
79
from typing_extensions import Final
810

@@ -191,10 +193,31 @@ def reg(self, reg: Value) -> str:
191193
def attr(self, name: str) -> str:
192194
return ATTR_PREFIX + name
193195

194-
def emit_line(self, line: str = "") -> None:
196+
def object_annotation(self, obj: object, line: str) -> str:
197+
"""Build a C comment with an object's string represention.
198+
199+
If the comment exceeds the line length limit, it's wrapped into a
200+
multiline string (with the extra lines indented to be aligned with
201+
the first line's comment).
202+
203+
If it contains illegal characters, an empty string is returned."""
204+
line_width = self._indent + len(line)
205+
formatted = pprint.pformat(obj, compact=True, width=max(90 - line_width, 20))
206+
if any(x in formatted for x in ("/*", "*/", "\0")):
207+
return ""
208+
209+
if "\n" in formatted:
210+
first_line, rest = formatted.split("\n", maxsplit=1)
211+
comment_continued = textwrap.indent(rest, (line_width + 3) * " ")
212+
return f" /* {first_line}\n{comment_continued} */"
213+
else:
214+
return f" /* {formatted} */"
215+
216+
def emit_line(self, line: str = "", *, ann: object = None) -> None:
195217
if line.startswith("}"):
196218
self.dedent()
197-
self.fragments.append(self._indent * " " + line + "\n")
219+
comment = self.object_annotation(ann, line) if ann is not None else ""
220+
self.fragments.append(self._indent * " " + line + comment + "\n")
198221
if line.endswith("{"):
199222
self.indent()
200223

@@ -1119,3 +1142,35 @@ def _emit_traceback(
11191142
self.emit_line(line)
11201143
if DEBUG_ERRORS:
11211144
self.emit_line('assert(PyErr_Occurred() != NULL && "failure w/o err!");')
1145+
1146+
1147+
def c_array_initializer(components: list[str], *, indented: bool = False) -> str:
1148+
"""Construct an initializer for a C array variable.
1149+
1150+
Components are C expressions valid in an initializer.
1151+
1152+
For example, if components are ["1", "2"], the result
1153+
would be "{1, 2}", which can be used like this:
1154+
1155+
int a[] = {1, 2};
1156+
1157+
If the result is long, split it into multiple lines.
1158+
"""
1159+
indent = " " * 4 if indented else ""
1160+
res = []
1161+
current: list[str] = []
1162+
cur_len = 0
1163+
for c in components:
1164+
if not current or cur_len + 2 + len(indent) + len(c) < 70:
1165+
current.append(c)
1166+
cur_len += len(c) + 2
1167+
else:
1168+
res.append(indent + ", ".join(current))
1169+
current = [c]
1170+
cur_len = len(c)
1171+
if not res:
1172+
# Result fits on a single line
1173+
return "{%s}" % ", ".join(current)
1174+
# Multi-line result
1175+
res.append(indent + ", ".join(current))
1176+
return "{\n " + ",\n ".join(res) + "\n" + indent + "}"

mypyc/codegen/emitfunc.py

+18-25
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing_extensions import Final
66

77
from mypyc.analysis.blockfreq import frequently_executed_blocks
8-
from mypyc.codegen.emit import DEBUG_ERRORS, Emitter, TracebackAndGotoHandler
8+
from mypyc.codegen.emit import DEBUG_ERRORS, Emitter, TracebackAndGotoHandler, c_array_initializer
99
from mypyc.common import MODULE_PREFIX, NATIVE_PREFIX, REG_PREFIX, STATIC_PREFIX, TYPE_PREFIX
1010
from mypyc.ir.class_ir import ClassIR
1111
from mypyc.ir.func_ir import FUNC_CLASSMETHOD, FUNC_STATICMETHOD, FuncDecl, FuncIR, all_values
@@ -262,12 +262,12 @@ def visit_assign_multi(self, op: AssignMulti) -> None:
262262
# RArray values can only be assigned to once, so we can always
263263
# declare them on initialization.
264264
self.emit_line(
265-
"%s%s[%d] = {%s};"
265+
"%s%s[%d] = %s;"
266266
% (
267267
self.emitter.ctype_spaced(typ.item_type),
268268
dest,
269269
len(op.src),
270-
", ".join(self.reg(s) for s in op.src),
270+
c_array_initializer([self.reg(s) for s in op.src], indented=True),
271271
)
272272
)
273273

@@ -282,15 +282,12 @@ def visit_load_error_value(self, op: LoadErrorValue) -> None:
282282

283283
def visit_load_literal(self, op: LoadLiteral) -> None:
284284
index = self.literals.literal_index(op.value)
285-
s = repr(op.value)
286-
if not any(x in s for x in ("/*", "*/", "\0")):
287-
ann = " /* %s */" % s
288-
else:
289-
ann = ""
290285
if not is_int_rprimitive(op.type):
291-
self.emit_line("%s = CPyStatics[%d];%s" % (self.reg(op), index, ann))
286+
self.emit_line("%s = CPyStatics[%d];" % (self.reg(op), index), ann=op.value)
292287
else:
293-
self.emit_line("%s = (CPyTagged)CPyStatics[%d] | 1;%s" % (self.reg(op), index, ann))
288+
self.emit_line(
289+
"%s = (CPyTagged)CPyStatics[%d] | 1;" % (self.reg(op), index), ann=op.value
290+
)
294291

295292
def get_attr_expr(self, obj: str, op: GetAttr | SetAttr, decl_cl: ClassIR) -> str:
296293
"""Generate attribute accessor for normal (non-property) access.
@@ -468,12 +465,7 @@ def visit_load_static(self, op: LoadStatic) -> None:
468465
name = self.emitter.static_name(op.identifier, op.module_name, prefix)
469466
if op.namespace == NAMESPACE_TYPE:
470467
name = "(PyObject *)%s" % name
471-
ann = ""
472-
if op.ann:
473-
s = repr(op.ann)
474-
if not any(x in s for x in ("/*", "*/", "\0")):
475-
ann = " /* %s */" % s
476-
self.emit_line(f"{dest} = {name};{ann}")
468+
self.emit_line(f"{dest} = {name};", ann=op.ann)
477469

478470
def visit_init_static(self, op: InitStatic) -> None:
479471
value = self.reg(op.value)
@@ -636,12 +628,7 @@ def visit_extend(self, op: Extend) -> None:
636628

637629
def visit_load_global(self, op: LoadGlobal) -> None:
638630
dest = self.reg(op)
639-
ann = ""
640-
if op.ann:
641-
s = repr(op.ann)
642-
if not any(x in s for x in ("/*", "*/", "\0")):
643-
ann = " /* %s */" % s
644-
self.emit_line(f"{dest} = {op.identifier};{ann}")
631+
self.emit_line(f"{dest} = {op.identifier};", ann=op.ann)
645632

646633
def visit_int_op(self, op: IntOp) -> None:
647634
dest = self.reg(op)
@@ -727,7 +714,13 @@ def visit_get_element_ptr(self, op: GetElementPtr) -> None:
727714
def visit_load_address(self, op: LoadAddress) -> None:
728715
typ = op.type
729716
dest = self.reg(op)
730-
src = self.reg(op.src) if isinstance(op.src, Register) else op.src
717+
if isinstance(op.src, Register):
718+
src = self.reg(op.src)
719+
elif isinstance(op.src, LoadStatic):
720+
prefix = self.PREFIX_MAP[op.src.namespace]
721+
src = self.emitter.static_name(op.src.identifier, op.src.module_name, prefix)
722+
else:
723+
src = op.src
731724
self.emit_line(f"{dest} = ({typ._ctype})&{src};")
732725

733726
def visit_keep_alive(self, op: KeepAlive) -> None:
@@ -776,8 +769,8 @@ def c_error_value(self, rtype: RType) -> str:
776769
def c_undefined_value(self, rtype: RType) -> str:
777770
return self.emitter.c_undefined_value(rtype)
778771

779-
def emit_line(self, line: str) -> None:
780-
self.emitter.emit_line(line)
772+
def emit_line(self, line: str, *, ann: object = None) -> None:
773+
self.emitter.emit_line(line, ann=ann)
781774

782775
def emit_lines(self, *lines: str) -> None:
783776
self.emitter.emit_lines(*lines)

mypyc/codegen/emitmodule.py

+13-43
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from mypy.plugin import Plugin, ReportConfigContext
2727
from mypy.util import hash_digest
2828
from mypyc.codegen.cstring import c_string_initializer
29-
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration
29+
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
3030
from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
3131
from mypyc.codegen.emitfunc import generate_native_function, native_function_header
3232
from mypyc.codegen.emitwrapper import (
@@ -296,11 +296,11 @@ def compile_ir_to_c(
296296
# compiled into a separate extension module.
297297
ctext: dict[str | None, list[tuple[str, str]]] = {}
298298
for group_sources, group_name in groups:
299-
group_modules = [
300-
(source.module, modules[source.module])
299+
group_modules = {
300+
source.module: modules[source.module]
301301
for source in group_sources
302302
if source.module in modules
303-
]
303+
}
304304
if not group_modules:
305305
ctext[group_name] = []
306306
continue
@@ -465,7 +465,7 @@ def group_dir(group_name: str) -> str:
465465
class GroupGenerator:
466466
def __init__(
467467
self,
468-
modules: list[tuple[str, ModuleIR]],
468+
modules: dict[str, ModuleIR],
469469
source_paths: dict[str, str],
470470
group_name: str | None,
471471
group_map: dict[str, str | None],
@@ -512,7 +512,7 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
512512
multi_file = self.use_shared_lib and self.multi_file
513513

514514
# Collect all literal refs in IR.
515-
for _, module in self.modules:
515+
for module in self.modules.values():
516516
for fn in module.functions:
517517
collect_literals(fn, self.context.literals)
518518

@@ -528,7 +528,7 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
528528

529529
self.generate_literal_tables()
530530

531-
for module_name, module in self.modules:
531+
for module_name, module in self.modules.items():
532532
if multi_file:
533533
emitter = Emitter(self.context)
534534
emitter.emit_line(f'#include "__native{self.short_group_suffix}.h"')
@@ -582,7 +582,7 @@ def generate_c_for_modules(self) -> list[tuple[str, str]]:
582582
declarations.emit_line("int CPyGlobalsInit(void);")
583583
declarations.emit_line()
584584

585-
for module_name, module in self.modules:
585+
for module_name, module in self.modules.items():
586586
self.declare_finals(module_name, module.final_names, declarations)
587587
for cl in module.classes:
588588
generate_class_type_decl(cl, emitter, ext_declarations, declarations)
@@ -790,7 +790,7 @@ def generate_shared_lib_init(self, emitter: Emitter) -> None:
790790
"",
791791
)
792792

793-
for mod, _ in self.modules:
793+
for mod in self.modules:
794794
name = exported_name(mod)
795795
emitter.emit_lines(
796796
f"extern PyObject *CPyInit_{name}(void);",
@@ -1023,12 +1023,13 @@ def module_internal_static_name(self, module_name: str, emitter: Emitter) -> str
10231023
return emitter.static_name(module_name + "_internal", None, prefix=MODULE_PREFIX)
10241024

10251025
def declare_module(self, module_name: str, emitter: Emitter) -> None:
1026-
# We declare two globals for each module:
1026+
# We declare two globals for each compiled module:
10271027
# one used internally in the implementation of module init to cache results
10281028
# and prevent infinite recursion in import cycles, and one used
10291029
# by other modules to refer to it.
1030-
internal_static_name = self.module_internal_static_name(module_name, emitter)
1031-
self.declare_global("CPyModule *", internal_static_name, initializer="NULL")
1030+
if module_name in self.modules:
1031+
internal_static_name = self.module_internal_static_name(module_name, emitter)
1032+
self.declare_global("CPyModule *", internal_static_name, initializer="NULL")
10321033
static_name = emitter.static_name(module_name, None, prefix=MODULE_PREFIX)
10331034
self.declare_global("CPyModule *", static_name)
10341035
self.simple_inits.append((static_name, "Py_None"))
@@ -1126,37 +1127,6 @@ def collect_literals(fn: FuncIR, literals: Literals) -> None:
11261127
literals.record_literal(op.value)
11271128

11281129

1129-
def c_array_initializer(components: list[str]) -> str:
1130-
"""Construct an initializer for a C array variable.
1131-
1132-
Components are C expressions valid in an initializer.
1133-
1134-
For example, if components are ["1", "2"], the result
1135-
would be "{1, 2}", which can be used like this:
1136-
1137-
int a[] = {1, 2};
1138-
1139-
If the result is long, split it into multiple lines.
1140-
"""
1141-
res = []
1142-
current: list[str] = []
1143-
cur_len = 0
1144-
for c in components:
1145-
if not current or cur_len + 2 + len(c) < 70:
1146-
current.append(c)
1147-
cur_len += len(c) + 2
1148-
else:
1149-
res.append(", ".join(current))
1150-
current = [c]
1151-
cur_len = len(c)
1152-
if not res:
1153-
# Result fits on a single line
1154-
return "{%s}" % ", ".join(current)
1155-
# Multi-line result
1156-
res.append(", ".join(current))
1157-
return "{\n " + ",\n ".join(res) + "\n}"
1158-
1159-
11601130
def c_string_array_initializer(components: list[bytes]) -> str:
11611131
result = []
11621132
result.append("{\n")

mypyc/ir/ops.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1348,13 +1348,14 @@ class LoadAddress(RegisterOp):
13481348
Attributes:
13491349
type: Type of the loaded address(e.g. ptr/object_ptr)
13501350
src: Source value (str for globals like 'PyList_Type',
1351-
Register for temporary values or locals)
1351+
Register for temporary values or locals, LoadStatic
1352+
for statics.)
13521353
"""
13531354

13541355
error_kind = ERR_NEVER
13551356
is_borrowed = True
13561357

1357-
def __init__(self, type: RType, src: str | Register, line: int = -1) -> None:
1358+
def __init__(self, type: RType, src: str | Register | LoadStatic, line: int = -1) -> None:
13581359
super().__init__(line)
13591360
self.type = type
13601361
self.src = src

mypyc/ir/pprint.py

+5
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,11 @@ def visit_get_element_ptr(self, op: GetElementPtr) -> str:
266266
def visit_load_address(self, op: LoadAddress) -> str:
267267
if isinstance(op.src, Register):
268268
return self.format("%r = load_address %r", op, op.src)
269+
elif isinstance(op.src, LoadStatic):
270+
name = op.src.identifier
271+
if op.src.module_name is not None:
272+
name = f"{op.src.module_name}.{name}"
273+
return self.format("%r = load_address %s :: %s", op, name, op.src.namespace)
269274
else:
270275
return self.format("%r = load_address %s", op, op.src)
271276

mypyc/irbuild/builder.py

+3-23
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@
9090
RType,
9191
RUnion,
9292
bitmap_rprimitive,
93-
c_int_rprimitive,
9493
c_pyssize_t_rprimitive,
9594
dict_rprimitive,
9695
int_rprimitive,
@@ -127,12 +126,7 @@
127126
from mypyc.primitives.dict_ops import dict_get_item_op, dict_set_item_op
128127
from mypyc.primitives.generic_ops import iter_op, next_op, py_setattr_op
129128
from mypyc.primitives.list_ops import list_get_item_unsafe_op, list_pop_last, to_list
130-
from mypyc.primitives.misc_ops import (
131-
check_unpack_count_op,
132-
get_module_dict_op,
133-
import_extra_args_op,
134-
import_op,
135-
)
129+
from mypyc.primitives.misc_ops import check_unpack_count_op, get_module_dict_op, import_op
136130
from mypyc.primitives.registry import CFunctionDescription, function_ops
137131

138132
# These int binary operations can borrow their operands safely, since the
@@ -194,6 +188,8 @@ def __init__(
194188
self.encapsulating_funcs = pbv.encapsulating_funcs
195189
self.nested_fitems = pbv.nested_funcs.keys()
196190
self.fdefs_to_decorators = pbv.funcs_to_decorators
191+
self.module_import_groups = pbv.module_import_groups
192+
197193
self.singledispatch_impls = singledispatch_impls
198194

199195
self.visitor = visitor
@@ -395,22 +391,6 @@ def add_to_non_ext_dict(
395391
key_unicode = self.load_str(key)
396392
self.call_c(dict_set_item_op, [non_ext.dict, key_unicode, val], line)
397393

398-
def gen_import_from(
399-
self, id: str, globals_dict: Value, imported: list[str], line: int
400-
) -> Value:
401-
self.imports[id] = None
402-
403-
null_dict = Integer(0, dict_rprimitive, line)
404-
names_to_import = self.new_list_op([self.load_str(name) for name in imported], line)
405-
zero_int = Integer(0, c_int_rprimitive, line)
406-
value = self.call_c(
407-
import_extra_args_op,
408-
[self.load_str(id), globals_dict, null_dict, names_to_import, zero_int],
409-
line,
410-
)
411-
self.add(InitStatic(value, id, namespace=NAMESPACE_MODULE))
412-
return value
413-
414394
def gen_import(self, id: str, line: int) -> None:
415395
self.imports[id] = None
416396

mypyc/irbuild/function.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ def gen_func_ns(builder: IRBuilder) -> str:
515515
return "_".join(
516516
info.name + ("" if not info.class_name else "_" + info.class_name)
517517
for info in builder.fn_infos
518-
if info.name and info.name != "<top level>"
518+
if info.name and info.name != "<module>"
519519
)
520520

521521

0 commit comments

Comments
 (0)