-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit 31a98cb
committed
Auto merge of #122059 - nyurik:with-as-const-str, r=cuviper
Optimize write with as_const_str for shorter code
Following up on #121001
Apparently this code generates a significant block of code for each call to `write()` with a non-simple format string — approximately 100 lines of assembly, possibly due to `dyn` (?). See the generated assembly code [here](https://github.com/nyurik/rust-optimize-format-str/compare/before-changes..with-my-change#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477):
<details><summary>Details</summary>
<p>
This is the inlining of `write!(buffer, "Iteration {value} was written")`
```asm
core::fmt::Write::write_fmt:
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 194
fn write_fmt(&mut self, args: Arguments<'_>) -> Result {
push r15
push r14
push r13
push r12
push rbx
mov rdx, rsi
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 427
match (self.pieces, self.args) {
mov rcx, qword ptr [rsi + 8]
mov rax, qword ptr [rsi + 24]
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 428
([], []) => Some(""),
cmp rcx, 1
je .LBB0_8
test rcx, rcx
jne .LBB0_9
test rax, rax
jne .LBB0_9
// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
self.buf.reserve(self.len, additional);
lea r12, [rdi + 16]
lea rsi, [rip + .L__unnamed_2]
xor ebx, ebx
.LBB0_6:
mov r14, qword ptr [r12]
jmp .LBB0_7
.LBB0_8:
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
([s], []) => Some(s),
test rax, rax
je .LBB0_4
.LBB0_9:
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 1108
if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) }
lea rsi, [rip + .L__unnamed_1]
pop rbx
pop r12
pop r13
pop r14
pop r15
jmp qword ptr [rip + core::fmt::write_internal@GOTPCREL]
.LBB0_4:
mov rax, qword ptr [rdx]
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
([s], []) => Some(s),
mov rsi, qword ptr [rax]
mov rbx, qword ptr [rax + 8]
// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 248
if T::IS_ZST { usize::MAX } else { self.cap.0 }
mov rax, qword ptr [rdi]
// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
self.buf.reserve(self.len, additional);
mov r14, qword ptr [rdi + 16]
// /home/nyurik/dev/rust/rust/library/core/src/num/mod.rs : 1281
uint_impl! {
sub rax, r14
// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 392
additional > self.capacity().wrapping_sub(len)
cmp rax, rbx
// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 309
if self.needs_to_grow(len, additional) {
jb .LBB0_5
.LBB0_7:
mov rax, qword ptr [rdi + 8]
// /home/nyurik/dev/rust/rust/library/core/src/ptr/mut_ptr.rs : 1046
unsafe { intrinsics::offset(self, count) }
add rax, r14
mov r15, rdi
// /home/nyurik/dev/rust/rust/library/core/src/intrinsics.rs : 2922
copy_nonoverlapping(src, dst, count)
mov rdi, rax
mov rdx, rbx
call qword ptr [rip + memcpy@GOTPCREL]
// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 2040
self.len += count;
add r14, rbx
mov qword ptr [r15 + 16], r14
// /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 216
}
xor eax, eax
pop rbx
pop r12
pop r13
pop r14
pop r15
ret
.LBB0_5:
// /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
self.buf.reserve(self.len, additional);
lea r12, [rdi + 16]
mov r15, rdi
mov r13, rsi
// /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 310
do_reserve_and_handle(self, len, additional);
mov rsi, r14
mov rdx, rbx
call alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle
mov rsi, r13
mov rdi, r15
jmp .LBB0_6
```
</p>
</details>
```rust
#[inline]
pub fn write(output: &mut dyn Write, args: Arguments<'_>) -> Result {
if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) }
}
```
So, this brings back the older experiment, in which I used an `if core::intrinsics::is_val_statically_known(s.is_some()) { s } else { None }` helper function and called it in multiple places that used `write`. This is not as optimal, because now every user of `write` must perform this check itself, but at least it results in significantly smaller assembly code for the formatting case, and in code identical to the current output for the "simple" (no formatting) case. See the [assembly comparison](https://github.com/nyurik/rust-optimize-format-str/compare/with-my-change..with-as-const-str#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477) between the current state and what this change brings (focus only on the `fmt/intel-lib.txt` and `str/intel-lib.txt` files).
```rust
if let Some(s) = args.as_const_str() {
self.write_str(s)
} else {
write(self, args)
}
```
File tree
0 file changed
+0
-0
lines changed
Filter options
0 file changed
+0
-0
lines changed
0 commit comments