Commit 6a70556

Auto merge of rust-lang#94412 - scottmcm:cfg-out-miri-from-swap, r=oli-obk

For MIRI, cfg out the swap vectorization logic from 94212

Because of rust-lang#69488, the swap logic from rust-lang#94212 doesn't currently work in MIRI. Copying in smaller pieces is probably much worse for its performance anyway, so it'd probably rather just use the simple path regardless.

Part of rust-lang#94371, though another PR will be needed for the CTFE aspect.

r? `@oli-obk` cc `@RalfJung`
2 parents: 3b1fe7e + b582bd3
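For orientation before the diffs, here is a minimal sketch of the gating pattern this commit applies. It is illustrative only: the function name, the fast-path threshold, and the direct call to the public `ptr::swap_nonoverlapping` are simplifications of what `mem::swap` actually does. The point is that the optimized block is compiled out under Miri (and on spirv), so those configurations always take the simple typed-read/typed-write path.

use core::ptr;

// Sketch only: the name, the threshold, and the use of the public
// `ptr::swap_nonoverlapping` are illustrative, not the library's internals.
pub fn swap_sketch<T>(x: &mut T, y: &mut T) {
    // Under Miri (or when building for spirv) this whole block is compiled
    // out, so those configurations fall straight through to the simple path.
    #[cfg(not(any(target_arch = "spirv", miri)))]
    {
        // Illustrative "type is large relative to its alignment" check.
        if core::mem::size_of::<T>() / core::mem::align_of::<T>() > 4 {
            // SAFETY: exclusive references are valid, aligned, and non-overlapping.
            unsafe { ptr::swap_nonoverlapping(x, y, 1) };
            return;
        }
    }

    // Simple path: two typed reads followed by two typed writes, which Miri
    // and CTFE can interpret directly.
    // SAFETY: exclusive references are valid, aligned, and non-overlapping,
    // and nothing here panics, so this is drop-safe.
    unsafe {
        let a = ptr::read(x);
        let b = ptr::read(y);
        ptr::write(x, b);
        ptr::write(y, a);
    }
}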

3 files changed (+64, -13 lines)


Diff for: library/core/src/mem/mod.rs

+22 -5

@@ -708,7 +708,10 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
     // understanding `mem::replace`, `Option::take`, etc. - a better overall
     // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
     // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(target_arch = "spirv"))]
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(any(target_arch = "spirv", miri)))]
     {
         // For types that are larger multiples of their alignment, the simple way
         // tends to copy the whole thing to stack rather than doing it one part
@@ -737,12 +740,26 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 #[inline]
 pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
+    // We arrange for this to typically be called with small types,
+    // so this reads-and-writes approach is actually better than using
+    // copy_nonoverlapping as it easily puts things in LLVM registers
+    // directly and doesn't end up inlining allocas.
+    // And LLVM actually optimizes it to 3×memcpy if called with
+    // a type larger than it's willing to keep in a register.
+    // Having typed reads and writes in MIR here is also good as
+    // it lets MIRI and CTFE understand them better, including things
+    // like enforcing type validity for them.
+    // Importantly, read+copy_nonoverlapping+write introduces confusing
+    // asymmetry to the behaviour where one value went through read+write
+    // whereas the other was copied over by the intrinsic (see #94371).
+
     // SAFETY: exclusive references are always valid to read/write,
-    // are non-overlapping, and nothing here panics so it's drop-safe.
+    // including being aligned, and nothing here panics so it's drop-safe.
     unsafe {
-        let z = ptr::read(x);
-        ptr::copy_nonoverlapping(y, x, 1);
-        ptr::write(y, z);
+        let a = ptr::read(x);
+        let b = ptr::read(y);
+        ptr::write(x, b);
+        ptr::write(y, a);
     }
 }
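The change above only affects which internal path `mem::swap` takes; its observable behaviour is unchanged. A quick stand-alone check (ordinary user code, not part of this commit) that passes the same way with or without the vectorized path:

fn main() {
    let mut a = [1u8; 1024];
    let mut b = [2u8; 1024];
    // Under Miri (and on spirv targets) this now always uses the simple
    // read-read-write-write path; elsewhere, suitably sized and aligned
    // types may still go through the block/chunk optimization.
    std::mem::swap(&mut a, &mut b);
    assert!(a.iter().all(|&byte| byte == 2));
    assert!(b.iter().all(|&byte| byte == 1));
}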

Diff for: library/core/src/ptr/mod.rs

+15 -8

@@ -419,6 +419,7 @@ pub const unsafe fn swap<T>(x: *mut T, y: *mut T) {
 #[stable(feature = "swap_nonoverlapping", since = "1.27.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
+    #[allow(unused)]
     macro_rules! attempt_swap_as_chunks {
         ($ChunkTy:ty) => {
             if mem::align_of::<T>() >= mem::align_of::<$ChunkTy>()
@@ -437,15 +438,21 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
         };
     }

-    // Split up the slice into small power-of-two-sized chunks that LLVM is able
-    // to vectorize (unless it's a special type with more-than-pointer alignment,
-    // because we don't want to pessimize things like slices of SIMD vectors.)
-    if mem::align_of::<T>() <= mem::size_of::<usize>()
-        && (!mem::size_of::<T>().is_power_of_two()
-            || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+    // NOTE(scottmcm) MIRI is disabled here as reading in smaller units is a
+    // pessimization for it. Also, if the type contains any unaligned pointers,
+    // copying those over multiple reads is difficult to support.
+    #[cfg(not(miri))]
     {
-        attempt_swap_as_chunks!(usize);
-        attempt_swap_as_chunks!(u8);
+        // Split up the slice into small power-of-two-sized chunks that LLVM is able
+        // to vectorize (unless it's a special type with more-than-pointer alignment,
+        // because we don't want to pessimize things like slices of SIMD vectors.)
+        if mem::align_of::<T>() <= mem::size_of::<usize>()
+            && (!mem::size_of::<T>().is_power_of_two()
+                || mem::size_of::<T>() > mem::size_of::<usize>() * 2)
+        {
+            attempt_swap_as_chunks!(usize);
+            attempt_swap_as_chunks!(u8);
+        }
     }

     // SAFETY: Same preconditions as this function
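For intuition about what `attempt_swap_as_chunks!` is gating, here is a rough, hypothetical sketch (not the library's macro) of swapping two non-overlapping regions as usize-sized chunks with a byte-wise tail. It assumes the caller guarantees validity, non-overlap, and at least usize alignment, and it moves `MaybeUninit` chunks so padding bytes are never asserted to be initialized; types containing unaligned pointers, the concern noted in the Miri comment above, would need more care.

use core::mem::{self, MaybeUninit};

// Hypothetical helper, for illustration only.
//
// SAFETY requirements assumed from the caller: `x` and `y` are valid for
// reads and writes of `count * size_of::<T>()` bytes, do not overlap, and
// are aligned to at least `align_of::<usize>()`.
pub unsafe fn swap_as_usize_chunks<T>(x: *mut T, y: *mut T, count: usize) {
    let bytes = mem::size_of::<T>() * count;
    let chunk = mem::size_of::<usize>();
    let full_chunks = bytes / chunk;

    // Swap the usize-sized prefix; a simple loop like this is what LLVM
    // can turn into vector loads and stores.
    let xu = x.cast::<MaybeUninit<usize>>();
    let yu = y.cast::<MaybeUninit<usize>>();
    for i in 0..full_chunks {
        let a = xu.add(i).read();
        let b = yu.add(i).read();
        xu.add(i).write(b);
        yu.add(i).write(a);
    }

    // Swap any remaining tail bytes one at a time.
    let xb = x.cast::<MaybeUninit<u8>>();
    let yb = y.cast::<MaybeUninit<u8>>();
    for i in (full_chunks * chunk)..bytes {
        let a = xb.add(i).read();
        let b = yb.add(i).read();
        xb.add(i).write(b);
        yb.add(i).write(a);
    }
}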

Diff for: src/test/codegen/swap-large-types.rs

+27 -0

@@ -39,6 +39,9 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
     swap(x, y)
 }

+// Verify that types with usize alignment are swapped via vectored usizes,
+// not falling back to byte-level code.
+
 // CHECK-LABEL: @swap_slice
 #[no_mangle]
 pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
@@ -50,6 +53,8 @@ pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
     }
 }

+// But for a large align-1 type, vectorized byte copying is what we want.
+
 type OneKilobyteBuffer = [u8; 1024];

 // CHECK-LABEL: @swap_1kb_slices
@@ -62,3 +67,25 @@ pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer])
         x.swap_with_slice(y);
     }
 }
+
+// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
+// for an unusual type like this. It's not clear whether we should do anything
+// smarter in Rust for these, so for now it's fine to leave these up to the backend.
+// That's not as bad as it might seem, as for example, LLVM will lower the
+// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
+// Eventually we'll be able to pass `align_of::<T>` to a const generic and
+// thus pick a smarter chunk size ourselves without huge code duplication.
+
+#[repr(align(64))]
+pub struct BigButHighlyAligned([u8; 64 * 3]);
+
+// CHECK-LABEL: @swap_big_aligned
+#[no_mangle]
+pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
+    // CHECK-NOT: call void @llvm.memcpy
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 64 dereferenceable(192)
+    // CHECK-NOT: call void @llvm.memcpy
+    swap(x, y)
+}
