Skip to content

Commit e1b020d

Browse files
committed
Use load-store instead of memcpy for short integer arrays
1 parent cce0b52 commit e1b020d

File tree

8 files changed

+136
-16
lines changed

8 files changed

+136
-16
lines changed

Diff for: compiler/rustc_codegen_llvm/src/type_.rs

+3
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,9 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
288288
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
289289
ty.llvm_type(self)
290290
}
291+
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
292+
layout.scalar_copy_llvm_type(self)
293+
}
291294
}
292295

293296
impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {

Diff for: compiler/rustc_codegen_llvm/src/type_of.rs

+33
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use rustc_middle::bug;
66
use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
77
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
88
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
9+
use rustc_target::abi::HasDataLayout;
910
use rustc_target::abi::{Abi, Align, FieldsShape};
1011
use rustc_target::abi::{Int, Pointer, F32, F64};
1112
use rustc_target::abi::{PointeeInfo, Scalar, Size, TyAbiInterface, Variants};
@@ -192,6 +193,7 @@ pub trait LayoutLlvmExt<'tcx> {
192193
) -> &'a Type;
193194
fn llvm_field_index<'a>(&self, cx: &CodegenCx<'a, 'tcx>, index: usize) -> u64;
194195
fn pointee_info_at<'a>(&self, cx: &CodegenCx<'a, 'tcx>, offset: Size) -> Option<PointeeInfo>;
196+
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
195197
}
196198

197199
impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@@ -414,4 +416,35 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
414416
cx.pointee_infos.borrow_mut().insert((self.ty, offset), result);
415417
result
416418
}
419+
420+
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
421+
debug_assert!(self.is_sized());
422+
423+
// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
424+
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
425+
// (like AVX2 registers) seems at least like a tolerable starting point.
426+
let threshold = cx.data_layout().pointer_size * 4;
427+
if self.layout.size() > threshold {
428+
return None;
429+
}
430+
431+
// Vectors, even for non-power-of-two sizes, have the same layout as
432+
// arrays but don't count as aggregate types
433+
if let FieldsShape::Array { count, .. } = self.layout.fields()
434+
&& let element = self.field(cx, 0)
435+
&& element.ty.is_integral()
436+
{
437+
// `cx.type_ix(bits)` is tempting here, but while that works great
438+
// for things that *stay* as memory-to-memory copies, it also ends
439+
// up suppressing vectorization as it introduces shifts when it
440+
// extracts all the individual values.
441+
442+
let ety = element.llvm_type(cx);
443+
return Some(cx.type_vector(ety, *count));
444+
}
445+
446+
// FIXME: The above only handled integer arrays; surely more things
447+
// would also be possible. Be careful about provenance, though!
448+
None
449+
}
417450
}

Diff for: compiler/rustc_codegen_ssa/src/base.rs

+13-1
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,19 @@ pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
380380
return;
381381
}
382382

383-
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
383+
if flags == MemFlags::empty()
384+
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
385+
{
386+
// I look forward to only supporting opaque pointers
387+
let pty = bx.type_ptr_to(bty);
388+
let src = bx.pointercast(src, pty);
389+
let dst = bx.pointercast(dst, pty);
390+
391+
let temp = bx.load(bty, src, src_align);
392+
bx.store(temp, dst, dst_align);
393+
} else {
394+
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
395+
}
384396
}
385397

386398
pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(

Diff for: compiler/rustc_codegen_ssa/src/traits/type_.rs

+22
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,28 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
126126
index: usize,
127127
immediate: bool,
128128
) -> Self::Type;
129+
130+
/// A type that can be used in a [`super::BuilderMethods::load`] +
131+
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
132+
/// such as a MIR `*_0 = *_1`.
133+
///
134+
/// It's always legal to return `None` here, as the provided impl does,
135+
/// in which case callers should use [`super::BuilderMethods::memcpy`]
136+
/// instead of the `load`+`store` pair.
137+
///
138+
/// This can be helpful for things like arrays, where the LLVM backend type
139+
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
140+
/// instead be copied via an `i48` that stays as the single `load`+`store`.
141+
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
142+
/// cases, due to `poison` handling, but in codegen we have more information
143+
/// about the type invariants, so can emit something better instead.)
144+
///
145+
/// This *should* return `None` for particularly-large types, where leaving
146+
/// the `memcpy` may well be important to avoid code size explosion.
147+
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
148+
let _ = layout;
149+
None
150+
}
129151
}
130152

131153
// For backends that support CFI using type membership (i.e., testing whether a given pointer is

Diff for: tests/codegen/array-codegen.rs

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// compile-flags: -O -C no-prepopulate-passes
2+
// min-llvm-version: 15.0 (for opaque pointers)
3+
4+
#![crate_type = "lib"]
5+
6+
// CHECK-LABEL: @array_load
7+
#[no_mangle]
8+
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
9+
// CHECK: %0 = alloca [4 x i8], align 1
10+
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
11+
// CHECK: store <4 x i8> %[[TEMP1]], ptr %0, align 1
12+
// CHECK: %[[TEMP2:.+]] = load i32, ptr %0, align 1
13+
// CHECK: ret i32 %[[TEMP2]]
14+
*a
15+
}
16+
17+
// CHECK-LABEL: @array_store
18+
#[no_mangle]
19+
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
20+
// CHECK: %a = alloca [4 x i8]
21+
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
22+
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
23+
*p = a;
24+
}
25+
26+
// CHECK-LABEL: @array_copy
27+
#[no_mangle]
28+
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
29+
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
30+
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
31+
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
32+
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
33+
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
34+
*p = *a;
35+
}

Diff for: tests/codegen/mem-replace-simple-type.rs

+11
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,14 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str {
3232
// CHECK: ret { ptr, i64 } %[[P2]]
3333
std::mem::replace(r, v)
3434
}
35+
36+
#[no_mangle]
37+
// CHECK-LABEL: @replace_short_array(
38+
pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
39+
// CHECK-NOT: alloca
40+
// CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4
41+
// CHECK: store <3 x i32> %[[R]], ptr %0
42+
// CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4
43+
// CHECK: store <3 x i32> %[[V]], ptr %r
44+
std::mem::replace(r, v)
45+
}

Diff for: tests/codegen/swap-simd-types.rs

+9
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,12 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
3030
x.swap_with_slice(y);
3131
}
3232
}
33+
34+
// CHECK-LABEL: @swap_bytes32
35+
#[no_mangle]
36+
pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
37+
// CHECK-NOT: alloca
38+
// CHECK: load <32 x i8>{{.+}}align 1
39+
// CHECK: store <32 x i8>{{.+}}align 1
40+
swap(x, y)
41+
}

Diff for: tests/codegen/swap-small-types.rs

+10-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// compile-flags: -O
1+
// compile-flags: -O -Z merge-functions=disabled
22
// only-x86_64
33
// ignore-debug: the debug assertions get in the way
44

@@ -12,13 +12,10 @@ type RGB48 = [u16; 3];
1212
#[no_mangle]
1313
pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
1414
// CHECK-NOT: alloca
15-
// CHECK: %temp = alloca [3 x i16]
16-
// CHECK-NOT: alloca
17-
// CHECK-NOT: call void @llvm.memcpy
18-
// CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %temp, {{.+}} %x, {{.+}} 6, {{.+}})
19-
// CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %x, {{.+}} %y, {{.+}} 6, {{.+}})
20-
// CHECK: call void @llvm.memcpy.{{.+}}({{.+}} %y, {{.+}} %temp, {{.+}} 6, {{.+}})
21-
// CHECK-NOT: call void @llvm.memcpy
15+
// CHECK: %[[TEMP0:.+]] = load <3 x i16>, ptr %x, align 2
16+
// CHECK: %[[TEMP1:.+]] = load <3 x i16>, ptr %y, align 2
17+
// CHECK: store <3 x i16> %[[TEMP1]], ptr %x, align 2
18+
// CHECK: store <3 x i16> %[[TEMP0]], ptr %y, align 2
2219

2320
let temp = *x;
2421
*x = *y;
@@ -28,13 +25,11 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
2825
// CHECK-LABEL: @swap_rgb48
2926
#[no_mangle]
3027
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
31-
// FIXME MIR inlining messes up LLVM optimizations.
32-
// If these checks start failing, please update this test.
33-
// CHECK: alloca [3 x i16]
34-
// CHECK: call void @llvm.memcpy
35-
// WOULD-CHECK-NOT: alloca
36-
// WOULD-CHECK: load i48
37-
// WOULD-CHECK: store i48
28+
// CHECK-NOT: alloca
29+
// CHECK: load <3 x i16>
30+
// CHECK: load <3 x i16>
31+
// CHECK: store <3 x i16>
32+
// CHECK: store <3 x i16>
3833
swap(x, y)
3934
}
4035

0 commit comments

Comments
 (0)