
Commit 204c64b

Merge pull request #1378 from bjorn3/more_vendor_intrinsics
Implement all vendor intrinsics used by regex on AVX2 systems
2 parents e369cce + 8fbd6f5 commit 204c64b

5 files changed, +243 -8 lines changed

.github/workflows/main.yml

+12 -6

@@ -93,12 +93,6 @@ jobs:
     - name: Prepare dependencies
       run: ./y.rs prepare
 
-    - name: Build without unstable features
-      env:
-        TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
-      # This is the config rust-lang/rust uses for builds
-      run: ./y.rs build --no-unstable-features
-
     - name: Build
       run: ./y.rs build --sysroot none
 
@@ -107,6 +101,18 @@ jobs:
         TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
       run: ./y.rs test
 
+    - name: Install LLVM standard library
+      run: rustup target add ${{ matrix.env.TARGET_TRIPLE }}
+
+    # This is roughly config rust-lang/rust uses for testing
+    - name: Test with LLVM sysroot
+      # Skip native x86_64-pc-windows-gnu. It is way too slow and cross-compiled
+      # x86_64-pc-windows-gnu covers at least part of the tests.
+      if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu'
+      env:
+        TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
+      run: ./y.rs test --sysroot llvm --no-unstable-features
+
 
   # This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too.
   test_llvm:

example/alloc_example.rs

+6 -1

@@ -1,4 +1,4 @@
-#![feature(start, core_intrinsics, alloc_error_handler)]
+#![feature(start, core_intrinsics, alloc_error_handler, lang_items)]
 #![no_std]
 
 extern crate alloc;
@@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
     core::intrinsics::abort();
 }
 
+#[lang = "eh_personality"]
+fn eh_personality() -> ! {
+    loop {}
+}
+
 #[start]
 fn main(_argc: isize, _argv: *const *const u8) -> isize {
     let world: Box<&str> = Box::new("Hello World!\0");

example/std_example.rs

+47

@@ -198,6 +198,9 @@ unsafe fn test_simd() {
     test_mm_extract_epi8();
     test_mm_insert_epi16();
 
+    test_mm256_shuffle_epi8();
+    test_mm256_permute2x128_si256();
+
     #[rustfmt::skip]
     let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
     assert_eq!(mask1, 1);
@@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
     }
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx")]
+pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
+    assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
+}
+
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "sse2")]
 unsafe fn test_mm_cvtsi128_si64() {
@@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
     assert_eq_m128i(r, e);
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm256_shuffle_epi8() {
+    #[rustfmt::skip]
+    let a = _mm256_setr_epi8(
+        1, 2, 3, 4, 5, 6, 7, 8,
+        9, 10, 11, 12, 13, 14, 15, 16,
+        17, 18, 19, 20, 21, 22, 23, 24,
+        25, 26, 27, 28, 29, 30, 31, 32,
+    );
+    #[rustfmt::skip]
+    let b = _mm256_setr_epi8(
+        4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+        12, 5, 5, 10, 4, 1, 8, 0,
+        4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
+        12, 5, 5, 10, 4, 1, 8, 0,
+    );
+    #[rustfmt::skip]
+    let expected = _mm256_setr_epi8(
+        5, 0, 5, 4, 9, 13, 7, 4,
+        13, 6, 6, 11, 5, 2, 9, 1,
+        21, 0, 21, 20, 25, 29, 23, 20,
+        29, 22, 22, 27, 21, 18, 25, 17,
+    );
+    let r = _mm256_shuffle_epi8(a, b);
+    assert_eq_m256i(r, expected);
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm256_permute2x128_si256() {
+    let a = _mm256_setr_epi64x(100, 200, 500, 600);
+    let b = _mm256_setr_epi64x(300, 400, 700, 800);
+    let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
+    let e = _mm256_setr_epi64x(700, 800, 500, 600);
+    assert_eq_m256i(r, e);
+}
+
 fn test_checked_mul() {
     let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
     assert_eq!(u, None);
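
Note (not part of the commit): the control byte 0b00_01_00_11 used in test_mm256_permute2x128_si256 above can be read with a small scalar model of _mm256_permute2x128_si256. Each nibble picks one 128-bit half of the result (bits 0-1: 0 = a.low, 1 = a.high, 2 = b.low, 3 = b.high; bit 3 zeroes the half). The sketch below is illustrative only, and the name permute2x128 is made up for it.

// Scalar model over [i64; 4], treating each vector as two 128-bit halves.
fn permute2x128(a: [i64; 4], b: [i64; 4], imm8: u8) -> [i64; 4] {
    let half = |sel: u8| -> [i64; 2] {
        if sel & 0b1000 != 0 {
            return [0, 0]; // bit 3 of the nibble zeroes this half
        }
        match sel & 0b0011 {
            0 => [a[0], a[1]], // a.low
            1 => [a[2], a[3]], // a.high
            2 => [b[0], b[1]], // b.low
            _ => [b[2], b[3]], // b.high
        }
    };
    let lo = half(imm8 & 0x0f); // low nibble -> low half of the result
    let hi = half(imm8 >> 4);   // high nibble -> high half of the result
    [lo[0], lo[1], hi[0], hi[1]]
}

fn main() {
    let a = [100, 200, 500, 600];
    let b = [300, 400, 700, 800];
    // 0b00_01_00_11: low nibble 3 -> b.high (700, 800); high nibble 1 -> a.high (500, 600).
    assert_eq!(permute2x128(a, b, 0b00_01_00_11), [700, 800, 500, 600]);
}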

src/intrinsics/llvm_x86.rs

+157 -1

@@ -110,7 +110,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             };
             let a = codegen_operand(fx, a);
             let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
-                .expect("llvm.x86.sse2.psrli.d imm8 not const");
+                .expect("llvm.x86.sse2.pslli.d imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx.psrli.d" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.psrli.d imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx.pslli.d" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.pslli.d imm8 not const");
 
             simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
                 .try_to_bits(Size::from_bytes(4))
@@ -120,6 +154,128 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
                 _ => fx.bcx.ins().iconst(types::I32, 0),
             });
         }
+        "llvm.x86.avx2.psrli.w" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.psrli.w imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx2.pslli.w" => {
+            let (a, imm8) = match args {
+                [a, imm8] => (a, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
+                .expect("llvm.x86.avx.pslli.w imm8 not const");
+
+            simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
+                .try_to_bits(Size::from_bytes(4))
+                .unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
+            {
+                imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
+                _ => fx.bcx.ins().iconst(types::I32, 0),
+            });
+        }
+        "llvm.x86.avx2.pshuf.b" => {
+            let (a, b) = match args {
+                [a, b] => (a, b),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let b = codegen_operand(fx, b);
+
+            // Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
+            let zero = fx.bcx.ins().iconst(types::I8, 0);
+            for i in 0..16 {
+                let b_lane = b.value_lane(fx, i).load_scalar(fx);
+                let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
+                let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
+                let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
+                let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, a_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+            for i in 16..32 {
+                let b_lane = b.value_lane(fx, i).load_scalar(fx);
+                let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
+                let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
+                let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
+                let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
+                let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
+                let res = fx.bcx.ins().select(is_zero, zero, a_lane);
+                ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
+            }
+        }
+        "llvm.x86.avx2.vperm2i128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
+            let (a, b, imm8) = match args {
+                [a, b, imm8] => (a, b, imm8),
+                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
+            };
+            let a = codegen_operand(fx, a);
+            let b = codegen_operand(fx, b);
+            let imm8 = codegen_operand(fx, imm8).load_scalar(fx);
+
+            let a_0 = a.value_lane(fx, 0).load_scalar(fx);
+            let a_1 = a.value_lane(fx, 1).load_scalar(fx);
+            let a_low = fx.bcx.ins().iconcat(a_0, a_1);
+            let a_2 = a.value_lane(fx, 2).load_scalar(fx);
+            let a_3 = a.value_lane(fx, 3).load_scalar(fx);
+            let a_high = fx.bcx.ins().iconcat(a_2, a_3);
+
+            let b_0 = b.value_lane(fx, 0).load_scalar(fx);
+            let b_1 = b.value_lane(fx, 1).load_scalar(fx);
+            let b_low = fx.bcx.ins().iconcat(b_0, b_1);
+            let b_2 = b.value_lane(fx, 2).load_scalar(fx);
+            let b_3 = b.value_lane(fx, 3).load_scalar(fx);
+            let b_high = fx.bcx.ins().iconcat(b_2, b_3);
+
+            fn select4(
+                fx: &mut FunctionCx<'_, '_, '_>,
+                a_high: Value,
+                a_low: Value,
+                b_high: Value,
+                b_low: Value,
+                control: Value,
+            ) -> Value {
+                let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
+                let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
+                let is_zero = fx.bcx.ins().band_imm(control, 0b1000);
+
+                let zero = fx.bcx.ins().iconst(types::I64, 0);
+                let zero = fx.bcx.ins().iconcat(zero, zero);
+
+                let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
+                let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
+                let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
+                fx.bcx.ins().select(is_zero, zero, res)
+            }
+
+            let control0 = imm8;
+            let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
+            let (res_0, res_1) = fx.bcx.ins().isplit(res_low);
+
+            let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
+            let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
+            let (res_2, res_3) = fx.bcx.ins().isplit(res_high);
+
+            ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
+            ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
+            ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
+            ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
+        }
         "llvm.x86.sse2.storeu.dq" => {
             intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
             let mem_addr = mem_addr.load_scalar(fx);
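
Note (not part of the commit): the llvm.x86.avx2.pshuf.b lowering above follows the referenced stdarch pseudocode, shuffling each 128-bit half independently and zeroing any output byte whose selector has its high bit set. A minimal scalar model of that behaviour, for reference only:

// Scalar model of _mm256_shuffle_epi8 over byte arrays. The two 16-byte halves
// are shuffled independently, mirroring the 0..16 and 16..32 loops in the lowering.
fn shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
    let mut r = [0u8; 32];
    for i in 0..32 {
        let sel = b[i];
        if sel & 0x80 == 0 {
            // Indices stay within the same 128-bit half as the output byte.
            let base = if i < 16 { 0 } else { 16 };
            r[i] = a[base + (sel & 0x0f) as usize];
        }
        // A set high bit in the selector leaves this output byte at zero.
    }
    r
}

Fed the a and b vectors from test_mm256_shuffle_epi8 in example/std_example.rs, this model reproduces that test's expected vector.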

src/value_and_place.rs

+21

@@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> {
         }
     }
 
+    /// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
+    pub(crate) fn value_lane_dyn(
+        self,
+        fx: &mut FunctionCx<'_, '_, 'tcx>,
+        lane_idx: Value,
+    ) -> CValue<'tcx> {
+        let layout = self.1;
+        assert!(layout.ty.is_simd());
+        let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+        let lane_layout = fx.layout_of(lane_ty);
+        match self.0 {
+            CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
+            CValueInner::ByRef(ptr, None) => {
+                let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
+                let field_ptr = ptr.offset_value(fx, field_offset);
+                CValue::by_ref(field_ptr, lane_layout)
+            }
+            CValueInner::ByRef(_, Some(_)) => unreachable!(),
+        }
+    }
+
     /// If `ty` is signed, `const_val` must already be sign extended.
     pub(crate) fn const_val(
         fx: &mut FunctionCx<'_, '_, 'tcx>,
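
Note (not part of the commit): value_lane_dyn only supports the ByRef representation because a runtime lane index has to be turned into an address, base pointer plus lane_idx * lane_size (the imul_imm plus offset_value above); a ByVal vector has no address to offset into. A plain-Rust analogue of that address computation, purely illustrative:

// Lane i of a vector stored in memory lives at base + i * size_of::<Lane>().
unsafe fn read_lane_dyn<Lane: Copy>(base: *const Lane, lane_idx: usize) -> Lane {
    *base.add(lane_idx)
}

fn main() {
    let v: [u8; 32] = core::array::from_fn(|i| i as u8);
    let lane = unsafe { read_lane_dyn(v.as_ptr(), 20) };
    assert_eq!(lane, 20);
}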
