Skip to content

Implement all vendor intrinsics used by regex on AVX2 systems #1378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,6 @@ jobs:
- name: Prepare dependencies
run: ./y.rs prepare

- name: Build without unstable features
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
# This is the config rust-lang/rust uses for builds
run: ./y.rs build --no-unstable-features

- name: Build
run: ./y.rs build --sysroot none

Expand All @@ -107,6 +101,18 @@ jobs:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test

- name: Install LLVM standard library
run: rustup target add ${{ matrix.env.TARGET_TRIPLE }}

# This is roughly config rust-lang/rust uses for testing
- name: Test with LLVM sysroot
# Skip native x86_64-pc-windows-gnu. It is way too slow and cross-compiled
# x86_64-pc-windows-gnu covers at least part of the tests.
if: matrix.os != 'windows-latest' || matrix.env.TARGET_TRIPLE != 'x86_64-pc-windows-gnu'
env:
TARGET_TRIPLE: ${{ matrix.env.TARGET_TRIPLE }}
run: ./y.rs test --sysroot llvm --no-unstable-features


# This job doesn't use cg_clif in any way. It checks that all cg_clif tests work with cg_llvm too.
test_llvm:
Expand Down
7 changes: 6 additions & 1 deletion example/alloc_example.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#![feature(start, core_intrinsics, alloc_error_handler)]
#![feature(start, core_intrinsics, alloc_error_handler, lang_items)]
#![no_std]

extern crate alloc;
Expand Down Expand Up @@ -27,6 +27,11 @@ fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
core::intrinsics::abort();
}

#[lang = "eh_personality"]
fn eh_personality() -> ! {
loop {}
}

#[start]
fn main(_argc: isize, _argv: *const *const u8) -> isize {
let world: Box<&str> = Box::new("Hello World!\0");
Expand Down
47 changes: 47 additions & 0 deletions example/std_example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ unsafe fn test_simd() {
test_mm_extract_epi8();
test_mm_insert_epi16();

test_mm256_shuffle_epi8();
test_mm256_permute2x128_si256();

#[rustfmt::skip]
let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
assert_eq!(mask1, 1);
Expand Down Expand Up @@ -293,6 +296,12 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
}
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) {
assert_eq!(std::mem::transmute::<_, [u64; 4]>(a), std::mem::transmute::<_, [u64; 4]>(b))
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse2")]
unsafe fn test_mm_cvtsi128_si64() {
Expand Down Expand Up @@ -336,6 +345,44 @@ unsafe fn test_mm_insert_epi16() {
assert_eq_m128i(r, e);
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_shuffle_epi8() {
#[rustfmt::skip]
let a = _mm256_setr_epi8(
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24,
25, 26, 27, 28, 29, 30, 31, 32,
);
#[rustfmt::skip]
let b = _mm256_setr_epi8(
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
12, 5, 5, 10, 4, 1, 8, 0,
);
#[rustfmt::skip]
let expected = _mm256_setr_epi8(
5, 0, 5, 4, 9, 13, 7, 4,
13, 6, 6, 11, 5, 2, 9, 1,
21, 0, 21, 20, 25, 29, 23, 20,
29, 22, 22, 27, 21, 18, 25, 17,
);
let r = _mm256_shuffle_epi8(a, b);
assert_eq_m256i(r, expected);
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn test_mm256_permute2x128_si256() {
let a = _mm256_setr_epi64x(100, 200, 500, 600);
let b = _mm256_setr_epi64x(300, 400, 700, 800);
let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
let e = _mm256_setr_epi64x(700, 800, 500, 600);
assert_eq_m256i(r, e);
}

fn test_checked_mul() {
let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
assert_eq!(u, None);
Expand Down
158 changes: 157 additions & 1 deletion src/intrinsics/llvm_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrli.d imm8 not const");
.expect("llvm.x86.sse2.pslli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.psrli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
Expand All @@ -120,6 +154,128 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.psrli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.w imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.w imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.pshuf.b" => {
let (a, b) = match args {
[a, b] => (a, b),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);

// Based on the pseudocode at https://github.com/rust-lang/stdarch/blob/1cfbca8b38fd9b4282b2f054f61c6ca69fc7ce29/crates/core_arch/src/x86/avx2.rs#L2319-L2332
let zero = fx.bcx.ins().iconst(types::I8, 0);
for i in 0..16 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let a_idx = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
for i in 16..32 {
let b_lane = b.value_lane(fx, i).load_scalar(fx);
let is_zero = fx.bcx.ins().band_imm(b_lane, 0x80);
let b_lane_masked = fx.bcx.ins().band_imm(b_lane, 0xf);
let a_idx = fx.bcx.ins().iadd_imm(b_lane_masked, 16);
let a_idx = fx.bcx.ins().uextend(fx.pointer_type, a_idx);
let a_lane = a.value_lane_dyn(fx, a_idx).load_scalar(fx);
let res = fx.bcx.ins().select(is_zero, zero, a_lane);
ret.place_lane(fx, i).to_ptr().store(fx, res, MemFlags::trusted());
}
}
"llvm.x86.avx2.vperm2i128" => {
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256
let (a, b, imm8) = match args {
[a, b, imm8] => (a, b, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let b = codegen_operand(fx, b);
let imm8 = codegen_operand(fx, imm8).load_scalar(fx);

let a_0 = a.value_lane(fx, 0).load_scalar(fx);
let a_1 = a.value_lane(fx, 1).load_scalar(fx);
let a_low = fx.bcx.ins().iconcat(a_0, a_1);
let a_2 = a.value_lane(fx, 2).load_scalar(fx);
let a_3 = a.value_lane(fx, 3).load_scalar(fx);
let a_high = fx.bcx.ins().iconcat(a_2, a_3);

let b_0 = b.value_lane(fx, 0).load_scalar(fx);
let b_1 = b.value_lane(fx, 1).load_scalar(fx);
let b_low = fx.bcx.ins().iconcat(b_0, b_1);
let b_2 = b.value_lane(fx, 2).load_scalar(fx);
let b_3 = b.value_lane(fx, 3).load_scalar(fx);
let b_high = fx.bcx.ins().iconcat(b_2, b_3);

fn select4(
fx: &mut FunctionCx<'_, '_, '_>,
a_high: Value,
a_low: Value,
b_high: Value,
b_low: Value,
control: Value,
) -> Value {
let a_or_b = fx.bcx.ins().band_imm(control, 0b0010);
let high_or_low = fx.bcx.ins().band_imm(control, 0b0001);
let is_zero = fx.bcx.ins().band_imm(control, 0b1000);

let zero = fx.bcx.ins().iconst(types::I64, 0);
let zero = fx.bcx.ins().iconcat(zero, zero);

let res_a = fx.bcx.ins().select(high_or_low, a_high, a_low);
let res_b = fx.bcx.ins().select(high_or_low, b_high, b_low);
let res = fx.bcx.ins().select(a_or_b, res_b, res_a);
fx.bcx.ins().select(is_zero, zero, res)
}

let control0 = imm8;
let res_low = select4(fx, a_high, a_low, b_high, b_low, control0);
let (res_0, res_1) = fx.bcx.ins().isplit(res_low);

let control1 = fx.bcx.ins().ushr_imm(imm8, 4);
let res_high = select4(fx, a_high, a_low, b_high, b_low, control1);
let (res_2, res_3) = fx.bcx.ins().isplit(res_high);

ret.place_lane(fx, 0).to_ptr().store(fx, res_0, MemFlags::trusted());
ret.place_lane(fx, 1).to_ptr().store(fx, res_1, MemFlags::trusted());
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
}
"llvm.x86.sse2.storeu.dq" => {
intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
let mem_addr = mem_addr.load_scalar(fx);
Expand Down
21 changes: 21 additions & 0 deletions src/value_and_place.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,27 @@ impl<'tcx> CValue<'tcx> {
}
}

/// Like [`CValue::value_lane`] except allowing a dynamically calculated lane index.
pub(crate) fn value_lane_dyn(
self,
fx: &mut FunctionCx<'_, '_, 'tcx>,
lane_idx: Value,
) -> CValue<'tcx> {
let layout = self.1;
assert!(layout.ty.is_simd());
let (_lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
let lane_layout = fx.layout_of(lane_ty);
match self.0 {
CValueInner::ByVal(_) | CValueInner::ByValPair(_, _) => unreachable!(),
CValueInner::ByRef(ptr, None) => {
let field_offset = fx.bcx.ins().imul_imm(lane_idx, lane_layout.size.bytes() as i64);
let field_ptr = ptr.offset_value(fx, field_offset);
CValue::by_ref(field_ptr, lane_layout)
}
CValueInner::ByRef(_, Some(_)) => unreachable!(),
}
}

/// If `ty` is signed, `const_val` must already be sign extended.
pub(crate) fn const_val(
fx: &mut FunctionCx<'_, '_, 'tcx>,
Expand Down