Implement a whole bunch more x86 vendor intrinsics #1380

Merged · 11 commits · Jun 23, 2023
274 changes: 242 additions & 32 deletions src/intrinsics/llvm_x86.rs
@@ -18,6 +18,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
// Spin loop hint
}

// Used by is_x86_feature_detected!()
"llvm.x86.xgetbv" => {
// FIXME use the actual xgetbv instruction
intrinsic_args!(fx, args => (v); intrinsic);

let v = v.load_scalar(fx);

// As of writing only XCR0 exists
fx.bcx.ins().trapnz(v, TrapCode::UnreachableCodeReached);

let res = fx.bcx.ins().iconst(types::I64, 1 /* bit 0 must be set */);
ret.write_cvalue(fx, CValue::by_val(res, fx.layout_of(fx.tcx.types.i64)));
}
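// Sketch of what this emulates: `xgetbv` with ECX = 0 reads XCR0, whose
// bit 0 (x87 state) is architecturally always 1. Reporting only that bit
// makes probes like `std::arch::is_x86_feature_detected!("avx2")` conclude
// that the OS has not enabled AVX state, so they return a conservative
// "not detected" answer instead of trapping.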

// Used by `_mm_movemask_epi8` and `_mm256_movemask_epi8`
"llvm.x86.sse2.pmovmskb.128"
| "llvm.x86.avx2.pmovmskb"
@@ -53,7 +67,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
let res = CValue::by_val(res, fx.layout_of(fx.tcx.types.i32));
ret.write_cvalue(fx, res);
}
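// Sketch of the semantics handled above: movemask packs the sign bit of
// each lane into the low bits of an i32, so `_mm_movemask_epi8` of a
// vector whose 16 byte lanes all have the high bit set yields 0xFFFF.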
"llvm.x86.sse2.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
"llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
let (x, y, kind) = match args {
[x, y, kind] => (x, y, kind),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
@@ -66,18 +80,95 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
let flt_cc = match kind
.try_to_bits(Size::from_bytes(1))
.unwrap_or_else(|| panic!("kind not scalar: {:?}", kind))
.try_into()
.unwrap()
{
0 => FloatCC::Equal,
1 => FloatCC::LessThan,
2 => FloatCC::LessThanOrEqual,
7 => FloatCC::Ordered,
3 => FloatCC::Unordered,
4 => FloatCC::NotEqual,
5 => FloatCC::UnorderedOrGreaterThanOrEqual,
6 => FloatCC::UnorderedOrGreaterThan,
_CMP_EQ_OQ | _CMP_EQ_OS => FloatCC::Equal,
_CMP_LT_OS | _CMP_LT_OQ => FloatCC::LessThan,
_CMP_LE_OS | _CMP_LE_OQ => FloatCC::LessThanOrEqual,
_CMP_UNORD_Q | _CMP_UNORD_S => FloatCC::Unordered,
_CMP_NEQ_UQ | _CMP_NEQ_US => FloatCC::NotEqual,
_CMP_NLT_US | _CMP_NLT_UQ => FloatCC::UnorderedOrGreaterThanOrEqual,
_CMP_NLE_US | _CMP_NLE_UQ => FloatCC::UnorderedOrGreaterThan,
_CMP_ORD_Q | _CMP_ORD_S => FloatCC::Ordered,
_CMP_EQ_UQ | _CMP_EQ_US => FloatCC::UnorderedOrEqual,
_CMP_NGE_US | _CMP_NGE_UQ => FloatCC::UnorderedOrLessThan,
_CMP_NGT_US | _CMP_NGT_UQ => FloatCC::UnorderedOrLessThanOrEqual,
_CMP_FALSE_OQ | _CMP_FALSE_OS => todo!(),
_CMP_NEQ_OQ | _CMP_NEQ_OS => FloatCC::OrderedNotEqual,
_CMP_GE_OS | _CMP_GE_OQ => FloatCC::GreaterThanOrEqual,
_CMP_GT_OS | _CMP_GT_OQ => FloatCC::GreaterThan,
_CMP_TRUE_UQ | _CMP_TRUE_US => todo!(),

kind => unreachable!("kind {:?}", kind),
};
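// Example of the mapping above (a sketch): stdarch's
// `_mm_cmp_ps::<_CMP_NLT_US>` passes imm8 = 0x05, which lowers to
// FloatCC::UnorderedOrGreaterThanOrEqual, the IEEE negation of LessThan.
// The signaling/quiet pairs (_OS/_OQ, _US/_UQ) collapse to the same
// Cranelift condition because `fcmp` never raises floating-point
// exceptions.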

// Copied from stdarch
/// Equal (ordered, non-signaling)
const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
const _CMP_TRUE_US: i32 = 0x1f;

simd_pair_for_each_lane(fx, x, y, ret, &|fx, lane_ty, res_lane_ty, x_lane, y_lane| {
let res_lane = match lane_ty.kind() {
ty::Float(_) => fx.bcx.ins().fcmp(flt_cc, x_lane, y_lane),
@@ -103,6 +194,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.sse2.psrai.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrai.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().sshr_imm(lane, i64::from(imm8 as u8)),
// Counts of 32 or more fill each lane with copies of its sign bit.
_ => fx.bcx.ins().sshr_imm(lane, 31),
});
}
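// Worked example (sketch): an arithmetic shift preserves the sign, so a
// lane holding -16i32 (0xFFFF_FFF0) with imm8 = 2 becomes -4 (0xFFFF_FFFC)
// via `sshr_imm(lane, 2)`; counts of 32 or more saturate to a shift by 31,
// leaving 0 or -1 depending on the lane's sign.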
"llvm.x86.sse2.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
@@ -137,6 +245,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.sse2.psrai.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.sse2.psrai.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().sshr_imm(lane, i64::from(imm8 as u8)),
// Counts of 16 or more fill each lane with copies of its sign bit.
_ => fx.bcx.ins().sshr_imm(lane, 15),
});
}
"llvm.x86.sse2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
@@ -171,6 +296,57 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx.psrai.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrai.d imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 32 => fx.bcx.ins().sshr_imm(lane, i64::from(imm8 as u8)),
// Counts of 32 or more fill each lane with copies of its sign bit.
_ => fx.bcx.ins().sshr_imm(lane, 31),
});
}
"llvm.x86.sse2.psrli.q" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrli.q imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 64 => fx.bcx.ins().ushr_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.sse2.pslli.q" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.pslli.q imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 64 => fx.bcx.ins().ishl_imm(lane, i64::from(imm8 as u8)),
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
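// Usage sketch for the logical-shift pair above: `_mm_srli_epi64::<3>`
// divides each unsigned 64-bit lane by 8 via `ushr_imm(lane, 3)`, and
// `_mm_slli_epi64::<3>` multiplies it by 8 via `ishl_imm(lane, 3)`;
// counts of 64 or more zero the lane, matching the hardware behaviour for
// logical shifts.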
"llvm.x86.avx.pslli.d" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
@@ -205,6 +381,23 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
_ => fx.bcx.ins().iconst(types::I32, 0),
});
}
"llvm.x86.avx2.psrai.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);
let imm8 = crate::constant::mir_operand_get_const_val(fx, imm8)
.expect("llvm.x86.avx.psrai.w imm8 not const");

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| match imm8
.try_to_bits(Size::from_bytes(4))
.unwrap_or_else(|| panic!("imm8 not scalar: {:?}", imm8))
{
imm8 if imm8 < 16 => fx.bcx.ins().sshr_imm(lane, i64::from(imm8 as u8)),
// Counts of 16 or more fill each lane with copies of its sign bit.
_ => fx.bcx.ins().sshr_imm(lane, 15),
});
}
"llvm.x86.avx2.pslli.w" => {
let (a, imm8) = match args {
[a, imm8] => (a, imm8),
@@ -313,25 +506,53 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
ret.place_lane(fx, 2).to_ptr().store(fx, res_2, MemFlags::trusted());
ret.place_lane(fx, 3).to_ptr().store(fx, res_3, MemFlags::trusted());
}
"llvm.x86.sse2.storeu.dq" => {
"llvm.x86.sse2.storeu.dq" | "llvm.x86.sse2.storeu.pd" => {
intrinsic_args!(fx, args => (mem_addr, a); intrinsic);
let mem_addr = mem_addr.load_scalar(fx);

// FIXME correctly handle the unaligned case
let dest = CPlace::for_ptr(Pointer::new(mem_addr), a.layout());
dest.write_cvalue(fx, a);
}
"llvm.x86.addcarry.64" => {
"llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
let a = match args {
[a] => a,
_ => bug!("wrong number of args for intrinsic {intrinsic}"),
};
let a = codegen_operand(fx, a);

simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| {
fx.bcx.ins().iabs(lane)
});
}
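// Usage sketch: SSSE3's `_mm_abs_epi8/epi16/epi32` lower to these
// intrinsics; a lane holding -7 becomes 7 via Cranelift's `iabs`. As on
// the hardware, the minimum value (e.g. i8::MIN) has no positive
// counterpart and wraps to itself.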
"llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => {
intrinsic_args!(fx, args => (c_in, a, b); intrinsic);
let c_in = c_in.load_scalar(fx);

llvm_add_sub(fx, BinOp::Add, ret, c_in, a, b);
let (cb_out, c) = llvm_add_sub(fx, BinOp::Add, c_in, a, b);

let layout = fx.layout_of(fx.tcx.mk_tup(&[fx.tcx.types.u8, a.layout().ty]));
let val = CValue::by_val_pair(cb_out, c, layout);
ret.write_cvalue(fx, val);
}
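// Contract sketch: `_addcarry_u64(c_in, a, b, &mut out)` returns the
// carry-out, e.g. `_addcarry_u64(1, u64::MAX, 0, &mut out)` leaves
// `out == 0` and returns 1. The LLVM intrinsic yields the (carry, sum)
// pair directly, materialised here as a two-field tuple.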
"llvm.x86.subborrow.64" => {
"llvm.x86.addcarryx.u32" | "llvm.x86.addcarryx.u64" => {
intrinsic_args!(fx, args => (c_in, a, b, out); intrinsic);
let c_in = c_in.load_scalar(fx);

let (cb_out, c) = llvm_add_sub(fx, BinOp::Add, c_in, a, b);

Pointer::new(out.load_scalar(fx)).store(fx, c, MemFlags::trusted());
ret.write_cvalue(fx, CValue::by_val(cb_out, fx.layout_of(fx.tcx.types.u8)));
}
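// llvm.x86.addcarryx.u32/u64 (the ADX `_addcarryx_*` intrinsics) differ
// from llvm.x86.addcarry.* only in ABI: the sum is written through the
// `out` pointer and only the carry byte is returned; the arithmetic is the
// same double-add carry chain.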
"llvm.x86.subborrow.32" | "llvm.x86.subborrow.64" => {
intrinsic_args!(fx, args => (b_in, a, b); intrinsic);
let b_in = b_in.load_scalar(fx);

llvm_add_sub(fx, BinOp::Sub, ret, b_in, a, b);
let (cb_out, c) = llvm_add_sub(fx, BinOp::Sub, b_in, a, b);

let layout = fx.layout_of(fx.tcx.mk_tup(&[fx.tcx.types.u8, a.layout().ty]));
let val = CValue::by_val_pair(cb_out, c, layout);
ret.write_cvalue(fx, val);
}
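// Borrow sketch: `_subborrow_u64(1, 0, 0, &mut out)` computes 0 - 0 - 1,
// leaving `out == u64::MAX` and returning a borrow of 1.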
_ => {
fx.tcx
@@ -356,37 +577,26 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
fn llvm_add_sub<'tcx>(
fx: &mut FunctionCx<'_, '_, 'tcx>,
bin_op: BinOp,
ret: CPlace<'tcx>,
cb_in: Value,
a: CValue<'tcx>,
b: CValue<'tcx>,
) {
assert_eq!(
a.layout().ty,
fx.tcx.types.u64,
"llvm.x86.addcarry.64/llvm.x86.subborrow.64 second operand must be u64"
);
assert_eq!(
b.layout().ty,
fx.tcx.types.u64,
"llvm.x86.addcarry.64/llvm.x86.subborrow.64 third operand must be u64"
);
) -> (Value, Value) {
assert_eq!(a.layout().ty, b.layout().ty);

// a + b or a - b -> partial result c and the first intermediate carry or borrow
let int0 = crate::num::codegen_checked_int_binop(fx, bin_op, a, b);
let c = int0.value_field(fx, FieldIdx::new(0));
let cb0 = int0.value_field(fx, FieldIdx::new(1)).load_scalar(fx);

// c + carry-in or c - borrow-in -> final result and the second intermediate carry or borrow
let cb_in_as_u64 = fx.bcx.ins().uextend(types::I64, cb_in);
let cb_in_as_u64 = CValue::by_val(cb_in_as_u64, fx.layout_of(fx.tcx.types.u64));
let int1 = crate::num::codegen_checked_int_binop(fx, bin_op, c, cb_in_as_u64);
let clif_ty = fx.clif_type(a.layout().ty).unwrap();
let cb_in_as_int = fx.bcx.ins().uextend(clif_ty, cb_in);
let cb_in_as_int = CValue::by_val(cb_in_as_int, fx.layout_of(a.layout().ty));
let int1 = crate::num::codegen_checked_int_binop(fx, bin_op, c, cb_in_as_int);
let (c, cb1) = int1.load_scalar_pair(fx);

// carry0 | carry1 -> carry or borrow respectively
let cb_out = fx.bcx.ins().bor(cb0, cb1);

let layout = fx.layout_of(fx.tcx.mk_tup(&[fx.tcx.types.u8, fx.tcx.types.u64]));
let val = CValue::by_val_pair(cb_out, c, layout);
ret.write_cvalue(fx, val);
(cb_out, c)
}
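// Worked example of the chain above (sketch), for llvm.x86.addcarry.64
// with c_in = 1, a = u64::MAX, b = 0:
//   step 1: a + b    -> c = u64::MAX, cb0 = 0
//   step 2: c + c_in -> c = 0,        cb1 = 1
//   cb_out = cb0 | cb1 = 1
// At most one of the two additions can overflow, so OR-ing the two flags
// loses nothing.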
3 changes: 2 additions & 1 deletion src/intrinsics/mod.rs
@@ -647,12 +647,13 @@ fn codegen_regular_intrinsic_call<'tcx>(
let val = CValue::by_ref(Pointer::new(ptr.load_scalar(fx)), inner_layout);
ret.write_cvalue(fx, val);
}
sym::volatile_store | sym::unaligned_volatile_store => {
sym::volatile_store | sym::unaligned_volatile_store | sym::nontemporal_store => {
intrinsic_args!(fx, args => (ptr, val); intrinsic);
let ptr = ptr.load_scalar(fx);

// Cranelift treats stores as volatile by default
// FIXME correctly handle unaligned_volatile_store
// FIXME actually do nontemporal stores if requested
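// Sketch of the intent: stdarch's streaming stores (e.g. `_mm_stream_ps`)
// are built on `nontemporal_store`; lowering them to a plain store is
// conservative but sound, since the non-temporal hint only affects caching
// and write-combining, not the stored value.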
let dest = CPlace::for_ptr(Pointer::new(ptr), val.layout());
dest.write_cvalue(fx, val);
}
Expand Down