Skip to content

Commit 813f8b4

Browse files
committed
Use inline asm for _mm_clmulepi64_si128
This is a lot more compact and significantly faster than the previous bit-by-bit emulation of the carry-less multiply.
1 parent dc60334 commit 813f8b4

File tree

1 file changed

+30
-55
lines changed

1 file changed

+30
-55
lines changed

src/intrinsics/llvm_x86.rs

+30-55
Original file line numberDiff line numberDiff line change
@@ -719,66 +719,41 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
719719
}
720720

721721
"llvm.x86.pclmulqdq" => {
722-
// FIXME use inline asm
723722
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
724-
intrinsic_args!(fx, args => (a, b, imm8); intrinsic);
723+
intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);
725724

726-
assert_eq!(a.layout(), b.layout());
727-
let layout = a.layout();
728-
729-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
730-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
731-
assert_eq!(lane_ty, fx.tcx.types.i64);
732-
assert_eq!(ret_lane_ty, fx.tcx.types.i64);
733-
assert_eq!(lane_count, 2);
734-
assert_eq!(ret_lane_count, 2);
735-
736-
let imm8 = imm8.load_scalar(fx);
737-
738-
let control0 = fx.bcx.ins().band_imm(imm8, 0b0000_0001);
739-
let a_lane0 = a.value_lane(fx, 0).load_scalar(fx);
740-
let a_lane1 = a.value_lane(fx, 1).load_scalar(fx);
741-
let temp1 = fx.bcx.ins().select(control0, a_lane1, a_lane0);
725+
let a = a.load_scalar(fx);
726+
let b = b.load_scalar(fx);
742727

743-
let control4 = fx.bcx.ins().band_imm(imm8, 0b0001_0000);
744-
let b_lane0 = b.value_lane(fx, 0).load_scalar(fx);
745-
let b_lane1 = b.value_lane(fx, 1).load_scalar(fx);
746-
let temp2 = fx.bcx.ins().select(control4, b_lane1, b_lane0);
728+
let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[2])
729+
{
730+
imm8
731+
} else {
732+
fx.tcx.sess.span_fatal(
733+
span,
734+
"Index argument for `_mm_clmulepi64_si128` is not a constant",
735+
);
736+
};
747737

748-
fn extract_bit(fx: &mut FunctionCx<'_, '_, '_>, val: Value, bit: i64) -> Value {
749-
let tmp = fx.bcx.ins().ushr_imm(val, bit);
750-
fx.bcx.ins().band_imm(tmp, 1)
751-
}
738+
let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
752739

753-
let mut res1 = fx.bcx.ins().iconst(types::I64, 0);
754-
for i in 0..=63 {
755-
let x = extract_bit(fx, temp1, 0);
756-
let y = extract_bit(fx, temp2, i);
757-
let mut temp = fx.bcx.ins().band(x, y);
758-
for j in 1..=i {
759-
let x = extract_bit(fx, temp1, j);
760-
let y = extract_bit(fx, temp2, i - j);
761-
let z = fx.bcx.ins().band(x, y);
762-
temp = fx.bcx.ins().bxor(temp, z);
763-
}
764-
let temp = fx.bcx.ins().ishl_imm(temp, i);
765-
res1 = fx.bcx.ins().bor(res1, temp);
766-
}
767-
ret.place_lane(fx, 0).to_ptr().store(fx, res1, MemFlags::trusted());
768-
769-
let mut res2 = fx.bcx.ins().iconst(types::I64, 0);
770-
for i in 64..=127 {
771-
let mut temp = fx.bcx.ins().iconst(types::I64, 0);
772-
for j in i - 63..=63 {
773-
let x = extract_bit(fx, temp1, j);
774-
let y = extract_bit(fx, temp2, i - j);
775-
let z = fx.bcx.ins().band(x, y);
776-
temp = fx.bcx.ins().bxor(temp, z);
777-
}
778-
let temp = fx.bcx.ins().ishl_imm(temp, i);
779-
res2 = fx.bcx.ins().bor(res2, temp);
780-
}
781-
ret.place_lane(fx, 1).to_ptr().store(fx, res2, MemFlags::trusted());
740+
codegen_inline_asm_inner(
741+
fx,
742+
&[InlineAsmTemplatePiece::String(format!("pclmulqdq xmm0, xmm1, {imm8}"))],
743+
&[
744+
CInlineAsmOperand::InOut {
745+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
746+
_late: true,
747+
in_value: a,
748+
out_place: Some(ret),
749+
},
750+
CInlineAsmOperand::In {
751+
reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
752+
value: b,
753+
},
754+
],
755+
InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
756+
);
782757
}
783758

784759
"llvm.x86.aesni.aeskeygenassist" => {

0 commit comments

Comments
 (0)