Merge pull request #1495 from folkertdev/add-llvm-sse2-cvtps2dq

bjorn3 · web-flow · commit c511676a621b · 2024-06-06T23:25:16.000+02:00
add `llvm.x86.sse2.cvtps2dq`
diff --git a/example/std_example.rs b/example/std_example.rs
@@ -251,6 +251,9 @@ unsafe fn test_simd() {
     test_mm_add_epi8();
     test_mm_add_pd();
     test_mm_cvtepi8_epi16();
+    #[cfg(not(jit))]
+    test_mm_cvtps_epi32();
+    test_mm_cvttps_epi32();
     test_mm_cvtsi128_si64();
 
     test_mm_extract_epi8();
@@ -476,6 +479,41 @@ unsafe fn test_mm256_permutevar8x32_epi32() {
     assert_eq_m256i(r, e);
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+#[cfg(not(jit))]
+unsafe fn test_mm_cvtps_epi32() {
+    let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
+
+    let float_vec = _mm_loadu_ps(floats.as_ptr());
+    let int_vec = _mm_cvtps_epi32(float_vec);
+
+    let mut ints: [i32; 4] = [0; 4];
+    _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec);
+
+    // this is very different from `floats.map(|f| f as i32)`!
+    let expected_ints: [i32; 4] = [2, -2, i32::MIN, i32::MIN];
+
+    assert_eq!(ints, expected_ints);
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn test_mm_cvttps_epi32() {
+    let floats: [f32; 4] = [1.5, -2.5, i32::MAX as f32 + 1.0, f32::NAN];
+
+    let float_vec = _mm_loadu_ps(floats.as_ptr());
+    let int_vec = _mm_cvttps_epi32(float_vec);
+
+    let mut ints: [i32; 4] = [0; 4];
+    _mm_storeu_si128(ints.as_mut_ptr() as *mut __m128i, int_vec);
+
+    // this is very different from `floats.map(|f| f as i32)`!
+    let expected_ints: [i32; 4] = [1, -2, i32::MIN, i32::MIN];
+
+    assert_eq!(ints, expected_ints);
+}
+
 fn test_checked_mul() {
     let u: Option<u8> = u8::from_str_radix("1000", 10).ok();
     assert_eq!(u, None);
diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs
@@ -459,11 +459,20 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             intrinsic_args!(fx, args => (a); intrinsic);
             let a = a.load_scalar(fx);
 
+            let value = fx.bcx.ins().x86_cvtt2dq(types::I32X4, a);
+            let cvalue = CValue::by_val(value, ret.layout());
+            ret.write_cvalue(fx, cvalue);
+        }
+        "llvm.x86.sse2.cvtps2dq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
+            intrinsic_args!(fx, args => (a); intrinsic);
+            let a = a.load_scalar(fx);
+
             // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned
             // into 0x80000000 for which Cranelift doesn't have a native instruction.
             codegen_inline_asm_inner(
                 fx,
-                &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
+                &[InlineAsmTemplatePiece::String(format!("cvtps2dq xmm0, xmm0"))],
                 &[CInlineAsmOperand::InOut {
                     reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
                     _late: true,