Sse (rust-lang#225)

gwenn · alexcrichton · commit 22210c2f5eb1 · 2017-12-09T11:20:44.000-06:00
* sse: _mm_cvt_pi2ps

* sse: _mm_extract_pi16

* sse: _mm_insert_pi16

* sse: _mm_movemask_pi8

* sse: _mm_shuffle_pi16

* sse: fix _mm_insert_pi16 and _mm_extract_pi16

* sse: add tests
diff --git a/coresimd/src/x86/i686/sse.rs b/coresimd/src/x86/i686/sse.rs
@@ -12,6 +12,16 @@ use stdsimd_test::assert_instr;
 
 #[allow(improper_ctypes)]
 extern "C" {
+    #[link_name = "llvm.x86.sse.cvtpi2ps"]
+    fn cvtpi2ps(a: f32x4, b: __m64) -> f32x4;
+    #[link_name = "llvm.x86.mmx.pextr.w"]
+    fn pextrw(a: __m64, imm8: i32) -> i32;
+    #[link_name = "llvm.x86.mmx.pinsr.w"]
+    fn pinsrw(a: __m64, d: i32, imm8: i32) -> __m64;
+    #[link_name = "llvm.x86.mmx.pmovmskb"]
+    fn pmovmskb(a: __m64) -> i32;
+    #[link_name = "llvm.x86.sse.pshuf.w"]
+    fn pshufw(a: __m64, imm8: i8) -> __m64;
     #[link_name = "llvm.x86.mmx.pmaxs.w"]
     fn pmaxsw(a: __m64, b: __m64) -> __m64;
     #[link_name = "llvm.x86.mmx.pmaxu.b"]
@@ -98,6 +108,64 @@ pub unsafe fn _m_pminub(a: u8x8, b: u8x8) -> u8x8 {
     _mm_min_pu8(a, b)
 }
 
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
+/// floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(cvtpi2ps))]
+pub unsafe fn _mm_cvt_pi2ps(a: f32x4, b: i32x2) -> f32x4 {
+    cvtpi2ps(a, mem::transmute(b))
+}
+
+/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
+/// returns it, as specified by the immediate integer operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pextrw, imm2 = 0))]
+pub unsafe fn _mm_extract_pi16(a: i16x4, imm2: i32) -> i16 {
+    macro_rules! call {
+        ($imm2:expr) => { pextrw(mem::transmute(a), $imm2) as i16 }
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Copies data from the 64-bit vector of [4 x i16] to the destination,
+/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
+/// specified by the immediate operand `n`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pinsrw, imm2 = 0))]
+pub unsafe fn _mm_insert_pi16(a: i16x4, d: i32, imm2: i32) -> i16x4 {
+    macro_rules! call {
+        ($imm2:expr) => { mem::transmute(pinsrw(mem::transmute(a), d, $imm2)) }
+    }
+    constify_imm2!(imm2, call)
+}
+
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create a 16-bit mask value. Zero-extends the value to
+/// 32-bit integer and writes it to the destination.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pmovmskb))]
+pub unsafe fn _mm_movemask_pi8(a: i16x4) -> i32 {
+    pmovmskb(mem::transmute(a))
+}
+
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(pshufw, imm8 = 0))]
+pub unsafe fn _mm_shuffle_pi16(a: i16x4, imm8: i8) -> i16x4 {
+    macro_rules! call {
+        ($imm8:expr) => { mem::transmute(pshufw(mem::transmute(a), $imm8)) }
+    }
+    constify_imm8!(imm8, call)
+}
+
 /// Convert the two lower packed single-precision (32-bit) floating-point
 /// elements in `a` to packed 32-bit integers with truncation.
 #[inline(always)]
@@ -205,6 +273,50 @@ mod tests {
         assert_eq!(r, sse::_m_pminub(a, b));
     }
 
+    #[simd_test = "sse"]
+    unsafe fn _mm_cvt_pi2ps() {
+        let a = f32x4::new(0., 0., 3., 4.);
+        let b = i32x2::new(1, 2);
+        let expected = f32x4::new(1., 2., 3., 4.);
+        let r = sse::_mm_cvt_pi2ps(a, b);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_extract_pi16() {
+        let a = i16x4::new(1, 2, 3, 4);
+        let r = sse::_mm_extract_pi16(a, 0);
+        assert_eq!(r, 1);
+        let r = sse::_mm_extract_pi16(a, 1);
+        assert_eq!(r, 2);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_insert_pi16() {
+        let a = i16x4::new(1, 2, 3, 4);
+        let r = sse::_mm_insert_pi16(a, 0, 0b0);
+        let expected = i16x4::new(0, 2, 3, 4);
+        assert_eq!(r, expected);
+        let r = sse::_mm_insert_pi16(a, 0, 0b10);
+        let expected = i16x4::new(1, 2, 0, 4);
+        assert_eq!(r, expected);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_movemask_pi8() {
+        let a = i16x4::new(0b1000_0000, 0b0100_0000, 0b1000_0000, 0b0100_0000);
+        let r = sse::_mm_movemask_pi8(a);
+        assert_eq!(r, 0b10001);
+    }
+
+    #[simd_test = "sse"]
+    unsafe fn _mm_shuffle_pi16() {
+        let a = i16x4::new(1, 2, 3, 4);
+        let r = sse::_mm_shuffle_pi16(a, 0b00_01_01_11);
+        let expected = i16x4::new(4, 2, 2, 1);
+        assert_eq!(r, expected);
+    }
+
     #[simd_test = "sse"]
     unsafe fn _mm_cvtps_pi32() {
         let a = f32x4::new(1.0, 2.0, 3.0, 4.0);