
Commit 141a23d

nominolo authored and alexcrichton committed
Add SSE _mm_store* intrinsics and _mm_move_ss (rust-lang#115)
* Add _mm_store* intrinsics and _mm_move_ss
* Fix Win64 & Linux i586 failures
* Make i586 codegen happy without breaking x86_64
1 parent 2dbe8d0 commit 141a23d

File tree: 1 file changed, +312 -0 lines changed

src/x86/sse.rs

@@ -971,6 +971,169 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 {
    simd_shuffle4(a, a, [3, 2, 1, 0])
}

/// Store the upper half of `a` (64 bits) into memory.
///
/// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may
/// choose to generate an equivalent sequence of other instructions.
#[inline(always)]
#[target_feature = "+sse"]
// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS; that's
// fine.
// On i586 (no SSE2) it just generates plain MOV instructions.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
           assert_instr(movhpd))]
pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) {
    if cfg!(target_arch = "x86") {
        // If this is an `f64x2` then on i586, LLVM generates fldl & fstpl,
        // which is just silly.
        let a64: u64x2 = mem::transmute(a);
        let a_hi = a64.extract(1);
        *p = mem::transmute(a_hi);
    } else { // target_arch = "x86_64"
        // If this is a `u64x2`, LLVM generates a pshufd + movq, but we really
        // want a MOVHPD or MOVHPS here.
        let a64: f64x2 = mem::transmute(a);
        let a_hi = a64.extract(1);
        *p = mem::transmute(a_hi);
    }
}
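
// Editor's illustrative sketch, not part of the committed diff: splitting a
// vector into its two 64-bit halves with `_mm_storel_pi`/`_mm_storeh_pi`.
// The helper name `split_halves_example` is made up for illustration only;
// `f32x4` is the crate-local vector type used throughout this file.
unsafe fn split_halves_example(a: f32x4) -> (u64, u64) {
    let mut lo = 0u64;
    let mut hi = 0u64;
    _mm_storel_pi(&mut lo, a); // bit pattern of lanes 0 and 1
    _mm_storeh_pi(&mut hi, a); // bit pattern of lanes 2 and 3
    (lo, hi)
}
// For `a = f32x4::new(1.0, 2.0, 3.0, 4.0)`, `lo` holds the bits of (1.0, 2.0)
// and `hi` the bits of (3.0, 4.0).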

/// Store the lower half of `a` (64 bits) into memory.
///
/// This intrinsic corresponds to the `MOVQ` instruction. The compiler may
/// choose to generate an equivalent sequence of other instructions.
#[inline(always)]
#[target_feature = "+sse"]
// On i586 the codegen just generates plain MOVs. No need to test for that.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
               not(target_family = "windows")),
           assert_instr(movlps))]
// Win64 passes `a` by reference, which causes it to generate two 64-bit moves.
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
               target_family = "windows"),
           assert_instr(movsd))]
pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) {
    if cfg!(target_arch = "x86") {
        // Same as for `_mm_storeh_pi`: i586 codegen would use the floating
        // point stack.
        let a64: u64x2 = mem::transmute(a);
        let a_lo = a64.extract(0);
        *p = mem::transmute(a_lo);
    } else { // target_arch = "x86_64"
        let a64: f64x2 = mem::transmute(a);
        let a_lo = a64.extract(0);
        *p = mem::transmute(a_lo);
    }
}

/// Store the lowest 32-bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movss))]
pub unsafe fn _mm_store_ss(p: *mut f32, a: f32x4) {
    *p = a.extract(0);
}
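
// Editor's illustrative sketch, not part of the committed diff: writing only
// the lowest lane into an existing buffer. The names here are made up.
unsafe fn store_ss_example() {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut buf = [0.0f32; 4];
    _mm_store_ss(buf.as_mut_ptr(), a);
    // `buf` is now [1.0, 0.0, 0.0, 0.0]; the other lanes of `a` are untouched.
}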

/// Store the lowest 32-bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.offset(1) = x;
/// *p.offset(2) = x;
/// *p.offset(3) = x;
/// ```
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: f32x4) {
    let b: f32x4 = simd_shuffle4(a, a, [0, 0, 0, 0]);
    *(p as *mut f32x4) = b;
}

/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: f32x4) {
    _mm_store1_ps(p, a);
}

/// Store four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_store_ps(p: *mut f32, a: f32x4) {
    *(p as *mut f32x4) = a;
}
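
// Editor's illustrative sketch, not part of the committed diff: in present-day
// Rust, one way to guarantee the 16-byte alignment `_mm_store_ps` requires is
// to wrap the destination in a `#[repr(align(16))]` struct. The `Aligned16`
// type and `store_ps_example` helper are made up for illustration only.
#[repr(align(16))]
struct Aligned16([f32; 4]);

unsafe fn store_ps_example(a: f32x4) -> [f32; 4] {
    let mut out = Aligned16([0.0; 4]);
    // The pointer is 16-byte aligned by construction, so MOVAPS cannot fault.
    _mm_store_ps(out.0.as_mut_ptr(), a);
    out.0
}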

/// Store four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
/// faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movups))]
pub unsafe fn _mm_storeu_ps(p: *mut f32, a: f32x4) {
    ptr::copy_nonoverlapping(
        &a as *const f32x4 as *const u8,
        p as *mut u8,
        mem::size_of::<f32x4>());
}
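
// Editor's illustrative sketch, not part of the committed diff: because
// `_mm_storeu_ps` has no alignment requirement, it can write to any offset of
// an `f32` buffer. The `write_at` helper and its parameters are made up.
unsafe fn write_at(dst: &mut [f32; 8], offset: usize, a: f32x4) {
    // Caller must ensure `offset + 4 <= dst.len()`; no alignment is needed.
    _mm_storeu_ps(dst.as_mut_ptr().offset(offset as isize), a);
}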

/// Store four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.offset(1) = a.extract(2);
/// *p.offset(2) = a.extract(1);
/// *p.offset(3) = a.extract(0);
/// ```
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movaps))]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: f32x4) {
    let b: f32x4 = simd_shuffle4(a, a, [3, 2, 1, 0]);
    *(p as *mut f32x4) = b;
}

/// Return a `f32x4` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words, for any `a` and `b`:
///
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
#[inline(always)]
#[target_feature = "+sse"]
#[cfg_attr(test, assert_instr(movss))]
pub unsafe fn _mm_move_ss(a: f32x4, b: f32x4) -> f32x4 {
    simd_shuffle4(a, b, [4, 1, 2, 3])
}

/// Perform a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///

@@ -2526,6 +2689,155 @@ mod tests {
        assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0) + f32x4::splat(fixup));
    }

    #[simd_test = "sse"]
    unsafe fn _mm_storeh_pi() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        sse::_mm_storeh_pi(vals.as_mut_ptr() as *mut f32 as *mut u64, a);

        assert_eq!(vals[0], 3.0);
        assert_eq!(vals[1], 4.0);
        assert_eq!(vals[2], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_storel_pi() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        sse::_mm_storel_pi(vals.as_mut_ptr() as *mut f32 as *mut u64, a);

        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
        assert_eq!(vals[2], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_store_ss() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        sse::_mm_store_ss(vals.as_mut_ptr().offset(1), a);

        assert_eq!(vals[0], 0.0);
        assert_eq!(vals[1], 1.0);
        assert_eq!(vals[2], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_store1_ps() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to a 16-byte boundary by skipping `ofs` leading floats.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.offset(ofs as isize);
        }

        sse::_mm_store1_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_store_ps() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to a 16-byte boundary by skipping `ofs` leading floats.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.offset(ofs as isize);
        }

        sse::_mm_store_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_storer_ps() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to a 16-byte boundary by skipping `ofs` leading floats.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.offset(ofs as isize);
        }

        sse::_mm_storer_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_storeu_ps() {
        let mut vals = [0.0f32; 8];
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is *not* aligned to a 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.offset(1);
        }

        sse::_mm_storeu_ps(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_move_ss() {
        let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = f32x4::new(5.0, 6.0, 7.0, 8.0);

        let r1 = sse::_mm_move_ss(a, b);
        let r2 = a.replace(0, b.extract(0));

        let e = f32x4::new(5.0, 2.0, 3.0, 4.0);
        assert_eq!(e, r1);
        assert_eq!(e, r2);
    }

    #[simd_test = "sse"]
    unsafe fn _mm_movemask_ps() {
        let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0));
