@@ -971,6 +971,169 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 {
    simd_shuffle4(a, a, [3, 2, 1, 0])
}

+/// Store the upper half of `a` (64 bits) into memory.
+///
+/// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may
+/// choose to generate an equivalent sequence of other instructions.
+#[inline(always)]
+#[target_feature = "+sse"]
+// On i686 and up LLVM actually generates MOVHPD instead of MOVHPS; that's fine.
+// On i586 (no SSE2) it just generates plain MOV instructions.
+#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")),
+           assert_instr(movhpd))]
+pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) {
+    if cfg!(target_arch = "x86") {
+        // If this were an `f64x2`, LLVM on i586 would generate fldl & fstpl,
+        // which is just silly.
+        let a64: u64x2 = mem::transmute(a);
+        let a_hi = a64.extract(1);
+        *p = mem::transmute(a_hi);
+    } else { // target_arch = "x86_64"
+        // If this were a `u64x2`, LLVM would generate a pshufd + movq, but we
+        // really want a MOVHPD or MOVHPS here.
+        let a64: f64x2 = mem::transmute(a);
+        let a_hi = a64.extract(1);
+        *p = mem::transmute(a_hi);
+    }
+}
+
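For illustration, a hedged usage sketch of `_mm_storeh_pi` (not part of the commit), assuming the crate-internal `sse` module and `f32x4` type are in scope as in the tests further down:

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut hi = [0.0f32; 2];
    // The upper 64 bits of `a` are lanes 2 and 3.
    sse::_mm_storeh_pi(hi.as_mut_ptr() as *mut u64, a);
    assert_eq!(hi, [3.0, 4.0]);
}
```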
+/// Store the lower half of `a` (64 bits) into memory.
+///
+/// This intrinsic corresponds to the `MOVQ` instruction. The compiler may
+/// choose to generate an equivalent sequence of other instructions.
+#[inline(always)]
+#[target_feature = "+sse"]
+// On i586 the codegen just generates plain MOVs. No need to test for that.
+#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
+               not(target_family = "windows")),
+           assert_instr(movlps))]
+// Win64 passes `a` by reference, which causes it to generate two 64-bit moves.
+#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"),
+               target_family = "windows"),
+           assert_instr(movsd))]
+pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) {
+    if cfg!(target_arch = "x86") {
+        // Same as for _mm_storeh_pi: the i586 codegen would go through the
+        // floating point stack.
+        let a64: u64x2 = mem::transmute(a);
+        let a_lo = a64.extract(0);
+        *p = mem::transmute(a_lo);
+    } else { // target_arch = "x86_64"
+        let a64: f64x2 = mem::transmute(a);
+        let a_lo = a64.extract(0);
+        *p = mem::transmute(a_lo);
+    }
+}
+
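A hedged sketch of the two half-stores used together (not part of the commit, same scope assumptions as above):

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut halves = [0.0f32; 4];
    // Write the low half first, then the high half right after it.
    sse::_mm_storel_pi(halves.as_mut_ptr() as *mut u64, a);
    sse::_mm_storeh_pi(halves.as_mut_ptr().offset(2) as *mut u64, a);
    assert_eq!(halves, [1.0, 2.0, 3.0, 4.0]);
}
```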
+/// Store the lowest 32-bit float of `a` into memory.
+///
+/// This intrinsic corresponds to the `MOVSS` instruction.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movss))]
+pub unsafe fn _mm_store_ss(p: *mut f32, a: f32x4) {
+    *p = a.extract(0);
+}
+
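A hedged single-lane sketch (not part of the commit, same scope assumptions):

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut out = [0.0f32; 4];
    // Only out[2] is written; the other elements are left untouched.
    sse::_mm_store_ss(out.as_mut_ptr().offset(2), a);
    assert_eq!(out, [0.0, 0.0, 1.0, 0.0]);
}
```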
+/// Store the lowest 32-bit float of `a` repeated four times into *aligned*
+/// memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// let x = a.extract(0);
+/// *p = x;
+/// *p.offset(1) = x;
+/// *p.offset(2) = x;
+/// *p.offset(3) = x;
+/// ```
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_store1_ps(p: *mut f32, a: f32x4) {
+    let b: f32x4 = simd_shuffle4(a, a, [0, 0, 0, 0]);
+    *(p as *mut f32x4) = b;
+}
+
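A hedged sketch of satisfying the alignment requirement (not part of the commit); it assumes `f32x4` is 16-byte aligned, as 128-bit SIMD types typically are, and reuses one as the destination buffer:

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    // `f32x4` itself provides 16-byte-aligned storage.
    let mut buf = f32x4::splat(0.0);
    sse::_mm_store1_ps(&mut buf as *mut f32x4 as *mut f32, a);
    assert_eq!(buf, f32x4::splat(1.0));
}
```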
+/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html).
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_store_ps1(p: *mut f32, a: f32x4) {
+    _mm_store1_ps(p, a);
+}
+
+/// Store four 32-bit floats into *aligned* memory.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
+/// memory.
+///
+/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_store_ps(p: *mut f32, a: f32x4) {
+    *(p as *mut f32x4) = a;
+}
+
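A hedged sketch of the aligned store (not part of the commit), again assuming a 16-byte-aligned `f32x4` as backing storage:

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut buf = f32x4::splat(0.0); // 16-byte-aligned destination
    let p = &mut buf as *mut f32x4 as *mut f32;
    sse::_mm_store_ps(p, a);
    // The lowest lane lands at the lowest address.
    assert_eq!(*p, 1.0);
    assert_eq!(*p.offset(3), 4.0);
}
```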
+/// Store four 32-bit floats into memory. There are no restrictions on memory
+/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
+/// faster.
+///
+/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movups))]
+pub unsafe fn _mm_storeu_ps(p: *mut f32, a: f32x4) {
+    ptr::copy_nonoverlapping(
+        &a as *const f32x4 as *const u8,
+        p as *mut u8,
+        mem::size_of::<f32x4>());
+}
+
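A hedged sketch of the unaligned store (not part of the commit, same scope assumptions):

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut out = [0.0f32; 5];
    // No alignment requirement: storing at element offset 1 is fine here.
    sse::_mm_storeu_ps(out.as_mut_ptr().offset(1), a);
    assert_eq!(out, [0.0, 1.0, 2.0, 3.0, 4.0]);
}
```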
+/// Store four 32-bit floats into *aligned* memory in reverse order.
+///
+/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
+/// protection fault will be triggered (fatal program crash).
+///
+/// Functionally equivalent to the following code sequence (assuming `p`
+/// satisfies the alignment restrictions):
+///
+/// ```text
+/// *p = a.extract(3);
+/// *p.offset(1) = a.extract(2);
+/// *p.offset(2) = a.extract(1);
+/// *p.offset(3) = a.extract(0);
+/// ```
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movaps))]
+pub unsafe fn _mm_storer_ps(p: *mut f32, a: f32x4) {
+    let b: f32x4 = simd_shuffle4(a, a, [3, 2, 1, 0]);
+    *(p as *mut f32x4) = b;
+}
+
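A hedged sketch of the reversed store (not part of the commit, same alignment assumption as above):

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let mut buf = f32x4::splat(0.0); // 16-byte-aligned destination
    sse::_mm_storer_ps(&mut buf as *mut f32x4 as *mut f32, a);
    // Lanes come out reversed in memory.
    assert_eq!(buf, f32x4::new(4.0, 3.0, 2.0, 1.0));
}
```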
+/// Return a `f32x4` with the first component from `b` and the remaining
+/// components from `a`.
+///
+/// In other words, for any `a` and `b`:
+/// ```text
+/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
+/// ```
+#[inline(always)]
+#[target_feature = "+sse"]
+#[cfg_attr(test, assert_instr(movss))]
+pub unsafe fn _mm_move_ss(a: f32x4, b: f32x4) -> f32x4 {
+    simd_shuffle4(a, b, [4, 1, 2, 3])
+}
+
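A hedged sketch of the identity stated in the doc comment (not part of the commit, same scope assumptions):

```rust
unsafe {
    let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
    let b = f32x4::new(5.0, 6.0, 7.0, 8.0);
    // Only lane 0 comes from `b`; lanes 1..4 keep the values from `a`.
    assert_eq!(sse::_mm_move_ss(a, b), a.replace(0, b.extract(0)));
    assert_eq!(sse::_mm_move_ss(a, b), f32x4::new(5.0, 2.0, 3.0, 4.0));
}
```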
/// Perform a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
@@ -2526,6 +2689,155 @@ mod tests {
        assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0) + f32x4::splat(fixup));
    }

+ #[ simd_test = "sse" ]
2693
+ unsafe fn _mm_storeh_pi ( ) {
2694
+ let mut vals = [ 0.0f32 ; 8 ] ;
2695
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2696
+ sse:: _mm_storeh_pi ( vals. as_mut_ptr ( ) as * mut f32 as * mut u64 , a) ;
2697
+
2698
+ assert_eq ! ( vals[ 0 ] , 3.0 ) ;
2699
+ assert_eq ! ( vals[ 1 ] , 4.0 ) ;
2700
+ assert_eq ! ( vals[ 2 ] , 0.0 ) ;
2701
+ }
2702
+
2703
+ #[ simd_test = "sse" ]
2704
+ unsafe fn _mm_storel_pi ( ) {
2705
+ let mut vals = [ 0.0f32 ; 8 ] ;
2706
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2707
+ sse:: _mm_storel_pi ( vals. as_mut_ptr ( ) as * mut f32 as * mut u64 , a) ;
2708
+
2709
+ assert_eq ! ( vals[ 0 ] , 1.0 ) ;
2710
+ assert_eq ! ( vals[ 1 ] , 2.0 ) ;
2711
+ assert_eq ! ( vals[ 2 ] , 0.0 ) ;
2712
+ }
2713
+
2714
+ #[ simd_test = "sse" ]
2715
+ unsafe fn _mm_store_ss ( ) {
2716
+ let mut vals = [ 0.0f32 ; 8 ] ;
2717
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2718
+ sse:: _mm_store_ss ( vals. as_mut_ptr ( ) . offset ( 1 ) , a) ;
2719
+
2720
+ assert_eq ! ( vals[ 0 ] , 0.0 ) ;
2721
+ assert_eq ! ( vals[ 1 ] , 1.0 ) ;
2722
+ assert_eq ! ( vals[ 2 ] , 0.0 ) ;
2723
+ }
2724
+
2725
+ #[ simd_test = "sse" ]
2726
+ unsafe fn _mm_store1_ps ( ) {
2727
+ let mut vals = [ 0.0f32 ; 8 ] ;
2728
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2729
+
2730
+ let mut ofs = 0 ;
2731
+ let mut p = vals. as_mut_ptr ( ) ;
2732
+
2733
+ if ( p as usize ) & 0xf != 0 {
2734
+ ofs = ( 16 - ( p as usize ) & 0xf ) >> 2 ;
2735
+ p = p. offset ( ofs as isize ) ;
2736
+ }
2737
+
2738
+ sse:: _mm_store1_ps ( p, * black_box ( & a) ) ;
2739
+
2740
+ if ofs > 0 {
2741
+ assert_eq ! ( vals[ ofs - 1 ] , 0.0 ) ;
2742
+ }
2743
+ assert_eq ! ( vals[ ofs + 0 ] , 1.0 ) ;
2744
+ assert_eq ! ( vals[ ofs + 1 ] , 1.0 ) ;
2745
+ assert_eq ! ( vals[ ofs + 2 ] , 1.0 ) ;
2746
+ assert_eq ! ( vals[ ofs + 3 ] , 1.0 ) ;
2747
+ assert_eq ! ( vals[ ofs + 4 ] , 0.0 ) ;
2748
+ }
2749
+
2750
+ #[ simd_test = "sse" ]
2751
+ unsafe fn _mm_store_ps ( ) {
2752
+ let mut vals = [ 0.0f32 ; 8 ] ;
2753
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2754
+
2755
+ let mut ofs = 0 ;
2756
+ let mut p = vals. as_mut_ptr ( ) ;
2757
+
2758
+ // Align p to 16-byte boundary
2759
+ if ( p as usize ) & 0xf != 0 {
2760
+ ofs = ( 16 - ( p as usize ) & 0xf ) >> 2 ;
2761
+ p = p. offset ( ofs as isize ) ;
2762
+ }
2763
+
2764
+ sse:: _mm_store_ps ( p, * black_box ( & a) ) ;
2765
+
2766
+ if ofs > 0 {
2767
+ assert_eq ! ( vals[ ofs - 1 ] , 0.0 ) ;
2768
+ }
2769
+ assert_eq ! ( vals[ ofs + 0 ] , 1.0 ) ;
2770
+ assert_eq ! ( vals[ ofs + 1 ] , 2.0 ) ;
2771
+ assert_eq ! ( vals[ ofs + 2 ] , 3.0 ) ;
2772
+ assert_eq ! ( vals[ ofs + 3 ] , 4.0 ) ;
2773
+ assert_eq ! ( vals[ ofs + 4 ] , 0.0 ) ;
2774
+ }
2775
+
2776
+ #[ simd_test = "sse" ]
2777
+ unsafe fn _mm_storer_ps ( ) {
2778
+ let mut vals = [ 0.0f32 ; 8 ] ;
2779
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2780
+
2781
+ let mut ofs = 0 ;
2782
+ let mut p = vals. as_mut_ptr ( ) ;
2783
+
2784
+ // Align p to 16-byte boundary
2785
+ if ( p as usize ) & 0xf != 0 {
2786
+ ofs = ( 16 - ( p as usize ) & 0xf ) >> 2 ;
2787
+ p = p. offset ( ofs as isize ) ;
2788
+ }
2789
+
2790
+ sse:: _mm_storer_ps ( p, * black_box ( & a) ) ;
2791
+
2792
+ if ofs > 0 {
2793
+ assert_eq ! ( vals[ ofs - 1 ] , 0.0 ) ;
2794
+ }
2795
+ assert_eq ! ( vals[ ofs + 0 ] , 4.0 ) ;
2796
+ assert_eq ! ( vals[ ofs + 1 ] , 3.0 ) ;
2797
+ assert_eq ! ( vals[ ofs + 2 ] , 2.0 ) ;
2798
+ assert_eq ! ( vals[ ofs + 3 ] , 1.0 ) ;
2799
+ assert_eq ! ( vals[ ofs + 4 ] , 0.0 ) ;
2800
+ }
2801
+
2802
+ #[ simd_test = "sse" ]
2803
+ unsafe fn _mm_storeu_ps ( ) {
2804
+ let mut vals = [ 0.0f32 ; 8 ] ;
2805
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2806
+
2807
+ let mut ofs = 0 ;
2808
+ let mut p = vals. as_mut_ptr ( ) ;
2809
+
2810
+ // Make sure p is *not* aligned to 16-byte boundary
2811
+ if ( p as usize ) & 0xf == 0 {
2812
+ ofs = 1 ;
2813
+ p = p. offset ( 1 ) ;
2814
+ }
2815
+
2816
+ sse:: _mm_storeu_ps ( p, * black_box ( & a) ) ;
2817
+
2818
+ if ofs > 0 {
2819
+ assert_eq ! ( vals[ ofs - 1 ] , 0.0 ) ;
2820
+ }
2821
+ assert_eq ! ( vals[ ofs + 0 ] , 1.0 ) ;
2822
+ assert_eq ! ( vals[ ofs + 1 ] , 2.0 ) ;
2823
+ assert_eq ! ( vals[ ofs + 2 ] , 3.0 ) ;
2824
+ assert_eq ! ( vals[ ofs + 3 ] , 4.0 ) ;
2825
+ assert_eq ! ( vals[ ofs + 4 ] , 0.0 ) ;
2826
+ }
2827
+
2828
+ #[ simd_test = "sse" ]
2829
+ unsafe fn _mm_move_ss ( ) {
2830
+ let a = f32x4:: new ( 1.0 , 2.0 , 3.0 , 4.0 ) ;
2831
+ let b = f32x4:: new ( 5.0 , 6.0 , 7.0 , 8.0 ) ;
2832
+
2833
+ let r1 = sse:: _mm_move_ss ( a, b) ;
2834
+ let r2 = a. replace ( 0 , b. extract ( 0 ) ) ;
2835
+
2836
+ let e = f32x4:: new ( 5.0 , 2.0 , 3.0 , 4.0 ) ;
2837
+ assert_eq ! ( e, r1) ;
2838
+ assert_eq ! ( e, r2) ;
2839
+ }
2840
+
2529
2841
#[ simd_test = "sse" ]
2530
2842
unsafe fn _mm_movemask_ps ( ) {
2531
2843
let r = sse:: _mm_movemask_ps ( f32x4:: new ( -1.0 , 5.0 , -5.0 , 0.0 ) ) ;