@@ -4,6 +4,83 @@ use v128::*;
4
4
#[ cfg( test) ]
5
5
use assert_instr:: assert_instr;
6
6
7
+ /// Adds the first component of `a` and `b`, the other components are copied
8
+ /// from `a`.
9
+ #[ inline( always) ]
10
+ #[ target_feature = "+sse" ]
11
+ #[ cfg_attr( test, assert_instr( addss) ) ]
12
+ pub fn _mm_add_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
13
+ unsafe { addss ( a, b) }
14
+ }
15
+
16
+ /// Adds f32x4 vectors.
17
+ #[ inline( always) ]
18
+ #[ target_feature = "+sse" ]
19
+ #[ cfg_attr( test, assert_instr( addps) ) ]
20
+ pub fn _mm_add_ps ( a : f32x4 , b : f32x4 ) -> f32x4 {
21
+ a + b
22
+ }
23
+
24
+ /// Subtracts the first component of `b` from `a`, the other components are
25
+ /// copied from `a`.
26
+ #[ inline( always) ]
27
+ #[ target_feature = "+sse" ]
28
+ #[ cfg_attr( test, assert_instr( subss) ) ]
29
+ pub fn _mm_sub_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
30
+ unsafe { subss ( a, b) }
31
+ }
32
+
33
+ /// Subtracts f32x4 vectors.
34
+ #[ inline( always) ]
35
+ #[ target_feature = "+sse" ]
36
+ #[ cfg_attr( test, assert_instr( subps) ) ]
37
+ pub fn _mm_sub_ps ( a : f32x4 , b : f32x4 ) -> f32x4 {
38
+ a - b
39
+ }
40
+
41
+ /// Multiplies the first component of `a` and `b`, the other components are
42
+ /// copied from `a`.
43
+ #[ inline( always) ]
44
+ #[ target_feature = "+sse" ]
45
+ #[ cfg_attr( test, assert_instr( mulss) ) ]
46
+ pub fn _mm_mul_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
47
+ unsafe { mulss ( a, b) }
48
+ }
49
+
50
+ /// Multiplies f32x4 vectors.
51
+ #[ inline( always) ]
52
+ #[ target_feature = "+sse" ]
53
+ #[ cfg_attr( test, assert_instr( mulps) ) ]
54
+ pub fn _mm_mul_ps ( a : f32x4 , b : f32x4 ) -> f32x4 {
55
+ a * b
56
+ }
57
+
58
+ /// Divides the first component of `b` by `a`, the other components are
59
+ /// copied from `a`.
60
+ #[ inline( always) ]
61
+ #[ target_feature = "+sse" ]
62
+ #[ cfg_attr( test, assert_instr( divss) ) ]
63
+ pub fn _mm_div_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
64
+ unsafe { divss ( a, b) }
65
+ }
66
+
67
+ /// Divides f32x4 vectors.
68
+ #[ inline( always) ]
69
+ #[ target_feature = "+sse" ]
70
+ #[ cfg_attr( test, assert_instr( divps) ) ]
71
+ pub fn _mm_div_ps ( a : f32x4 , b : f32x4 ) -> f32x4 {
72
+ a / b
73
+ }
74
+
75
+ /// Return the square root of the first single-precision (32-bit)
76
+ /// floating-point element in `a`, the other elements are unchanged.
77
+ #[ inline( always) ]
78
+ #[ target_feature = "+sse" ]
79
+ #[ cfg_attr( test, assert_instr( sqrtss) ) ]
80
+ pub fn _mm_sqrt_ss ( a : f32x4 ) -> f32x4 {
81
+ unsafe { sqrtss ( a) }
82
+ }
83
+
7
84
/// Return the square root of packed single-precision (32-bit) floating-point
8
85
/// elements in `a`.
9
86
#[ inline( always) ]
@@ -13,6 +90,15 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
13
90
unsafe { sqrtps ( a) }
14
91
}
15
92
93
+ /// Return the approximate reciprocal of the first single-precision
94
+ /// (32-bit) floating-point element in `a`, the other elements are unchanged.
95
+ #[ inline( always) ]
96
+ #[ target_feature = "+sse" ]
97
+ #[ cfg_attr( test, assert_instr( rcpss) ) ]
98
+ pub fn _mm_rcp_ss ( a : f32x4 ) -> f32x4 {
99
+ unsafe { rcpss ( a) }
100
+ }
101
+
16
102
/// Return the approximate reciprocal of packed single-precision (32-bit)
17
103
/// floating-point elements in `a`.
18
104
#[ inline( always) ]
@@ -22,6 +108,15 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
22
108
unsafe { rcpps ( a) }
23
109
}
24
110
111
+ /// Return the approximate reciprocal square root of the fist single-precision
112
+ /// (32-bit) floating-point elements in `a`, the other elements are unchanged.
113
+ #[ inline( always) ]
114
+ #[ target_feature = "+sse" ]
115
+ #[ cfg_attr( test, assert_instr( rsqrtss) ) ]
116
+ pub fn _mm_rsqrt_ss ( a : f32x4 ) -> f32x4 {
117
+ unsafe { rsqrtss ( a) }
118
+ }
119
+
25
120
/// Return the approximate reciprocal square root of packed single-precision
26
121
/// (32-bit) floating-point elements in `a`.
27
122
#[ inline( always) ]
@@ -31,6 +126,16 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
31
126
unsafe { rsqrtps ( a) }
32
127
}
33
128
129
+ /// Compare the first single-precision (32-bit) floating-point element of `a`
130
+ /// and `b`, and return the minimum value in the first element of the return
131
+ /// value, the other elements are copied from `a`.
132
+ #[ inline( always) ]
133
+ #[ target_feature = "+sse" ]
134
+ #[ cfg_attr( test, assert_instr( minss) ) ]
135
+ pub fn _mm_min_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
136
+ unsafe { minss ( a, b) }
137
+ }
138
+
34
139
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
35
140
/// `b`, and return the corresponding minimum values.
36
141
#[ inline( always) ]
@@ -40,6 +145,16 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
40
145
unsafe { minps ( a, b) }
41
146
}
42
147
148
+ /// Compare the first single-precision (32-bit) floating-point element of `a`
149
+ /// and `b`, and return the maximum value in the first element of the return
150
+ /// value, the other elements are copied from `a`.
151
+ #[ inline( always) ]
152
+ #[ target_feature = "+sse" ]
153
+ #[ cfg_attr( test, assert_instr( maxss) ) ]
154
+ pub fn _mm_max_ss ( a : f32x4 , b : f32x4 ) -> f32x4 {
155
+ unsafe { maxss ( a, b) }
156
+ }
157
+
43
158
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
44
159
/// `b`, and return the corresponding maximum values.
45
160
#[ inline( always) ]
@@ -70,14 +185,32 @@ pub fn _mm_movemask_ps(a: f32x4) -> i32 {
70
185
71
186
#[ allow( improper_ctypes) ]
72
187
extern {
188
+ #[ link_name = "llvm.x86.sse.add.ss" ]
189
+ fn addss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
190
+ #[ link_name = "llvm.x86.sse.sub.ss" ]
191
+ fn subss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
192
+ #[ link_name = "llvm.x86.sse.mul.ss" ]
193
+ fn mulss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
194
+ #[ link_name = "llvm.x86.sse.div.ss" ]
195
+ fn divss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
196
+ #[ link_name = "llvm.x86.sse.sqrt.ss" ]
197
+ fn sqrtss ( a : f32x4 ) -> f32x4 ;
73
198
#[ link_name = "llvm.x86.sse.sqrt.ps" ]
74
199
fn sqrtps ( a : f32x4 ) -> f32x4 ;
200
+ #[ link_name = "llvm.x86.sse.rcp.ss" ]
201
+ fn rcpss ( a : f32x4 ) -> f32x4 ;
75
202
#[ link_name = "llvm.x86.sse.rcp.ps" ]
76
203
fn rcpps ( a : f32x4 ) -> f32x4 ;
204
+ #[ link_name = "llvm.x86.sse.rsqrt.ss" ]
205
+ fn rsqrtss ( a : f32x4 ) -> f32x4 ;
77
206
#[ link_name = "llvm.x86.sse.rsqrt.ps" ]
78
207
fn rsqrtps ( a : f32x4 ) -> f32x4 ;
208
+ #[ link_name = "llvm.x86.sse.min.ss" ]
209
+ fn minss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
79
210
#[ link_name = "llvm.x86.sse.min.ps" ]
80
211
fn minps ( a : f32x4 , b : f32x4 ) -> f32x4 ;
212
+ #[ link_name = "llvm.x86.sse.max.ss" ]
213
+ fn maxss ( a : f32x4 , b : f32x4 ) -> f32x4 ;
81
214
#[ link_name = "llvm.x86.sse.max.ps" ]
82
215
fn maxps ( a : f32x4 , b : f32x4 ) -> f32x4 ;
83
216
#[ link_name = "llvm.x86.sse.movmsk.ps" ]
@@ -89,6 +222,87 @@ mod tests {
89
222
use v128:: * ;
90
223
use x86:: sse;
91
224
225
// Packed add: every lane of the result is the sum of the matching lanes.
#[test]
#[target_feature = "+sse"]
fn _mm_add_ps() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(-101.0, 25.0, 0.0, -15.0);
    assert_eq!(sse::_mm_add_ps(lhs, rhs), expected);
}
233
+
234
// Scalar add: only lane 0 is summed, lanes 1..3 come from the first operand.
#[test]
#[target_feature = "+sse"]
fn _mm_add_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(-101.0, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_add_ss(lhs, rhs), expected);
}
242
+
243
// Packed subtract: every lane of the result is `lhs - rhs` for that lane.
#[test]
#[target_feature = "+sse"]
fn _mm_sub_ps() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(99.0, -15.0, 0.0, -5.0);
    assert_eq!(sse::_mm_sub_ps(lhs, rhs), expected);
}
251
+
252
// Scalar subtract: only lane 0 is computed, lanes 1..3 come from the first
// operand.
#[test]
#[target_feature = "+sse"]
fn _mm_sub_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(99.0, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_sub_ss(lhs, rhs), expected);
}
260
+
261
// Packed multiply: every lane of the result is the product of the matching
// lanes.
#[test]
#[target_feature = "+sse"]
fn _mm_mul_ps() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(100.0, 100.0, 0.0, 50.0);
    assert_eq!(sse::_mm_mul_ps(lhs, rhs), expected);
}
269
+
270
// Scalar multiply: only lane 0 is computed, lanes 1..3 come from the first
// operand.
#[test]
#[target_feature = "+sse"]
fn _mm_mul_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(100.0, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_mul_ss(lhs, rhs), expected);
}
278
+
279
// Packed divide: every lane of the result is `lhs / rhs` for that lane.
#[test]
#[target_feature = "+sse"]
fn _mm_div_ps() {
    let lhs = f32x4::new(-1.0, 5.0, 2.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.2, -5.0);
    let expected = f32x4::new(0.01, 0.25, 10.0, 2.0);
    assert_eq!(sse::_mm_div_ps(lhs, rhs), expected);
}
287
+
288
// Scalar divide: lane 0 is `lhs[0] / rhs[0]`, lanes 1..3 come from the first
// operand (so the zero in lane 2 of the divisor is never used).
#[test]
#[target_feature = "+sse"]
fn _mm_div_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(0.01, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_div_ss(lhs, rhs), expected);
}
296
+
297
// Scalar square root: only lane 0 is replaced by its root, lanes 1..3 pass
// through untouched.
#[test]
#[target_feature = "+sse"]
fn _mm_sqrt_ss() {
    let input = f32x4::new(4.0, 13.0, 16.0, 100.0);
    let actual = sse::_mm_sqrt_ss(input);
    let expected = f32x4::new(2.0, 13.0, 16.0, 100.0);
    assert_eq!(actual, expected);
}
305
+
92
306
#[ test]
93
307
#[ target_feature = "+sse" ]
94
308
fn _mm_sqrt_ps ( ) {
@@ -98,6 +312,15 @@ mod tests {
98
312
assert_eq ! ( r, e) ;
99
313
}
100
314
315
// Scalar approximate reciprocal: only lane 0 is replaced, lanes 1..3 pass
// through untouched.
//
// NOTE(review): `rcpss` is an approximation, so the exact bit pattern
// expected for lane 0 may differ between CPU implementations — confirm that
// exact equality is acceptable here.
#[test]
#[target_feature = "+sse"]
fn _mm_rcp_ss() {
    let input = f32x4::new(4.0, 13.0, 16.0, 100.0);
    let actual = sse::_mm_rcp_ss(input);
    let expected = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
    assert_eq!(actual, expected);
}
323
+
101
324
#[ test]
102
325
#[ target_feature = "+sse" ]
103
326
fn _mm_rcp_ps ( ) {
@@ -107,6 +330,15 @@ mod tests {
107
330
assert_eq ! ( r, e) ;
108
331
}
109
332
333
// Scalar approximate reciprocal square root: only lane 0 is replaced,
// lanes 1..3 pass through untouched.
//
// NOTE(review): `rsqrtss` is an approximation, so the exact bit pattern
// expected for lane 0 may differ between CPU implementations — confirm that
// exact equality is acceptable here.
#[test]
#[target_feature = "+sse"]
fn _mm_rsqrt_ss() {
    let input = f32x4::new(4.0, 13.0, 16.0, 100.0);
    let actual = sse::_mm_rsqrt_ss(input);
    let expected = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
    assert_eq!(actual, expected);
}
341
+
110
342
#[ test]
111
343
#[ target_feature = "+sse" ]
112
344
fn _mm_rsqrt_ps ( ) {
@@ -116,6 +348,15 @@ mod tests {
116
348
assert_eq ! ( r, e) ;
117
349
}
118
350
351
// Scalar minimum: lane 0 is the smaller of the two first lanes, lanes 1..3
// come from the first operand.
#[test]
#[target_feature = "+sse"]
fn _mm_min_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(-100.0, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_min_ss(lhs, rhs), expected);
}
359
+
119
360
#[ test]
120
361
#[ target_feature = "+sse" ]
121
362
fn _mm_min_ps ( ) {
@@ -125,6 +366,15 @@ mod tests {
125
366
assert_eq ! ( r, f32x4:: new( -100.0 , 5.0 , 0.0 , -10.0 ) ) ;
126
367
}
127
368
369
// Scalar maximum: lane 0 is the larger of the two first lanes, lanes 1..3
// come from the first operand.
#[test]
#[target_feature = "+sse"]
fn _mm_max_ss() {
    let lhs = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    let rhs = f32x4::new(-100.0, 20.0, 0.0, -5.0);
    let expected = f32x4::new(-1.0, 5.0, 0.0, -10.0);
    assert_eq!(sse::_mm_max_ss(lhs, rhs), expected);
}
377
+
128
378
#[ test]
129
379
#[ target_feature = "+sse" ]
130
380
fn _mm_max_ps ( ) {
0 commit comments