Skip to content

Commit 4c5378a

Browse files
authored
Merge pull request rust-lang#38 from kocsis1david/master
Some SSE instructions
2 parents ec103b7 + b23775b commit 4c5378a

File tree

2 files changed

+265
-15
lines changed

2 files changed

+265
-15
lines changed

TODO.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -59,23 +59,23 @@ sse
5959
* [ ] `_m_pmovmskb`
6060
* [ ] `_mm_shuffle_pi16`
6161
* [ ] `_m_pshufw`
62-
* [ ] `_mm_add_ss`
63-
* [ ] `_mm_add_ps`
64-
* [ ] `_mm_sub_ss`
65-
* [ ] `_mm_sub_ps`
66-
* [ ] `_mm_mul_ss`
67-
* [ ] `_mm_mul_ps`
68-
* [ ] `_mm_div_ss`
69-
* [ ] `_mm_div_ps`
70-
* [ ] `_mm_sqrt_ss`
62+
* [x] `_mm_add_ss`
63+
* [x] `_mm_add_ps`
64+
* [x] `_mm_sub_ss`
65+
* [x] `_mm_sub_ps`
66+
* [x] `_mm_mul_ss`
67+
* [x] `_mm_mul_ps`
68+
* [x] `_mm_div_ss`
69+
* [x] `_mm_div_ps`
70+
* [x] `_mm_sqrt_ss`
7171
* [x] `_mm_sqrt_ps`
72-
* [ ] `_mm_rcp_ss`
72+
* [x] `_mm_rcp_ss`
7373
* [x] `_mm_rcp_ps`
74-
* [ ] `_mm_rsqrt_ss`
74+
* [x] `_mm_rsqrt_ss`
7575
* [x] `_mm_rsqrt_ps`
76-
* [ ] `_mm_min_ss`
76+
* [x] `_mm_min_ss`
7777
* [x] `_mm_min_ps`
78-
* [ ] `_mm_max_ss`
78+
* [x] `_mm_max_ss`
7979
* [x] `_mm_max_ps`
8080
* [ ] `_mm_and_ps`
8181
* [ ] `_mm_andnot_ps`
@@ -458,8 +458,8 @@ sse4.1
458458
* [ ] `_mm_blendv_ps`
459459
* [x] `_mm_blendv_epi8`
460460
* [ ] `_mm_blend_epi16`
461-
* [ ] `_mm_dp_pd`
462-
* [ ] `_mm_dp_ps`
461+
* [x] `_mm_dp_pd`
462+
* [x] `_mm_dp_ps`
463463
* [ ] `_mm_extract_ps`
464464
* [ ] `_mm_extract_epi8`
465465
* [ ] `_mm_extract_epi32`

src/x86/sse.rs

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,83 @@ use v128::*;
44
#[cfg(test)]
55
use assert_instr::assert_instr;
66

7+
/// Adds the first component of `a` and `b`, the other components are copied
8+
/// from `a`.
9+
#[inline(always)]
10+
#[target_feature = "+sse"]
11+
#[cfg_attr(test, assert_instr(addss))]
12+
pub fn _mm_add_ss(a: f32x4, b: f32x4) -> f32x4 {
13+
unsafe { addss(a, b) }
14+
}
15+
16+
/// Adds f32x4 vectors.
17+
#[inline(always)]
18+
#[target_feature = "+sse"]
19+
#[cfg_attr(test, assert_instr(addps))]
20+
pub fn _mm_add_ps(a: f32x4, b: f32x4) -> f32x4 {
21+
a + b
22+
}
23+
24+
/// Subtracts the first component of `b` from `a`, the other components are
25+
/// copied from `a`.
26+
#[inline(always)]
27+
#[target_feature = "+sse"]
28+
#[cfg_attr(test, assert_instr(subss))]
29+
pub fn _mm_sub_ss(a: f32x4, b: f32x4) -> f32x4 {
30+
unsafe { subss(a, b) }
31+
}
32+
33+
/// Subtracts f32x4 vectors.
34+
#[inline(always)]
35+
#[target_feature = "+sse"]
36+
#[cfg_attr(test, assert_instr(subps))]
37+
pub fn _mm_sub_ps(a: f32x4, b: f32x4) -> f32x4 {
38+
a - b
39+
}
40+
41+
/// Multiplies the first component of `a` and `b`, the other components are
42+
/// copied from `a`.
43+
#[inline(always)]
44+
#[target_feature = "+sse"]
45+
#[cfg_attr(test, assert_instr(mulss))]
46+
pub fn _mm_mul_ss(a: f32x4, b: f32x4) -> f32x4 {
47+
unsafe { mulss(a, b) }
48+
}
49+
50+
/// Multiplies f32x4 vectors.
51+
#[inline(always)]
52+
#[target_feature = "+sse"]
53+
#[cfg_attr(test, assert_instr(mulps))]
54+
pub fn _mm_mul_ps(a: f32x4, b: f32x4) -> f32x4 {
55+
a * b
56+
}
57+
58+
/// Divides the first component of `a` by `b`, the other components are
59+
/// copied from `a`.
60+
#[inline(always)]
61+
#[target_feature = "+sse"]
62+
#[cfg_attr(test, assert_instr(divss))]
63+
pub fn _mm_div_ss(a: f32x4, b: f32x4) -> f32x4 {
64+
unsafe { divss(a, b) }
65+
}
66+
67+
/// Divides f32x4 vectors.
68+
#[inline(always)]
69+
#[target_feature = "+sse"]
70+
#[cfg_attr(test, assert_instr(divps))]
71+
pub fn _mm_div_ps(a: f32x4, b: f32x4) -> f32x4 {
72+
a / b
73+
}
74+
75+
/// Return the square root of the first single-precision (32-bit)
76+
/// floating-point element in `a`, the other elements are unchanged.
77+
#[inline(always)]
78+
#[target_feature = "+sse"]
79+
#[cfg_attr(test, assert_instr(sqrtss))]
80+
pub fn _mm_sqrt_ss(a: f32x4) -> f32x4 {
81+
unsafe { sqrtss(a) }
82+
}
83+
784
/// Return the square root of packed single-precision (32-bit) floating-point
885
/// elements in `a`.
986
#[inline(always)]
@@ -13,6 +90,15 @@ pub fn _mm_sqrt_ps(a: f32x4) -> f32x4 {
1390
unsafe { sqrtps(a) }
1491
}
1592

93+
/// Return the approximate reciprocal of the first single-precision
94+
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
95+
#[inline(always)]
96+
#[target_feature = "+sse"]
97+
#[cfg_attr(test, assert_instr(rcpss))]
98+
pub fn _mm_rcp_ss(a: f32x4) -> f32x4 {
99+
unsafe { rcpss(a) }
100+
}
101+
16102
/// Return the approximate reciprocal of packed single-precision (32-bit)
17103
/// floating-point elements in `a`.
18104
#[inline(always)]
@@ -22,6 +108,15 @@ pub fn _mm_rcp_ps(a: f32x4) -> f32x4 {
22108
unsafe { rcpps(a) }
23109
}
24110

111+
/// Return the approximate reciprocal square root of the first single-precision
112+
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
113+
#[inline(always)]
114+
#[target_feature = "+sse"]
115+
#[cfg_attr(test, assert_instr(rsqrtss))]
116+
pub fn _mm_rsqrt_ss(a: f32x4) -> f32x4 {
117+
unsafe { rsqrtss(a) }
118+
}
119+
25120
/// Return the approximate reciprocal square root of packed single-precision
26121
/// (32-bit) floating-point elements in `a`.
27122
#[inline(always)]
@@ -31,6 +126,16 @@ pub fn _mm_rsqrt_ps(a: f32x4) -> f32x4 {
31126
unsafe { rsqrtps(a) }
32127
}
33128

129+
/// Compare the first single-precision (32-bit) floating-point element of `a`
130+
/// and `b`, and return the minimum value in the first element of the return
131+
/// value, the other elements are copied from `a`.
132+
#[inline(always)]
133+
#[target_feature = "+sse"]
134+
#[cfg_attr(test, assert_instr(minss))]
135+
pub fn _mm_min_ss(a: f32x4, b: f32x4) -> f32x4 {
136+
unsafe { minss(a, b) }
137+
}
138+
34139
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
35140
/// `b`, and return the corresponding minimum values.
36141
#[inline(always)]
@@ -40,6 +145,16 @@ pub fn _mm_min_ps(a: f32x4, b: f32x4) -> f32x4 {
40145
unsafe { minps(a, b) }
41146
}
42147

148+
/// Compare the first single-precision (32-bit) floating-point element of `a`
149+
/// and `b`, and return the maximum value in the first element of the return
150+
/// value, the other elements are copied from `a`.
151+
#[inline(always)]
152+
#[target_feature = "+sse"]
153+
#[cfg_attr(test, assert_instr(maxss))]
154+
pub fn _mm_max_ss(a: f32x4, b: f32x4) -> f32x4 {
155+
unsafe { maxss(a, b) }
156+
}
157+
43158
/// Compare packed single-precision (32-bit) floating-point elements in `a` and
44159
/// `b`, and return the corresponding maximum values.
45160
#[inline(always)]
@@ -70,14 +185,32 @@ pub fn _mm_movemask_ps(a: f32x4) -> i32 {
70185

71186
#[allow(improper_ctypes)]
72187
extern {
188+
#[link_name = "llvm.x86.sse.add.ss"]
189+
fn addss(a: f32x4, b: f32x4) -> f32x4;
190+
#[link_name = "llvm.x86.sse.sub.ss"]
191+
fn subss(a: f32x4, b: f32x4) -> f32x4;
192+
#[link_name = "llvm.x86.sse.mul.ss"]
193+
fn mulss(a: f32x4, b: f32x4) -> f32x4;
194+
#[link_name = "llvm.x86.sse.div.ss"]
195+
fn divss(a: f32x4, b: f32x4) -> f32x4;
196+
#[link_name = "llvm.x86.sse.sqrt.ss"]
197+
fn sqrtss(a: f32x4) -> f32x4;
73198
#[link_name = "llvm.x86.sse.sqrt.ps"]
74199
fn sqrtps(a: f32x4) -> f32x4;
200+
#[link_name = "llvm.x86.sse.rcp.ss"]
201+
fn rcpss(a: f32x4) -> f32x4;
75202
#[link_name = "llvm.x86.sse.rcp.ps"]
76203
fn rcpps(a: f32x4) -> f32x4;
204+
#[link_name = "llvm.x86.sse.rsqrt.ss"]
205+
fn rsqrtss(a: f32x4) -> f32x4;
77206
#[link_name = "llvm.x86.sse.rsqrt.ps"]
78207
fn rsqrtps(a: f32x4) -> f32x4;
208+
#[link_name = "llvm.x86.sse.min.ss"]
209+
fn minss(a: f32x4, b: f32x4) -> f32x4;
79210
#[link_name = "llvm.x86.sse.min.ps"]
80211
fn minps(a: f32x4, b: f32x4) -> f32x4;
212+
#[link_name = "llvm.x86.sse.max.ss"]
213+
fn maxss(a: f32x4, b: f32x4) -> f32x4;
81214
#[link_name = "llvm.x86.sse.max.ps"]
82215
fn maxps(a: f32x4, b: f32x4) -> f32x4;
83216
#[link_name = "llvm.x86.sse.movmsk.ps"]
@@ -89,6 +222,87 @@ mod tests {
89222
use v128::*;
90223
use x86::sse;
91224

225+
#[test]
226+
#[target_feature = "+sse"]
227+
fn _mm_add_ps() {
228+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
229+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
230+
let r = sse::_mm_add_ps(a, b);
231+
assert_eq!(r, f32x4::new(-101.0, 25.0, 0.0, -15.0));
232+
}
233+
234+
#[test]
235+
#[target_feature = "+sse"]
236+
fn _mm_add_ss() {
237+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
238+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
239+
let r = sse::_mm_add_ss(a, b);
240+
assert_eq!(r, f32x4::new(-101.0, 5.0, 0.0, -10.0));
241+
}
242+
243+
#[test]
244+
#[target_feature = "+sse"]
245+
fn _mm_sub_ps() {
246+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
247+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
248+
let r = sse::_mm_sub_ps(a, b);
249+
assert_eq!(r, f32x4::new(99.0, -15.0, 0.0, -5.0));
250+
}
251+
252+
#[test]
253+
#[target_feature = "+sse"]
254+
fn _mm_sub_ss() {
255+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
256+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
257+
let r = sse::_mm_sub_ss(a, b);
258+
assert_eq!(r, f32x4::new(99.0, 5.0, 0.0, -10.0));
259+
}
260+
261+
#[test]
262+
#[target_feature = "+sse"]
263+
fn _mm_mul_ps() {
264+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
265+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
266+
let r = sse::_mm_mul_ps(a, b);
267+
assert_eq!(r, f32x4::new(100.0, 100.0, 0.0, 50.0));
268+
}
269+
270+
#[test]
271+
#[target_feature = "+sse"]
272+
fn _mm_mul_ss() {
273+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
274+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
275+
let r = sse::_mm_mul_ss(a, b);
276+
assert_eq!(r, f32x4::new(100.0, 5.0, 0.0, -10.0));
277+
}
278+
279+
#[test]
280+
#[target_feature = "+sse"]
281+
fn _mm_div_ps() {
282+
let a = f32x4::new(-1.0, 5.0, 2.0, -10.0);
283+
let b = f32x4::new(-100.0, 20.0, 0.2, -5.0);
284+
let r = sse::_mm_div_ps(a, b);
285+
assert_eq!(r, f32x4::new(0.01, 0.25, 10.0, 2.0));
286+
}
287+
288+
#[test]
289+
#[target_feature = "+sse"]
290+
fn _mm_div_ss() {
291+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
292+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
293+
let r = sse::_mm_div_ss(a, b);
294+
assert_eq!(r, f32x4::new(0.01, 5.0, 0.0, -10.0));
295+
}
296+
297+
#[test]
298+
#[target_feature = "+sse"]
299+
fn _mm_sqrt_ss() {
300+
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
301+
let r = sse::_mm_sqrt_ss(a);
302+
let e = f32x4::new(2.0, 13.0, 16.0, 100.0);
303+
assert_eq!(r, e);
304+
}
305+
92306
#[test]
93307
#[target_feature = "+sse"]
94308
fn _mm_sqrt_ps() {
@@ -98,6 +312,15 @@ mod tests {
98312
assert_eq!(r, e);
99313
}
100314

315+
#[test]
316+
#[target_feature = "+sse"]
317+
fn _mm_rcp_ss() {
318+
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
319+
let r = sse::_mm_rcp_ss(a);
320+
let e = f32x4::new(0.24993896, 13.0, 16.0, 100.0);
321+
assert_eq!(r, e);
322+
}
323+
101324
#[test]
102325
#[target_feature = "+sse"]
103326
fn _mm_rcp_ps() {
@@ -107,6 +330,15 @@ mod tests {
107330
assert_eq!(r, e);
108331
}
109332

333+
#[test]
334+
#[target_feature = "+sse"]
335+
fn _mm_rsqrt_ss() {
336+
let a = f32x4::new(4.0, 13.0, 16.0, 100.0);
337+
let r = sse::_mm_rsqrt_ss(a);
338+
let e = f32x4::new(0.49987793, 13.0, 16.0, 100.0);
339+
assert_eq!(r, e);
340+
}
341+
110342
#[test]
111343
#[target_feature = "+sse"]
112344
fn _mm_rsqrt_ps() {
@@ -116,6 +348,15 @@ mod tests {
116348
assert_eq!(r, e);
117349
}
118350

351+
#[test]
352+
#[target_feature = "+sse"]
353+
fn _mm_min_ss() {
354+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
355+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
356+
let r = sse::_mm_min_ss(a, b);
357+
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
358+
}
359+
119360
#[test]
120361
#[target_feature = "+sse"]
121362
fn _mm_min_ps() {
@@ -125,6 +366,15 @@ mod tests {
125366
assert_eq!(r, f32x4::new(-100.0, 5.0, 0.0, -10.0));
126367
}
127368

369+
#[test]
370+
#[target_feature = "+sse"]
371+
fn _mm_max_ss() {
372+
let a = f32x4::new(-1.0, 5.0, 0.0, -10.0);
373+
let b = f32x4::new(-100.0, 20.0, 0.0, -5.0);
374+
let r = sse::_mm_max_ss(a, b);
375+
assert_eq!(r, f32x4::new(-1.0, 5.0, 0.0, -10.0));
376+
}
377+
128378
#[test]
129379
#[target_feature = "+sse"]
130380
fn _mm_max_ps() {

0 commit comments

Comments
 (0)