Skip to content

Commit 8abb73c

Browse files
gwenn and alexcrichton
authored and committed
avx: _mm256_stream_si256, _mm256_stream_pd, _mm256_stream_ps (rust-lang#227)
1 parent 84b957c commit 8abb73c

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

coresimd/src/x86/i586/avx.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,6 +1537,37 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const i8x32) -> i8x32 {
15371537
vlddqu(mem_addr as *const i8)
15381538
}
15391539

1540+
/// Moves integer data from a 256-bit integer vector to a 32-byte
1541+
/// aligned memory location. To minimize caching, the data is flagged as
1542+
/// non-temporal (unlikely to be used again soon).
///
/// Although `mem_addr` is declared `*const`, it is the destination of the
/// store: it is transmuted to the pointer type `nontemporal_store` expects
/// and `a` is written through it.
1543+
#[inline(always)]
1544+
#[target_feature = "+avx"]
1545+
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME: assert `vmovntdq` here instead of `vmovntps`
1546+
pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) {
1547+
::core::intrinsics::nontemporal_store(mem::transmute(mem_addr), a);
1548+
}
1549+
1550+
/// Moves double-precision values from a 256-bit vector of [4 x double]
1551+
/// to a 32-byte aligned memory location. To minimize caching, the data is
1552+
/// flagged as non-temporal (unlikely to be used again soon).
///
/// Although `mem_addr` is declared `*const f64`, it is the destination of the
/// store: it is transmuted to the pointer type `nontemporal_store` expects
/// and the whole `f64x4` value `a` is written through it.
1553+
#[inline(always)]
1554+
#[target_feature = "+avx"]
1555+
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME: assert `vmovntpd` here instead of `vmovntps`
1556+
pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: f64x4) {
1557+
::core::intrinsics::nontemporal_store(mem::transmute(mem_addr), a);
1558+
}
1559+
1560+
/// Moves single-precision floating point values from a 256-bit vector
1561+
/// of [8 x float] to a 32-byte aligned memory location. To minimize
1562+
/// caching, the data is flagged as non-temporal (unlikely to be used again
1563+
/// soon).
///
/// Although `mem_addr` is declared `*const f32`, it is the destination of the
/// store: it is transmuted to the pointer type `nontemporal_store` expects
/// and the whole `f32x8` value `a` is written through it.
1564+
#[inline(always)]
1565+
#[target_feature = "+avx"]
1566+
#[cfg_attr(test, assert_instr(vmovntps))]
1567+
pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: f32x8) {
1568+
::core::intrinsics::nontemporal_store(mem::transmute(mem_addr), a);
1569+
}
1570+
15401571
/// Compute the approximate reciprocal of packed single-precision (32-bit)
15411572
/// floating-point elements in `a`, and return the results. The maximum
15421573
/// relative error for this approximation is less than 1.5*2^-12.
@@ -3532,6 +3563,44 @@ mod tests {
35323563
assert_eq!(r, e);
35333564
}
35343565

3566+
// Streams `a` into `r` (which starts out as an undefined vector) and checks
// that `r` compares equal to `a` afterwards.
#[simd_test = "avx"]
3567+
unsafe fn _mm256_stream_si256() {
3568+
let a = __m256i::from(avx::_mm256_setr_epi64x(1, 2, 3, 4));
3569+
let mut r = avx::_mm256_undefined_si256();
3570+
avx::_mm256_stream_si256(&mut r as *mut _, a);
3571+
assert_eq!(r, a);
3572+
}
3573+
3574+
// Streams a splat of 7.0 over a buffer pre-filled with -1.0 and checks that
// every element of the buffer now matches the corresponding lane of `a`.
#[simd_test = "avx"]
3575+
unsafe fn _mm256_stream_pd() {
3576+
// Wrapper that requests 32-byte alignment for the destination buffer.
#[repr(align(32))]
3577+
struct Memory {
3578+
pub data: [f64; 4],
3579+
}
3580+
let a = f64x4::splat(7.0);
3581+
let mut mem = Memory { data: [-1.0; 4] };
3582+
3583+
avx::_mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
3584+
for i in 0..4 {
3585+
assert_eq!(mem.data[i], a.extract(i as u32));
3586+
}
3587+
}
3588+
3589+
// Streams a splat of 7.0 over a buffer pre-filled with -1.0 and checks that
// every element of the buffer now matches the corresponding lane of `a`.
#[simd_test = "avx"]
3590+
unsafe fn _mm256_stream_ps() {
3591+
// Wrapper that requests 32-byte alignment for the destination buffer.
#[repr(align(32))]
3592+
struct Memory {
3593+
pub data: [f32; 8],
3594+
}
3595+
let a = f32x8::splat(7.0);
3596+
let mut mem = Memory { data: [-1.0; 8] };
3597+
3598+
avx::_mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
3599+
for i in 0..8 {
3600+
assert_eq!(mem.data[i], a.extract(i as u32));
3601+
}
3602+
}
3603+
35353604
#[simd_test = "avx"]
35363605
unsafe fn _mm256_rcp_ps() {
35373606
let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);

0 commit comments

Comments
 (0)