Skip to content

Commit d5a7ae7

Browse files
committed
Implement the float part of the gather family vendor intrinsics
1 parent c8729e9 commit d5a7ae7

File tree

1 file changed

+87
-0
lines changed

1 file changed

+87
-0
lines changed

src/intrinsics/llvm_x86.rs

+87
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,93 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
7474
ret.write_cvalue(fx, val);
7575
}
7676

77+
"llvm.x86.avx2.gather.d.ps"
78+
| "llvm.x86.avx2.gather.d.pd"
79+
| "llvm.x86.avx2.gather.d.ps.256"
80+
| "llvm.x86.avx2.gather.d.pd.256"
81+
| "llvm.x86.avx2.gather.q.ps"
82+
| "llvm.x86.avx2.gather.q.pd"
83+
| "llvm.x86.avx2.gather.q.ps.256"
84+
| "llvm.x86.avx2.gather.q.pd.256" => {
85+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
86+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819
87+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821
88+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822
89+
// ...
90+
91+
intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);
92+
93+
let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);
94+
let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
95+
let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
96+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
97+
assert!(src_lane_ty.is_floating_point());
98+
assert!(index_lane_ty.is_integral());
99+
assert!(mask_lane_ty.is_floating_point());
100+
assert!(ret_lane_ty.is_floating_point());
101+
assert_eq!(src_lane_count, mask_lane_count);
102+
assert_eq!(src_lane_count, ret_lane_count);
103+
104+
let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();
105+
let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();
106+
let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();
107+
let ret_lane_layout = fx.layout_of(ret_lane_ty);
108+
109+
let ptr = ptr.load_scalar(fx);
110+
let scale = scale.load_scalar(fx);
111+
let scale = fx.bcx.ins().uextend(types::I64, scale);
112+
for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {
113+
let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);
114+
let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);
115+
let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);
116+
let mask_lane =
117+
fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);
118+
119+
let if_enabled = fx.bcx.create_block();
120+
let if_disabled = fx.bcx.create_block();
121+
let next = fx.bcx.create_block();
122+
let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
123+
124+
let mask_lane = match mask_lane_clif_ty {
125+
types::F32 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64),
126+
types::F64 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64),
127+
_ => unreachable!(),
128+
};
129+
fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
130+
fx.bcx.seal_block(if_enabled);
131+
fx.bcx.seal_block(if_disabled);
132+
133+
fx.bcx.switch_to_block(if_enabled);
134+
let index_lane = if index_lane_clif_ty != types::I64 {
135+
fx.bcx.ins().sextend(types::I64, index_lane)
136+
} else {
137+
index_lane
138+
};
139+
let offset = fx.bcx.ins().imul(index_lane, scale);
140+
let lane_ptr = fx.bcx.ins().iadd(ptr, offset);
141+
let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);
142+
fx.bcx.ins().jump(next, &[res]);
143+
144+
fx.bcx.switch_to_block(if_disabled);
145+
fx.bcx.ins().jump(next, &[src_lane]);
146+
147+
fx.bcx.seal_block(next);
148+
fx.bcx.switch_to_block(next);
149+
150+
fx.bcx.ins().nop();
151+
152+
ret.place_lane(fx, lane_idx)
153+
.write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));
154+
}
155+
156+
for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {
157+
let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);
158+
let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);
159+
ret.place_lane(fx, lane_idx)
160+
.write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));
161+
}
162+
}
163+
77164
"llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
78165
let (x, y, kind) = match args {
79166
[x, y, kind] => (x, y, kind),

0 commit comments

Comments
 (0)