@@ -74,6 +74,93 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, val);
         }
 
+ "llvm.x86.avx2.gather.d.ps"
78
+ | "llvm.x86.avx2.gather.d.pd"
79
+ | "llvm.x86.avx2.gather.d.ps.256"
80
+ | "llvm.x86.avx2.gather.d.pd.256"
81
+ | "llvm.x86.avx2.gather.q.ps"
82
+ | "llvm.x86.avx2.gather.q.pd"
83
+ | "llvm.x86.avx2.gather.q.ps.256"
84
+ | "llvm.x86.avx2.gather.q.pd.256" => {
85
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
86
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819
87
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821
88
+ // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822
89
+ // ...
90
+
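+            // For each result lane that has a corresponding index, gather from
+            // ptr + sign_extend(index[lane]) * scale when the sign bit of
+            // mask[lane] is set and keep src[lane] otherwise; result lanes
+            // without a corresponding index are zeroed.
+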
+            intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);
+
+            let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);
+            let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
+            let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(src_lane_ty.is_floating_point());
+            assert!(index_lane_ty.is_integral());
+            assert!(mask_lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(src_lane_count, mask_lane_count);
+            assert_eq!(src_lane_count, ret_lane_count);
+
+            let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();
+            let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();
+            let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            let ptr = ptr.load_scalar(fx);
+            let scale = scale.load_scalar(fx);
+            let scale = fx.bcx.ins().uextend(types::I64, scale);
+            for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {
+                let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);
+                let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane =
+                    fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);
+
+                let if_enabled = fx.bcx.create_block();
+                let if_disabled = fx.bcx.create_block();
+                let next = fx.bcx.create_block();
+                let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
+
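+                // Only the sign (most significant) bit of each mask lane matters:
+                // it selects between the gathered value and the src lane.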
+                let mask_lane = match mask_lane_clif_ty {
+                    types::F32 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64),
+                    types::F64 => fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64),
+                    _ => unreachable!(),
+                };
+                fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
+                fx.bcx.seal_block(if_enabled);
+                fx.bcx.seal_block(if_disabled);
+
+                fx.bcx.switch_to_block(if_enabled);
+                let index_lane = if index_lane_clif_ty != types::I64 {
+                    fx.bcx.ins().sextend(types::I64, index_lane)
+                } else {
+                    index_lane
+                };
+                let offset = fx.bcx.ins().imul(index_lane, scale);
+                let lane_ptr = fx.bcx.ins().iadd(ptr, offset);
+                let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);
+                fx.bcx.ins().jump(next, &[res]);
+
+                fx.bcx.switch_to_block(if_disabled);
+                fx.bcx.ins().jump(next, &[src_lane]);
+
+                fx.bcx.seal_block(next);
+                fx.bcx.switch_to_block(next);
+
+                fx.bcx.ins().nop();
+
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));
+            }
+
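+            // If the result has more lanes than there are indices (the 128-bit
+            // gathers with 64-bit indices and f32 elements), the remaining result
+            // lanes are zeroed rather than taken from src.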
+            for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {
+                let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);
+                let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));
+            }
+        }
+
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),
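For reference (not part of this diff), a minimal, hypothetical sketch of the user-level call that reaches the "llvm.x86.avx2.gather.d.ps.256" arm above, assuming the current const-generic gather signatures in std::arch: the per-lane sign bit of mask selects between the gathered element and src, and SCALE = 4 turns the i32 element indices into byte offsets.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn masked_gather_demo(data: &[f32]) -> [f32; 8] {
    use std::arch::x86_64::*;
    assert!(data.len() >= 8);
    // src is returned for lanes whose mask sign bit is clear.
    let src = _mm256_set1_ps(f32::NAN);
    // Element indices into `data`; the intrinsic scales them by SCALE bytes.
    let offsets = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    // Negative floats have the sign bit set, so even lanes gather, odd lanes keep src.
    let mask = _mm256_setr_ps(-1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0);
    let gathered = _mm256_mask_i32gather_ps::<4>(src, data.as_ptr(), offsets, mask);
    let mut out = [0.0f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), gathered);
    out
}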