Skip to content

Commit b8af805

Browse files
committed
Add f16/f128 intrinsic support
1 parent 9e83090 commit b8af805

File tree

2 files changed

+141
-8
lines changed

2 files changed

+141
-8
lines changed

src/codegen_f16_f128.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,43 @@ pub(crate) fn neg_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value {
117117
fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits)
118118
}
119119

120+
/// Emits code computing the absolute value of an `f16` by clearing its sign bit.
///
/// The value is reinterpreted as an `I16`, AND-ed with a mask that keeps every
/// bit except the most significant one, and then reinterpreted back as `F16`.
pub(crate) fn abs_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value {
    // All bits of a 16-bit word except the top (sign) bit.
    const MAGNITUDE_MASK: i64 = 0x7fff;

    let raw = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), value);
    let magnitude = fx.bcx.ins().band_imm(raw, MAGNITUDE_MASK);
    fx.bcx.ins().bitcast(types::F16, MemFlags::new(), magnitude)
}
125+
126+
/// Emits code computing the absolute value of an `f128` by clearing its sign bit.
///
/// The 128-bit mask is assembled from two `I64` constants with `iconcat`
/// (low half first, high half second): the low half has every bit set and the
/// high half has every bit set except its most significant one.
pub(crate) fn abs_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value {
    let raw = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), value);

    // `-1` is the all-ones 64-bit pattern; `i64::MAX` is all ones below the top bit.
    let mask_lo = fx.bcx.ins().iconst(types::I64, -1_i64);
    let mask_hi = fx.bcx.ins().iconst(types::I64, i64::MAX);
    let magnitude_mask = fx.bcx.ins().iconcat(mask_lo, mask_hi);

    let magnitude = fx.bcx.ins().band(raw, magnitude_mask);
    fx.bcx.ins().bitcast(types::F128, MemFlags::new(), magnitude)
}
134+
135+
/// Emits code for `copysign` on `f16`: the result has the non-sign bits of
/// `lhs` and the sign bit of `rhs`.
///
/// Both operands are reinterpreted as `I16`; `lhs` is masked down to its low
/// 15 bits, `rhs` down to its top bit, and the two are OR-ed together before
/// the result is reinterpreted back as `F16`.
pub(crate) fn copysign_f16(fx: &mut FunctionCx<'_, '_, '_>, lhs: Value, rhs: Value) -> Value {
    // Low 15 bits of the 16-bit word, and the remaining top (sign) bit.
    const MAGNITUDE_MASK: i64 = 0x7fff;
    const SIGN_MASK: i64 = 0x8000;

    let lhs_bits = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), lhs);
    let rhs_bits = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), rhs);

    let magnitude = fx.bcx.ins().band_imm(lhs_bits, MAGNITUDE_MASK);
    let sign = fx.bcx.ins().band_imm(rhs_bits, SIGN_MASK);

    let combined = fx.bcx.ins().bor(magnitude, sign);
    fx.bcx.ins().bitcast(types::F16, MemFlags::new(), combined)
}
143+
144+
/// Emits code for `copysign` on `f128`: the result has the non-sign bits of
/// `lhs` and the sign bit of `rhs`.
///
/// A 128-bit mask covering everything but the top bit is assembled from two
/// `I64` halves with `iconcat` (low half first); its bitwise complement is
/// used to select the sign bit.
pub(crate) fn copysign_f128(fx: &mut FunctionCx<'_, '_, '_>, lhs: Value, rhs: Value) -> Value {
    let lhs_bits = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), lhs);
    let rhs_bits = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), rhs);

    // `-1` is the all-ones 64-bit pattern; `i64::MAX` is all ones below the top bit.
    let mask_lo = fx.bcx.ins().iconst(types::I64, -1_i64);
    let mask_hi = fx.bcx.ins().iconst(types::I64, i64::MAX);
    let magnitude_mask = fx.bcx.ins().iconcat(mask_lo, mask_hi);
    // Complement of the magnitude mask: only the top bit remains set.
    let sign_mask = fx.bcx.ins().bnot(magnitude_mask);

    let magnitude = fx.bcx.ins().band(lhs_bits, magnitude_mask);
    let sign = fx.bcx.ins().band(rhs_bits, sign_mask);

    let combined = fx.bcx.ins().bor(magnitude, sign);
    fx.bcx.ins().bitcast(types::F128, MemFlags::new(), combined)
}
156+
120157
pub(crate) fn codegen_cast(
121158
fx: &mut FunctionCx<'_, '_, '_>,
122159
from: Value,
@@ -220,6 +257,14 @@ pub(crate) fn codegen_cast(
220257
}
221258
}
222259

260+
pub(crate) fn fma_f16(fx: &mut FunctionCx<'_, '_, '_>, x: Value, y: Value, z: Value) -> Value {
261+
let x = f16_to_f64(fx, x);
262+
let y = f16_to_f64(fx, y);
263+
let z = f16_to_f64(fx, z);
264+
let res = fx.bcx.ins().fma(x, y, z);
265+
f64_to_f16(fx, res)
266+
}
267+
223268
pub(crate) fn fmin_f128(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value {
224269
fx.lib_call(
225270
"fminimumf128",

src/intrinsics/mod.rs

Lines changed: 96 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -249,8 +249,10 @@ fn bool_to_zero_or_max_uint<'tcx>(
249249
let ty = fx.clif_type(ty).unwrap();
250250

251251
let int_ty = match ty {
252+
types::F16 => types::I16,
252253
types::F32 => types::I32,
253254
types::F64 => types::I64,
255+
types::F128 => types::I128,
254256
ty => ty,
255257
};
256258

@@ -309,45 +311,83 @@ fn codegen_float_intrinsic_call<'tcx>(
309311
ret: CPlace<'tcx>,
310312
) -> bool {
311313
let (name, arg_count, ty, clif_ty) = match intrinsic {
314+
sym::expf16 => ("expf16", 1, fx.tcx.types.f16, types::F16),
312315
sym::expf32 => ("expf", 1, fx.tcx.types.f32, types::F32),
313316
sym::expf64 => ("exp", 1, fx.tcx.types.f64, types::F64),
317+
sym::expf128 => ("expf128", 1, fx.tcx.types.f128, types::F128),
318+
sym::exp2f16 => ("exp2f16", 1, fx.tcx.types.f16, types::F16),
314319
sym::exp2f32 => ("exp2f", 1, fx.tcx.types.f32, types::F32),
315320
sym::exp2f64 => ("exp2", 1, fx.tcx.types.f64, types::F64),
321+
sym::exp2f128 => ("exp2f128", 1, fx.tcx.types.f128, types::F128),
322+
sym::sqrtf16 => ("sqrtf16", 1, fx.tcx.types.f16, types::F16),
316323
sym::sqrtf32 => ("sqrtf", 1, fx.tcx.types.f32, types::F32),
317324
sym::sqrtf64 => ("sqrt", 1, fx.tcx.types.f64, types::F64),
325+
sym::sqrtf128 => ("sqrtf128", 1, fx.tcx.types.f128, types::F128),
326+
sym::powif16 => ("__powisf2", 2, fx.tcx.types.f16, types::F16), // compiler-builtins
318327
sym::powif32 => ("__powisf2", 2, fx.tcx.types.f32, types::F32), // compiler-builtins
319328
sym::powif64 => ("__powidf2", 2, fx.tcx.types.f64, types::F64), // compiler-builtins
329+
sym::powif128 => ("__powitf2", 2, fx.tcx.types.f128, types::F128), // compiler-builtins
330+
sym::powf16 => ("powf16", 2, fx.tcx.types.f16, types::F16),
320331
sym::powf32 => ("powf", 2, fx.tcx.types.f32, types::F32),
321332
sym::powf64 => ("pow", 2, fx.tcx.types.f64, types::F64),
333+
sym::powf128 => ("powf128", 2, fx.tcx.types.f128, types::F128),
334+
sym::logf16 => ("logf16", 1, fx.tcx.types.f16, types::F16),
322335
sym::logf32 => ("logf", 1, fx.tcx.types.f32, types::F32),
323336
sym::logf64 => ("log", 1, fx.tcx.types.f64, types::F64),
337+
sym::logf128 => ("logf128", 1, fx.tcx.types.f128, types::F128),
338+
sym::log2f16 => ("log2f16", 1, fx.tcx.types.f16, types::F16),
324339
sym::log2f32 => ("log2f", 1, fx.tcx.types.f32, types::F32),
325340
sym::log2f64 => ("log2", 1, fx.tcx.types.f64, types::F64),
341+
sym::log2f128 => ("log2f128", 1, fx.tcx.types.f128, types::F128),
342+
sym::log10f16 => ("log10f16", 1, fx.tcx.types.f16, types::F16),
326343
sym::log10f32 => ("log10f", 1, fx.tcx.types.f32, types::F32),
327344
sym::log10f64 => ("log10", 1, fx.tcx.types.f64, types::F64),
345+
sym::log10f128 => ("log10f128", 1, fx.tcx.types.f128, types::F128),
346+
sym::fabsf16 => ("fabsf16", 1, fx.tcx.types.f16, types::F16),
328347
sym::fabsf32 => ("fabsf", 1, fx.tcx.types.f32, types::F32),
329348
sym::fabsf64 => ("fabs", 1, fx.tcx.types.f64, types::F64),
349+
sym::fabsf128 => ("fabsf128", 1, fx.tcx.types.f128, types::F128),
350+
sym::fmaf16 => ("fmaf16", 3, fx.tcx.types.f16, types::F16),
330351
sym::fmaf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32),
331352
sym::fmaf64 => ("fma", 3, fx.tcx.types.f64, types::F64),
353+
sym::fmaf128 => ("fmaf128", 3, fx.tcx.types.f128, types::F128),
332354
// FIXME: calling `fma` from libc without FMA target feature uses expensive software emulation
355+
sym::fmuladdf16 => ("fmaf16", 3, fx.tcx.types.f16, types::F16), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f16
333356
sym::fmuladdf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f32
334357
sym::fmuladdf64 => ("fma", 3, fx.tcx.types.f64, types::F64), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f64
358+
sym::fmuladdf128 => ("fmaf128", 3, fx.tcx.types.f128, types::F128), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f128
359+
sym::copysignf16 => ("copysignf16", 2, fx.tcx.types.f16, types::F16),
335360
sym::copysignf32 => ("copysignf", 2, fx.tcx.types.f32, types::F32),
336361
sym::copysignf64 => ("copysign", 2, fx.tcx.types.f64, types::F64),
362+
sym::copysignf128 => ("copysignf128", 2, fx.tcx.types.f128, types::F128),
363+
sym::floorf16 => ("floorf16", 1, fx.tcx.types.f16, types::F16),
337364
sym::floorf32 => ("floorf", 1, fx.tcx.types.f32, types::F32),
338365
sym::floorf64 => ("floor", 1, fx.tcx.types.f64, types::F64),
366+
sym::floorf128 => ("floorf128", 1, fx.tcx.types.f128, types::F128),
367+
sym::ceilf16 => ("ceilf16", 1, fx.tcx.types.f16, types::F16),
339368
sym::ceilf32 => ("ceilf", 1, fx.tcx.types.f32, types::F32),
340369
sym::ceilf64 => ("ceil", 1, fx.tcx.types.f64, types::F64),
370+
sym::ceilf128 => ("ceilf128", 1, fx.tcx.types.f128, types::F128),
371+
sym::truncf16 => ("truncf16", 1, fx.tcx.types.f16, types::F16),
341372
sym::truncf32 => ("truncf", 1, fx.tcx.types.f32, types::F32),
342373
sym::truncf64 => ("trunc", 1, fx.tcx.types.f64, types::F64),
374+
sym::truncf128 => ("truncf128", 1, fx.tcx.types.f128, types::F128),
375+
sym::round_ties_even_f16 => ("rintf16", 1, fx.tcx.types.f16, types::F16),
343376
sym::round_ties_even_f32 => ("rintf", 1, fx.tcx.types.f32, types::F32),
344377
sym::round_ties_even_f64 => ("rint", 1, fx.tcx.types.f64, types::F64),
378+
sym::round_ties_even_f128 => ("rintf128", 1, fx.tcx.types.f128, types::F128),
379+
sym::roundf16 => ("roundf16", 1, fx.tcx.types.f16, types::F16),
345380
sym::roundf32 => ("roundf", 1, fx.tcx.types.f32, types::F32),
346381
sym::roundf64 => ("round", 1, fx.tcx.types.f64, types::F64),
382+
sym::roundf128 => ("roundf128", 1, fx.tcx.types.f128, types::F128),
383+
sym::sinf16 => ("sinf16", 1, fx.tcx.types.f16, types::F16),
347384
sym::sinf32 => ("sinf", 1, fx.tcx.types.f32, types::F32),
348385
sym::sinf64 => ("sin", 1, fx.tcx.types.f64, types::F64),
386+
sym::sinf128 => ("sinf128", 1, fx.tcx.types.f128, types::F128),
387+
sym::cosf16 => ("cosf16", 1, fx.tcx.types.f16, types::F16),
349388
sym::cosf32 => ("cosf", 1, fx.tcx.types.f32, types::F32),
350389
sym::cosf64 => ("cos", 1, fx.tcx.types.f64, types::F64),
390+
sym::cosf128 => ("cosf128", 1, fx.tcx.types.f128, types::F128),
351391
_ => return false,
352392
};
353393

@@ -380,13 +420,26 @@ fn codegen_float_intrinsic_call<'tcx>(
380420
};
381421

382422
let layout = fx.layout_of(ty);
423+
// FIXME(bytecodealliance/wasmtime#8312): Use native Cranelift operations
424+
// for `f16` and `f128` once the lowerings have been implemented in Cranelift.
383425
let res = match intrinsic {
426+
sym::fmaf16 | sym::fmuladdf16 => {
427+
CValue::by_val(codegen_f16_f128::fma_f16(fx, args[0], args[1], args[2]), layout)
428+
}
384429
sym::fmaf32 | sym::fmaf64 | sym::fmuladdf32 | sym::fmuladdf64 => {
385430
CValue::by_val(fx.bcx.ins().fma(args[0], args[1], args[2]), layout)
386431
}
432+
sym::copysignf16 => {
433+
CValue::by_val(codegen_f16_f128::copysign_f16(fx, args[0], args[1]), layout)
434+
}
435+
sym::copysignf128 => {
436+
CValue::by_val(codegen_f16_f128::copysign_f128(fx, args[0], args[1]), layout)
437+
}
387438
sym::copysignf32 | sym::copysignf64 => {
388439
CValue::by_val(fx.bcx.ins().fcopysign(args[0], args[1]), layout)
389440
}
441+
sym::fabsf16 => CValue::by_val(codegen_f16_f128::abs_f16(fx, args[0]), layout),
442+
sym::fabsf128 => CValue::by_val(codegen_f16_f128::abs_f128(fx, args[0]), layout),
390443
sym::fabsf32
391444
| sym::fabsf64
392445
| sym::floorf32
@@ -400,14 +453,25 @@ fn codegen_float_intrinsic_call<'tcx>(
400453
| sym::sqrtf32
401454
| sym::sqrtf64 => {
402455
let val = match intrinsic {
403-
sym::fabsf32 | sym::fabsf64 => fx.bcx.ins().fabs(args[0]),
404-
sym::floorf32 | sym::floorf64 => fx.bcx.ins().floor(args[0]),
405-
sym::ceilf32 | sym::ceilf64 => fx.bcx.ins().ceil(args[0]),
406-
sym::truncf32 | sym::truncf64 => fx.bcx.ins().trunc(args[0]),
407-
sym::round_ties_even_f32 | sym::round_ties_even_f64 => {
408-
fx.bcx.ins().nearest(args[0])
456+
sym::fabsf16 | sym::fabsf32 | sym::fabsf64 | sym::fabsf128 => {
457+
fx.bcx.ins().fabs(args[0])
458+
}
459+
sym::floorf16 | sym::floorf32 | sym::floorf64 | sym::floorf128 => {
460+
fx.bcx.ins().floor(args[0])
461+
}
462+
sym::ceilf16 | sym::ceilf32 | sym::ceilf64 | sym::ceilf128 => {
463+
fx.bcx.ins().ceil(args[0])
464+
}
465+
sym::truncf16 | sym::truncf32 | sym::truncf64 | sym::truncf128 => {
466+
fx.bcx.ins().trunc(args[0])
467+
}
468+
sym::round_ties_even_f16
469+
| sym::round_ties_even_f32
470+
| sym::round_ties_even_f64
471+
| sym::round_ties_even_f128 => fx.bcx.ins().nearest(args[0]),
472+
sym::sqrtf16 | sym::sqrtf32 | sym::sqrtf64 | sym::sqrtf128 => {
473+
fx.bcx.ins().sqrt(args[0])
409474
}
410-
sym::sqrtf32 | sym::sqrtf64 => fx.bcx.ins().sqrt(args[0]),
411475
_ => unreachable!(),
412476
};
413477

@@ -416,12 +480,36 @@ fn codegen_float_intrinsic_call<'tcx>(
416480

417481
// These intrinsics aren't supported natively by Cranelift.
418482
// Lower them to a libcall.
419-
sym::powif32 | sym::powif64 => {
483+
sym::powif16 | sym::powif32 | sym::powif64 | sym::powif128 => {
484+
let temp;
485+
let (clif_ty, args) = if intrinsic == sym::powif16 {
486+
temp = [codegen_f16_f128::f16_to_f32(fx, args[0]), args[1]];
487+
(types::F32, temp.as_slice())
488+
} else {
489+
(clif_ty, args)
490+
};
420491
let input_tys: Vec<_> =
421492
vec![AbiParam::new(clif_ty), lib_call_arg_param(fx.tcx, types::I32, true)];
422493
let ret_val = fx.lib_call(name, input_tys, vec![AbiParam::new(clif_ty)], &args)[0];
494+
let ret_val = if intrinsic == sym::powif16 {
495+
codegen_f16_f128::f32_to_f16(fx, ret_val)
496+
} else {
497+
ret_val
498+
};
423499
CValue::by_val(ret_val, fx.layout_of(ty))
424500
}
501+
sym::powf16 => {
502+
// FIXME(f16_f128): Rust `compiler-builtins` doesn't export `powf16` yet.
503+
let x = codegen_f16_f128::f16_to_f32(fx, args[0]);
504+
let y = codegen_f16_f128::f16_to_f32(fx, args[1]);
505+
let ret_val = fx.lib_call(
506+
"powf",
507+
vec![AbiParam::new(types::F32), AbiParam::new(types::F32)],
508+
vec![AbiParam::new(types::F32)],
509+
&[x, y],
510+
)[0];
511+
CValue::by_val(codegen_f16_f128::f32_to_f16(fx, ret_val), fx.layout_of(ty))
512+
}
425513
_ => {
426514
let input_tys: Vec<_> = args.iter().map(|_| AbiParam::new(clif_ty)).collect();
427515
let ret_val = fx.lib_call(name, input_tys, vec![AbiParam::new(clif_ty)], &args)[0];

0 commit comments

Comments
 (0)