From 4901a4395ce238ec21d8bf65e481bd9ee927e712 Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Tue, 2 Jan 2024 21:11:22 +0100 Subject: [PATCH] Fix lane offsets for AVX2 pack instructions `fast_image_resize` yielded broken images, a little bit of println bisecting revealed the SIMD instruction that was at fault. A bit of staring at the cg_clif impl and the Intel manual then revealed the place of the bug. There is a lot of copy pasting here, so I'm not surprised it's buggy ^^'. --- src/intrinsics/llvm_x86.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intrinsics/llvm_x86.rs b/src/intrinsics/llvm_x86.rs index 81114cbf4..99fa11f71 100644 --- a/src/intrinsics/llvm_x86.rs +++ b/src/intrinsics/llvm_x86.rs @@ -682,7 +682,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } for idx in 0..lane_count / 2 { - let lane = a.value_lane(fx, idx).load_scalar(fx); + let lane = a.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, zero); let sat = fx.bcx.ins().umin(sat, max_u8); let res = fx.bcx.ins().ireduce(types::I8, sat); @@ -692,7 +692,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } for idx in 0..lane_count / 2 { - let lane = b.value_lane(fx, idx).load_scalar(fx); + let lane = b.value_lane(fx, lane_count / 2 + idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, zero); let sat = fx.bcx.ins().umin(sat, max_u8); let res = fx.bcx.ins().ireduce(types::I8, sat); @@ -816,7 +816,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } for idx in 0..lane_count / 2 { - let lane = a.value_lane(fx, idx).load_scalar(fx); + let lane = a.value_lane(fx, (lane_count / 2) + idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat); @@ -826,7 +826,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>( } for idx in 0..lane_count / 2 { - let lane = b.value_lane(fx, idx).load_scalar(fx); + let lane = b.value_lane(fx, (lane_count / 2) + idx).load_scalar(fx); let sat = fx.bcx.ins().smax(lane, min_i16); let sat = fx.bcx.ins().smin(sat, max_i16); let res = fx.bcx.ins().ireduce(types::I16, sat);