Skip to content

Commit b3b36e9

Browse files
authored
Merge pull request #1443 from Nilstrieb/x86-signed-pack
Restructure x86 signed pack instructions
2 parents 3b8794e + 22019db commit b3b36e9

File tree

1 file changed

+134
-196
lines changed

1 file changed

+134
-196
lines changed

src/intrinsics/llvm_x86.rs

+134-196
Original file line numberDiff line numberDiff line change
@@ -610,230 +610,56 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
610610
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
611611
intrinsic_args!(fx, args => (a, b); intrinsic);
612612

613-
assert_eq!(a.layout(), b.layout());
614-
let layout = a.layout();
615-
616-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
617-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
618-
assert_eq!(lane_ty, fx.tcx.types.i16);
619-
assert_eq!(ret_lane_ty, fx.tcx.types.u8);
620-
assert_eq!(lane_count * 2, ret_lane_count);
621-
622-
let zero = fx.bcx.ins().iconst(types::I16, 0);
623-
let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
624-
let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
625-
626-
for idx in 0..lane_count {
627-
let lane = a.value_lane(fx, idx).load_scalar(fx);
628-
let sat = fx.bcx.ins().smax(lane, zero);
629-
let sat = fx.bcx.ins().umin(sat, max_u8);
630-
let res = fx.bcx.ins().ireduce(types::I8, sat);
631-
632-
let res_lane = CValue::by_val(res, ret_lane_layout);
633-
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
634-
}
613+
pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Sse);
614+
}
635615

636-
for idx in 0..lane_count {
637-
let lane = b.value_lane(fx, idx).load_scalar(fx);
638-
let sat = fx.bcx.ins().smax(lane, zero);
639-
let sat = fx.bcx.ins().umin(sat, max_u8);
640-
let res = fx.bcx.ins().ireduce(types::I8, sat);
616+
"llvm.x86.sse2.packsswb.128" => {
617+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848
618+
intrinsic_args!(fx, args => (a, b); intrinsic);
641619

642-
let res_lane = CValue::by_val(res, ret_lane_layout);
643-
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
644-
}
620+
pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Sse);
645621
}
646622

647623
"llvm.x86.avx2.packuswb" => {
648624
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
649625
intrinsic_args!(fx, args => (a, b); intrinsic);
650626

651-
assert_eq!(a.layout(), b.layout());
652-
let layout = a.layout();
653-
654-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
655-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
656-
assert_eq!(lane_ty, fx.tcx.types.i16);
657-
assert_eq!(ret_lane_ty, fx.tcx.types.u8);
658-
assert_eq!(lane_count * 2, ret_lane_count);
659-
660-
let zero = fx.bcx.ins().iconst(types::I16, 0);
661-
let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
662-
let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
663-
664-
for idx in 0..lane_count / 2 {
665-
let lane = a.value_lane(fx, idx).load_scalar(fx);
666-
let sat = fx.bcx.ins().smax(lane, zero);
667-
let sat = fx.bcx.ins().umin(sat, max_u8);
668-
let res = fx.bcx.ins().ireduce(types::I8, sat);
669-
670-
let res_lane = CValue::by_val(res, ret_lane_layout);
671-
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
672-
}
673-
674-
for idx in 0..lane_count / 2 {
675-
let lane = b.value_lane(fx, idx).load_scalar(fx);
676-
let sat = fx.bcx.ins().smax(lane, zero);
677-
let sat = fx.bcx.ins().umin(sat, max_u8);
678-
let res = fx.bcx.ins().ireduce(types::I8, sat);
679-
680-
let res_lane = CValue::by_val(res, ret_lane_layout);
681-
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
682-
}
683-
684-
for idx in 0..lane_count / 2 {
685-
let lane = a.value_lane(fx, idx).load_scalar(fx);
686-
let sat = fx.bcx.ins().smax(lane, zero);
687-
let sat = fx.bcx.ins().umin(sat, max_u8);
688-
let res = fx.bcx.ins().ireduce(types::I8, sat);
689-
690-
let res_lane = CValue::by_val(res, ret_lane_layout);
691-
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
692-
}
693-
694-
for idx in 0..lane_count / 2 {
695-
let lane = b.value_lane(fx, idx).load_scalar(fx);
696-
let sat = fx.bcx.ins().smax(lane, zero);
697-
let sat = fx.bcx.ins().umin(sat, max_u8);
698-
let res = fx.bcx.ins().ireduce(types::I8, sat);
699-
700-
let res_lane = CValue::by_val(res, ret_lane_layout);
701-
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
702-
}
627+
pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Avx);
703628
}
704629

705-
"llvm.x86.sse2.packssdw.128" => {
706-
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
630+
"llvm.x86.avx2.packsswb" => {
631+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16&ig_expand=4851
707632
intrinsic_args!(fx, args => (a, b); intrinsic);
708633

709-
assert_eq!(a.layout(), b.layout());
710-
let layout = a.layout();
711-
712-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
713-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
714-
assert_eq!(lane_ty, fx.tcx.types.i32);
715-
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
716-
assert_eq!(lane_count * 2, ret_lane_count);
717-
718-
let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
719-
let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
720-
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
721-
722-
for idx in 0..lane_count {
723-
let lane = a.value_lane(fx, idx).load_scalar(fx);
724-
let sat = fx.bcx.ins().smax(lane, min_i16);
725-
let sat = fx.bcx.ins().smin(sat, max_i16);
726-
let res = fx.bcx.ins().ireduce(types::I16, sat);
727-
728-
let res_lane = CValue::by_val(res, ret_lane_layout);
729-
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
730-
}
731-
732-
for idx in 0..lane_count {
733-
let lane = b.value_lane(fx, idx).load_scalar(fx);
734-
let sat = fx.bcx.ins().smax(lane, min_i16);
735-
let sat = fx.bcx.ins().smin(sat, max_i16);
736-
let res = fx.bcx.ins().ireduce(types::I16, sat);
737-
738-
let res_lane = CValue::by_val(res, ret_lane_layout);
739-
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
740-
}
634+
pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Avx);
741635
}
742636

743637
"llvm.x86.sse41.packusdw" => {
744638
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
745639
intrinsic_args!(fx, args => (a, b); intrinsic);
746640

747-
assert_eq!(a.layout(), b.layout());
748-
let layout = a.layout();
749-
750-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
751-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
752-
assert_eq!(lane_ty, fx.tcx.types.i32);
753-
assert_eq!(ret_lane_ty, fx.tcx.types.u16);
754-
assert_eq!(lane_count * 2, ret_lane_count);
755-
756-
let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
757-
let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
758-
let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
641+
pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Sse);
642+
}
759643

760-
for idx in 0..lane_count {
761-
let lane = a.value_lane(fx, idx).load_scalar(fx);
762-
let sat = fx.bcx.ins().smax(lane, min_u16);
763-
let sat = fx.bcx.ins().smin(sat, max_u16);
764-
let res = fx.bcx.ins().ireduce(types::I16, sat);
644+
"llvm.x86.sse2.packssdw.128" => {
645+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
646+
intrinsic_args!(fx, args => (a, b); intrinsic);
765647

766-
let res_lane = CValue::by_val(res, ret_lane_layout);
767-
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
768-
}
648+
pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Sse);
649+
}
769650

770-
for idx in 0..lane_count {
771-
let lane = b.value_lane(fx, idx).load_scalar(fx);
772-
let sat = fx.bcx.ins().smax(lane, min_u16);
773-
let sat = fx.bcx.ins().smin(sat, max_u16);
774-
let res = fx.bcx.ins().ireduce(types::I16, sat);
651+
"llvm.x86.avx2.packusdw" => {
652+
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883
653+
intrinsic_args!(fx, args => (a, b); intrinsic);
775654

776-
let res_lane = CValue::by_val(res, ret_lane_layout);
777-
ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
778-
}
655+
pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Avx);
779656
}
780657

781658
"llvm.x86.avx2.packssdw" => {
782659
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
783660
intrinsic_args!(fx, args => (a, b); intrinsic);
784661

785-
assert_eq!(a.layout(), b.layout());
786-
let layout = a.layout();
787-
788-
let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
789-
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
790-
assert_eq!(lane_ty, fx.tcx.types.i32);
791-
assert_eq!(ret_lane_ty, fx.tcx.types.i16);
792-
assert_eq!(lane_count * 2, ret_lane_count);
793-
794-
let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
795-
let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
796-
let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
797-
798-
for idx in 0..lane_count / 2 {
799-
let lane = a.value_lane(fx, idx).load_scalar(fx);
800-
let sat = fx.bcx.ins().smax(lane, min_i16);
801-
let sat = fx.bcx.ins().smin(sat, max_i16);
802-
let res = fx.bcx.ins().ireduce(types::I16, sat);
803-
804-
let res_lane = CValue::by_val(res, ret_lane_layout);
805-
ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
806-
}
807-
808-
for idx in 0..lane_count / 2 {
809-
let lane = b.value_lane(fx, idx).load_scalar(fx);
810-
let sat = fx.bcx.ins().smax(lane, min_i16);
811-
let sat = fx.bcx.ins().smin(sat, max_i16);
812-
let res = fx.bcx.ins().ireduce(types::I16, sat);
813-
814-
let res_lane = CValue::by_val(res, ret_lane_layout);
815-
ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
816-
}
817-
818-
for idx in 0..lane_count / 2 {
819-
let lane = a.value_lane(fx, idx).load_scalar(fx);
820-
let sat = fx.bcx.ins().smax(lane, min_i16);
821-
let sat = fx.bcx.ins().smin(sat, max_i16);
822-
let res = fx.bcx.ins().ireduce(types::I16, sat);
823-
824-
let res_lane = CValue::by_val(res, ret_lane_layout);
825-
ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
826-
}
827-
828-
for idx in 0..lane_count / 2 {
829-
let lane = b.value_lane(fx, idx).load_scalar(fx);
830-
let sat = fx.bcx.ins().smax(lane, min_i16);
831-
let sat = fx.bcx.ins().smin(sat, max_i16);
832-
let res = fx.bcx.ins().ireduce(types::I16, sat);
833-
834-
let res_lane = CValue::by_val(res, ret_lane_layout);
835-
ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
836-
}
662+
pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Avx);
837663
}
838664

839665
"llvm.x86.fma.vfmaddsub.ps"
@@ -1407,3 +1233,115 @@ fn llvm_add_sub<'tcx>(
14071233

14081234
(cb_out, c)
14091235
}
1236+
1237+
enum PackSize {
1238+
U8,
1239+
U16,
1240+
S8,
1241+
S16,
1242+
}
1243+
1244+
impl PackSize {
1245+
fn ret_clif_type(&self) -> Type {
1246+
match self {
1247+
Self::U8 | Self::S8 => types::I8,
1248+
Self::U16 | Self::S16 => types::I16,
1249+
}
1250+
}
1251+
fn src_clif_type(&self) -> Type {
1252+
match self {
1253+
Self::U8 | Self::S8 => types::I16,
1254+
Self::U16 | Self::S16 => types::I32,
1255+
}
1256+
}
1257+
fn src_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
1258+
match self {
1259+
Self::U8 | Self::S8 => tcx.types.i16,
1260+
Self::U16 | Self::S16 => tcx.types.i32,
1261+
}
1262+
}
1263+
fn ret_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
1264+
match self {
1265+
Self::U8 => tcx.types.u8,
1266+
Self::S8 => tcx.types.i8,
1267+
Self::U16 => tcx.types.u16,
1268+
Self::S16 => tcx.types.i16,
1269+
}
1270+
}
1271+
fn max(&self) -> i64 {
1272+
match self {
1273+
Self::U8 => u8::MAX as u64 as i64,
1274+
Self::S8 => i8::MAX as u8 as u64 as i64,
1275+
Self::U16 => u16::MAX as u64 as i64,
1276+
Self::S16 => i16::MAX as u64 as u64 as i64,
1277+
}
1278+
}
1279+
fn min(&self) -> i64 {
1280+
match self {
1281+
Self::U8 | Self::U16 => 0,
1282+
Self::S8 => i16::from(i8::MIN) as u16 as i64,
1283+
Self::S16 => i32::from(i16::MIN) as u32 as i64,
1284+
}
1285+
}
1286+
}
1287+
1288+
/// Vector width of an x86 pack instruction: 128-bit SSE or 256-bit AVX.
//
// The explicit discriminants previously attached to the variants duplicated
// `divisor()` and were never read, so they have been dropped.
enum PackWidth {
    Sse,
    Avx,
}

impl PackWidth {
    /// Number of 128-bit halves each operand is split into.
    ///
    /// AVX pack instructions operate on each 128-bit half of their operands
    /// independently, so an AVX operand is processed in two rounds.
    fn divisor(&self) -> u64 {
        match self {
            Self::Sse => 1,
            Self::Avx => 2,
        }
    }
}
1300+
1301+
/// Implement an x86 pack instruction with the intrinsic `_mm{,256}pack{us,s}_epi{16,32}`.
1302+
/// Validated for correctness against LLVM, see commit `c8f5d35508e062bd2d95e6c03429bfec831db6d3`.
1303+
fn pack_instruction<'tcx>(
1304+
fx: &mut FunctionCx<'_, '_, 'tcx>,
1305+
a: CValue<'tcx>,
1306+
b: CValue<'tcx>,
1307+
ret: CPlace<'tcx>,
1308+
ret_size: PackSize,
1309+
width: PackWidth,
1310+
) {
1311+
assert_eq!(a.layout(), b.layout());
1312+
let layout = a.layout();
1313+
1314+
let (src_lane_count, src_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
1315+
let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
1316+
assert_eq!(src_lane_ty, ret_size.src_ty(fx.tcx));
1317+
assert_eq!(ret_lane_ty, ret_size.ret_ty(fx.tcx));
1318+
assert_eq!(src_lane_count * 2, ret_lane_count);
1319+
1320+
let min = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.min());
1321+
let max = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.max());
1322+
let ret_lane_layout = fx.layout_of(ret_size.ret_ty(fx.tcx));
1323+
1324+
let mut round = |source: CValue<'tcx>, source_offset: u64, dest_offset: u64| {
1325+
let step_amount = src_lane_count / width.divisor();
1326+
let dest_offset = step_amount * dest_offset;
1327+
for idx in 0..step_amount {
1328+
let lane = source.value_lane(fx, step_amount * source_offset + idx).load_scalar(fx);
1329+
let sat = fx.bcx.ins().smax(lane, min);
1330+
let sat = match ret_size {
1331+
PackSize::U8 | PackSize::U16 => fx.bcx.ins().umin(sat, max),
1332+
PackSize::S8 | PackSize::S16 => fx.bcx.ins().smin(sat, max),
1333+
};
1334+
let res = fx.bcx.ins().ireduce(ret_size.ret_clif_type(), sat);
1335+
let res_lane = CValue::by_val(res, ret_lane_layout);
1336+
ret.place_lane(fx, dest_offset + idx).write_cvalue(fx, res_lane);
1337+
}
1338+
};
1339+
1340+
round(a, 0, 0);
1341+
round(b, 0, 1);
1342+
1343+
if let PackWidth::Avx = width {
1344+
round(a, 1, 2);
1345+
round(b, 1, 3);
1346+
}
1347+
}

0 commit comments

Comments
 (0)