@@ -610,230 +610,56 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16&ig_expand=4903
            intrinsic_args!(fx, args => (a, b); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Sse);
+        }

-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
+        "llvm.x86.sse2.packsswb.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16&ig_expand=4848
+            intrinsic_args!(fx, args => (a, b); intrinsic);

-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Sse);
        }

        "llvm.x86.avx2.packuswb" => {
            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16&ig_expand=4906
            intrinsic_args!(fx, args => (a, b); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i16);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u8);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let zero = fx.bcx.ins().iconst(types::I16, 0);
-            let max_u8 = fx.bcx.ins().iconst(types::I16, 255);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u8);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, zero);
-                let sat = fx.bcx.ins().umin(sat, max_u8);
-                let res = fx.bcx.ins().ireduce(types::I8, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U8, PackWidth::Avx);
        }

-        "llvm.x86.sse2.packssdw.128" => {
-            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+        "llvm.x86.avx2.packsswb" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16&ig_expand=4851
            intrinsic_args!(fx, args => (a, b); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S8, PackWidth::Avx);
        }

        "llvm.x86.sse41.packusdw" => {
            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32&ig_expand=4912
            intrinsic_args!(fx, args => (a, b); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.u16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MIN));
-            let max_u16 = fx.bcx.ins().iconst(types::I32, i64::from(u16::MAX));
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.u16);
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Sse);
+        }

-            for idx in 0..lane_count {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
+        "llvm.x86.sse2.packssdw.128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32&ig_expand=4889
+            intrinsic_args!(fx, args => (a, b); intrinsic);

-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Sse);
+        }

-            for idx in 0..lane_count {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_u16);
-                let sat = fx.bcx.ins().smin(sat, max_u16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
+        "llvm.x86.avx2.packusdw" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32&ig_expand=4883
+            intrinsic_args!(fx, args => (a, b); intrinsic);

-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::U16, PackWidth::Avx);
        }

        "llvm.x86.avx2.packssdw" => {
            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32&ig_expand=4892
            intrinsic_args!(fx, args => (a, b); intrinsic);

-            assert_eq!(a.layout(), b.layout());
-            let layout = a.layout();
-
-            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
-            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
-            assert_eq!(lane_ty, fx.tcx.types.i32);
-            assert_eq!(ret_lane_ty, fx.tcx.types.i16);
-            assert_eq!(lane_count * 2, ret_lane_count);
-
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
-            let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 2 + idx).write_cvalue(fx, res_lane);
-            }
-
-            for idx in 0..lane_count / 2 {
-                let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().smin(sat, max_i16);
-                let res = fx.bcx.ins().ireduce(types::I16, sat);
-
-                let res_lane = CValue::by_val(res, ret_lane_layout);
-                ret.place_lane(fx, lane_count / 2 * 3 + idx).write_cvalue(fx, res_lane);
-            }
+            pack_instruction(fx, a, b, ret, PackSize::S16, PackWidth::Avx);
        }

        "llvm.x86.fma.vfmaddsub.ps"
@@ -1407,3 +1233,115 @@ fn llvm_add_sub<'tcx>(
    (cb_out, c)
}
+
+enum PackSize {
+    U8,
+    U16,
+    S8,
+    S16,
+}
+
+impl PackSize {
+    fn ret_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I8,
+            Self::U16 | Self::S16 => types::I16,
+        }
+    }
+    fn src_clif_type(&self) -> Type {
+        match self {
+            Self::U8 | Self::S8 => types::I16,
+            Self::U16 | Self::S16 => types::I32,
+        }
+    }
+    fn src_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 | Self::S8 => tcx.types.i16,
+            Self::U16 | Self::S16 => tcx.types.i32,
+        }
+    }
+    fn ret_ty<'tcx>(&self, tcx: TyCtxt<'tcx>) -> Ty<'tcx> {
+        match self {
+            Self::U8 => tcx.types.u8,
+            Self::S8 => tcx.types.i8,
+            Self::U16 => tcx.types.u16,
+            Self::S16 => tcx.types.i16,
+        }
+    }
+    fn max(&self) -> i64 {
+        match self {
+            Self::U8 => u8::MAX as u64 as i64,
+            Self::S8 => i8::MAX as u8 as u64 as i64,
+            Self::U16 => u16::MAX as u64 as i64,
+            Self::S16 => i16::MAX as u16 as u64 as i64,
+        }
+    }
+    fn min(&self) -> i64 {
+        match self {
+            Self::U8 | Self::U16 => 0,
+            Self::S8 => i16::from(i8::MIN) as u16 as i64,
+            Self::S16 => i32::from(i16::MIN) as u32 as i64,
+        }
+    }
+}
+
+enum PackWidth {
+    Sse = 1,
+    Avx = 2,
+}
+impl PackWidth {
+    fn divisor(&self) -> u64 {
+        match self {
+            Self::Sse => 1,
+            Self::Avx => 2,
+        }
+    }
+}
+
+/// Implement an x86 pack instruction with the intrinsic `_mm{,256}_pack{us,s}_epi{16,32}`.
+/// Validated for correctness against LLVM, see commit `c8f5d35508e062bd2d95e6c03429bfec831db6d3`.
+fn pack_instruction<'tcx>(
+    fx: &mut FunctionCx<'_, '_, 'tcx>,
+    a: CValue<'tcx>,
+    b: CValue<'tcx>,
+    ret: CPlace<'tcx>,
+    ret_size: PackSize,
+    width: PackWidth,
+) {
+    assert_eq!(a.layout(), b.layout());
+    let layout = a.layout();
+
+    let (src_lane_count, src_lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+    let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+    assert_eq!(src_lane_ty, ret_size.src_ty(fx.tcx));
+    assert_eq!(ret_lane_ty, ret_size.ret_ty(fx.tcx));
+    assert_eq!(src_lane_count * 2, ret_lane_count);
+
+    let min = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.min());
+    let max = fx.bcx.ins().iconst(ret_size.src_clif_type(), ret_size.max());
+    let ret_lane_layout = fx.layout_of(ret_size.ret_ty(fx.tcx));
+
+    let mut round = |source: CValue<'tcx>, source_offset: u64, dest_offset: u64| {
+        let step_amount = src_lane_count / width.divisor();
+        let dest_offset = step_amount * dest_offset;
+        for idx in 0..step_amount {
+            let lane = source.value_lane(fx, step_amount * source_offset + idx).load_scalar(fx);
+            let sat = fx.bcx.ins().smax(lane, min);
+            let sat = match ret_size {
+                PackSize::U8 | PackSize::U16 => fx.bcx.ins().umin(sat, max),
+                PackSize::S8 | PackSize::S16 => fx.bcx.ins().smin(sat, max),
+            };
+            let res = fx.bcx.ins().ireduce(ret_size.ret_clif_type(), sat);
+            let res_lane = CValue::by_val(res, ret_lane_layout);
+            ret.place_lane(fx, dest_offset + idx).write_cvalue(fx, res_lane);
+        }
+    };
+
+    round(a, 0, 0);
+    round(b, 0, 1);
+
+    if let PackWidth::Avx = width {
+        round(a, 1, 2);
+        round(b, 1, 3);
+    }
+}
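
For reference, here is a scalar model of what `pack_instruction` emits, written against plain arrays instead of Cranelift values. This is an illustrative sketch and not part of the diff: `packus_epi16_avx_model` and `saturate_i16_to_u8` are made-up names, and the constants correspond to the `PackSize::U8` / `PackWidth::Avx` case. Its main point is the lane order: the AVX call sequence interleaves the low and high 128-bit halves of `a` and `b` rather than simply concatenating the two packed sources.

// Illustrative scalar model only; these helpers do not exist in the codegen.

/// Unsigned saturation of an i16 lane to u8, matching smax(0) + umin(255) + ireduce.
fn saturate_i16_to_u8(x: i16) -> u8 {
    x.clamp(0, u8::MAX as i16) as u8
}

/// Models _mm256_packus_epi16: pack two i16x16 vectors into one u8x32 vector,
/// interleaving the 128-bit halves of `a` and `b` (PackSize::U8, PackWidth::Avx).
fn packus_epi16_avx_model(a: &[i16; 16], b: &[i16; 16]) -> [u8; 32] {
    let mut ret = [0u8; 32];
    // step_amount = src_lane_count / PackWidth::Avx.divisor() = 16 / 2
    let step = a.len() / 2;
    let mut round = |source: &[i16; 16], source_offset: usize, dest_offset: usize| {
        for idx in 0..step {
            ret[step * dest_offset + idx] =
                saturate_i16_to_u8(source[step * source_offset + idx]);
        }
    };
    // Same call sequence as pack_instruction for PackWidth::Avx.
    round(a, 0, 0); // low half of a  -> bytes 0..8
    round(b, 0, 1); // low half of b  -> bytes 8..16
    round(a, 1, 2); // high half of a -> bytes 16..24
    round(b, 1, 3); // high half of b -> bytes 24..32
    ret
}

fn main() {
    let a = [300i16; 16]; // above u8::MAX, saturates to 255
    let b = [-5i16; 16]; // below zero, saturates to 0
    let packed = packus_epi16_avx_model(&a, &b);
    assert_eq!(&packed[0..8], &[255u8; 8]);
    assert_eq!(&packed[8..16], &[0u8; 8]);
    assert_eq!(&packed[16..24], &[255u8; 8]);
    assert_eq!(&packed[24..32], &[0u8; 8]);
}

The SSE case is the same model with `step` equal to the full source lane count and only the first two `round` calls, so the result is simply packed `a` followed by packed `b`.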