 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2-SLOW,AVX2-SLOW32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+slow-pmulld | FileCheck %s --check-prefixes=AVX2-SLOW,AVX2-SLOW64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
@@ -42,6 +44,18 @@ define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
 ; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8:
+; AVX2-SLOW32: # %bb.0:
+; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; AVX2-SLOW32-NEXT: retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8:
+; AVX2-SLOW64: # %bb.0:
+; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW64-NEXT: retq
+;
 ; AVX2-32-LABEL: test_mul_v4i32_v4i8:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -165,6 +179,18 @@ define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8:
+; AVX2-SLOW32: # %bb.0:
+; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX2-SLOW32-NEXT: retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8:
+; AVX2-SLOW64: # %bb.0:
+; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW64-NEXT: retq
+;
 ; AVX2-32-LABEL: test_mul_v8i32_v8i8:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -336,6 +362,22 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulhw %ymm1, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -427,6 +469,14 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
 ; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
+; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
+; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX-32-LABEL: test_mul_v4i32_v4i16:
 ; AVX-32: # %bb.0:
 ; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -516,6 +566,16 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
 ; SSE4-64-NEXT: movdqa %xmm2, %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
+; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX-32-LABEL: test_mul_v8i32_v8i16:
 ; AVX-32: # %bb.0:
 ; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -643,6 +703,21 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
 ; SSE4-64-NEXT: movdqa %xmm4, %xmm1
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
+; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i16:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -708,6 +783,18 @@ define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
 ; SSE4-64-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX2-SLOW32: # %bb.0:
+; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; AVX2-SLOW32-NEXT: retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i8_minsize:
+; AVX2-SLOW64: # %bb.0:
+; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW64-NEXT: retq
+;
 ; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -823,6 +910,18 @@ define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
 ; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX2-SLOW32: # %bb.0:
+; AVX2-SLOW32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW32-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; AVX2-SLOW32-NEXT: retl
+;
+; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
+; AVX2-SLOW64: # %bb.0:
+; AVX2-SLOW64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW64-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW64-NEXT: retq
+;
 ; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -974,6 +1073,16 @@ define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
 ; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v16i32_v16i8_minsize:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1059,6 +1168,13 @@ define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
 ; SSE4-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16_minsize:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
 ; AVX-32: # %bb.0:
 ; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1122,6 +1238,13 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
 ; SSE4-64-NEXT: movdqa %xmm2, %xmm0
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16_minsize:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
 ; AVX-32: # %bb.0:
 ; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1237,6 +1360,16 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
 ; SSE4-64-NEXT: movdqa %xmm4, %xmm1
 ; SSE4-64-NEXT: retq
 ;
+; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16_minsize:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: ret{{[l|q]}}
+;
 ; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
 ; AVX2-32: # %bb.0:
 ; AVX2-32-NEXT: vextracti128 $1, %ymm0, %xmm1
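
For reference, a minimal sketch of the kind of IR body these check prefixes exercise, inferred from the function signatures in the hunk headers and the 18778 multiplier in the CHECK lines; the actual test bodies are not shown in this diff, so this is an assumption, not a quote of the file:

define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
  ; Widen the i8 lanes to i32, then multiply by a constant splat of 18778.
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}

On +slow-pmulld targets the new AVX2-SLOW prefixes expect this pattern to lower to vpmovzxbd plus vpmaddwd (avoiding vpmulld), as shown in the added CHECK lines above.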