Commit 7746596

[SLP][X86] Add VBMI2 coverage for funnel shift tests
VBMI2 CPUs actually have vector funnel shift instruction support
1 parent 865104a
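
Background for the new coverage: AVX512VBMI2 adds vector funnel-shift instructions (VPSHLDV/VPSHRDV and friends), and the new znver4 RUN lines check that the SLP vectorizer forms vector @llvm.fshl/@llvm.fshr calls on such CPUs as well. A minimal sketch of the rotate form the new AVX512VBMI2 check lines look for (the %va/%amt/%rot names are illustrative, not taken from the generated checks):

; fshl with both value operands equal is a rotate-left by the per-lane amounts in %amt
%va  = load <8 x i64>, ptr @a64, align 8
%amt = load <8 x i64>, ptr @b64, align 8
%rot = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %va, <8 x i64> %va, <8 x i64> %amt)
store <8 x i64> %rot, ptr @d64, align 8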

4 files changed: +150 -0 lines changed

llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll

Lines changed: 42 additions & 0 deletions

@@ -5,6 +5,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -128,6 +129,13 @@ define void @fshl_v8i64() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
 ; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i64, ptr @a64, align 8
 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -249,6 +257,13 @@ define void @fshl_v16i32() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
 ; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -335,6 +350,13 @@ define void @fshl_v32i16() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
 ; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -504,6 +526,13 @@ define void @fshl_v64i8() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
 ; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -811,6 +840,13 @@ define void @fshl_v2i32() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
 ; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v2i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -863,6 +899,12 @@ define void @fshl_v2i32_uniformconst() {
 ; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
 ; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v2i32_uniformconst(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4

llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll

Lines changed: 33 additions & 0 deletions

@@ -5,6 +5,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -129,6 +130,14 @@ define void @fshl_v8i64() {
 ; AVX512-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
 ; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @c64, align 8
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i64, ptr @a64, align 8
 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -277,6 +286,14 @@ define void @fshl_v16i32() {
 ; AVX512-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
 ; AVX512-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @c32, align 4
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -405,6 +422,14 @@ define void @fshl_v32i16() {
 ; AVX512-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
 ; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr @c16, align 2
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -613,6 +638,14 @@ define void @fshl_v64i8() {
 ; AVX512-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
 ; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr @c8, align 1
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
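
In arith-fshl.ll above, the funnel shifts use two distinct value vectors, so the expected vectorized form also loads the shift amounts from the @c* arrays; a minimal sketch for the v8i64 case (value names again illustrative, not taken from the generated checks):

%hi  = load <8 x i64>, ptr @a64, align 8
%lo  = load <8 x i64>, ptr @b64, align 8
%amt = load <8 x i64>, ptr @c64, align 8
; per lane: concatenate %hi:%lo, shift left by %amt, keep the high half
%fsh = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %hi, <8 x i64> %lo, <8 x i64> %amt)
store <8 x i64> %fsh, ptr @d64, align 8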

llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll

Lines changed: 42 additions & 0 deletions

@@ -5,6 +5,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
 
 @a64 = common global [8 x i64] zeroinitializer, align 64
@@ -128,6 +129,13 @@ define void @fshr_v8i64() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
 ; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i64, ptr @a64, align 8
 %a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -249,6 +257,13 @@ define void @fshr_v16i32() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
 ; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -335,6 +350,13 @@ define void @fshr_v32i16() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
 ; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
 %a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -504,6 +526,13 @@ define void @fshr_v64i8() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
 ; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
 %a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -811,6 +840,13 @@ define void @fshr_v2i32() {
 ; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
 ; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v2i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -863,6 +899,12 @@ define void @fshr_v2i32_uniformconst() {
 ; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
 ; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
 ; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
 ;
 %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
 %a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
