@@ -105,8 +105,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
- ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
+ ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
@@ -126,12 +126,12 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
- ; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
- ; AVX-FAST: # %bb.0:
- ; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
- ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
- ; AVX-FAST-NEXT: retq
+ ; AVX1-FAST-LABEL: pair_sum_v4i32_v4i32:
+ ; AVX1-FAST: # %bb.0:
+ ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+ ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
+ ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+ ; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
@@ -147,6 +147,13 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
+ ;
+ ; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32:
+ ; AVX2-FAST: # %bb.0:
+ ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+ ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+ ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+ ; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
%7 = add <2 x i32> %5, %6
@@ -451,20 +458,20 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
- ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
- ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
+ ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1
+ ; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm4
+ ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm5
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
- ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
- ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
- ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
- ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
- ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
- ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
- ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
- ; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
- ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm2
- ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,3]
+ ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0]
+ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3]
+ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
+ ; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+ ; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
+ ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
@@ -993,8 +1000,8 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
- ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm1
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
+ ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,2]
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
@@ -1028,8 +1035,8 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
- ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
- ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
+ ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
+ ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-FAST-NEXT: retq
%5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
@@ -1105,39 +1112,20 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-SLOW-NEXT: retq
;
- ; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32:
- ; AVX1-FAST: # %bb.0:
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
- ; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
- ; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
- ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
- ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
- ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
- ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
- ; AVX1-FAST-NEXT: retq
- ;
- ; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32:
- ; AVX2-FAST: # %bb.0:
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
- ; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
- ; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
- ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
- ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
- ; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
- ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
- ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
- ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
- ; AVX2-FAST-NEXT: retq
+ ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
+ ; AVX-FAST: # %bb.0:
+ ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+ ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
+ ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+ ; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+ ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+ ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+ ; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+ ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
+ ; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+ ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+ ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+ ; AVX-FAST-NEXT: retq
%5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
%6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
%7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)