Skip to content

Commit b8801ba

Browse files
committed
[AArch64] Common patterns between UMULL and int_aarch64_neon_umull
We have some duplicate patterns between the AArch64ISD::UMULL (/SMULL) nodes and the int_aarch64_neon_umull (/smull) intrinsics. They did not replicate all the patterns, though, leaving some gaps in codegen for instructions like umlal2. This unifies the patterns by converting all int_aarch64_neon_umull (/smull) intrinsics to UMULL (/SMULL) nodes and removing the duplicate patterns for the umull/smull intrinsics, so that all instructions go through the same tablegen patterns. This improves some of the longer-than-legal mla patterns, helping them replace ext with umlal2. Differential Revision: https://reviews.llvm.org/D119887
1 parent 4a01ec4 commit b8801ba

File tree

3 files changed

+48
-116
lines changed

3 files changed

+48
-116
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+7
Original file line numberDiff line numberDiff line change
@@ -15447,7 +15447,11 @@ static SDValue performIntrinsicCombine(SDNode *N,
1544715447
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
1544815448
N->getOperand(1), N->getOperand(2));
1544915449
case Intrinsic::aarch64_neon_smull:
15450+
return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
15451+
N->getOperand(1), N->getOperand(2));
1545015452
case Intrinsic::aarch64_neon_umull:
15453+
return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
15454+
N->getOperand(1), N->getOperand(2));
1545115455
case Intrinsic::aarch64_neon_pmull:
1545215456
case Intrinsic::aarch64_neon_sqdmull:
1545315457
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
@@ -18131,6 +18135,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
1813118135
return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
1813218136
case AArch64ISD::UADDV:
1813318137
return performUADDVCombine(N, DAG);
18138+
case AArch64ISD::SMULL:
18139+
case AArch64ISD::UMULL:
18140+
return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
1813418141
case ISD::INTRINSIC_VOID:
1813518142
case ISD::INTRINSIC_W_CHAIN:
1813618143
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {

llvm/lib/Target/AArch64/AArch64InstrInfo.td

+16-77
Original file line numberDiff line numberDiff line change
@@ -5105,10 +5105,10 @@ defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
51055105
defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
51065106
BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
51075107
defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
5108-
TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
5108+
TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
51095109
defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
5110-
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
5111-
defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
5110+
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
5111+
defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", AArch64smull>;
51125112
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
51135113
int_aarch64_neon_sqadd>;
51145114
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
@@ -5126,10 +5126,10 @@ defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
51265126
defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
51275127
BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>;
51285128
defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
5129-
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
5129+
TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
51305130
defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
5131-
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
5132-
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
5131+
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
5132+
defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>;
51335133
defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
51345134
BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>;
51355135
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
@@ -5164,74 +5164,15 @@ multiclass Neon_mul_acc_widen_patterns<SDPatternOperator opnode, SDPatternOperat
51645164
V64:$Rn, V64:$Rm)), dsub)>;
51655165
}
51665166

5167-
defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_umull,
5167+
defm : Neon_mul_acc_widen_patterns<add, AArch64umull,
51685168
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
5169-
defm : Neon_mul_acc_widen_patterns<add, int_aarch64_neon_smull,
5169+
defm : Neon_mul_acc_widen_patterns<add, AArch64smull,
51705170
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
5171-
defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_umull,
5171+
defm : Neon_mul_acc_widen_patterns<sub, AArch64umull,
51725172
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
5173-
defm : Neon_mul_acc_widen_patterns<sub, int_aarch64_neon_smull,
5173+
defm : Neon_mul_acc_widen_patterns<sub, AArch64smull,
51745174
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
51755175

5176-
// Additional patterns for SMULL and UMULL
5177-
multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
5178-
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
5179-
def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
5180-
(INST8B V64:$Rn, V64:$Rm)>;
5181-
def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
5182-
(INST4H V64:$Rn, V64:$Rm)>;
5183-
def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
5184-
(INST2S V64:$Rn, V64:$Rm)>;
5185-
}
5186-
5187-
defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
5188-
SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
5189-
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
5190-
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
5191-
5192-
// Patterns for smull2/umull2.
5193-
multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
5194-
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
5195-
def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
5196-
(extract_high_v16i8 V128:$Rm))),
5197-
(INST8B V128:$Rn, V128:$Rm)>;
5198-
def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
5199-
(extract_high_v8i16 V128:$Rm))),
5200-
(INST4H V128:$Rn, V128:$Rm)>;
5201-
def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
5202-
(extract_high_v4i32 V128:$Rm))),
5203-
(INST2S V128:$Rn, V128:$Rm)>;
5204-
}
5205-
5206-
defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
5207-
SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
5208-
defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
5209-
UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
5210-
5211-
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
5212-
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
5213-
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
5214-
def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
5215-
(INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
5216-
def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
5217-
(INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
5218-
def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
5219-
(INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
5220-
}
5221-
5222-
defm : Neon_mulacc_widen_patterns<
5223-
TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
5224-
SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
5225-
defm : Neon_mulacc_widen_patterns<
5226-
TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
5227-
UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
5228-
defm : Neon_mulacc_widen_patterns<
5229-
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
5230-
SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
5231-
defm : Neon_mulacc_widen_patterns<
5232-
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
5233-
UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
5234-
52355176
// Patterns for 64-bit pmull
52365177
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
52375178
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -6404,11 +6345,10 @@ defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
64046345

64056346
defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
64066347
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
6407-
TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
6348+
TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
64086349
defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
6409-
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
6410-
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
6411-
int_aarch64_neon_smull>;
6350+
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
6351+
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
64126352
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
64136353
int_aarch64_neon_sqadd>;
64146354
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
@@ -6419,11 +6359,10 @@ defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
64196359
int_aarch64_neon_sqrdmlsh>;
64206360
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
64216361
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
6422-
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
6362+
TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
64236363
defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
6424-
TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
6425-
defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
6426-
int_aarch64_neon_umull>;
6364+
TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>;
6365+
defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", AArch64umull>;
64276366

64286367
// A scalar sqdmull with the second operand being a vector lane can be
64296368
// handled directly with the indexed instruction encoding.

llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll

+25-39
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,10 @@ entry:
7171
define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
7272
; CHECK-LABEL: mla_i16:
7373
; CHECK: // %bb.0: // %entry
74-
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
75-
; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
74+
; CHECK-NEXT: umlal2 v3.8h, v0.16b, v1.16b
7675
; CHECK-NEXT: umlal v2.8h, v0.8b, v1.8b
77-
; CHECK-NEXT: umlal v3.8h, v4.8b, v5.8b
78-
; CHECK-NEXT: mov v0.16b, v2.16b
7976
; CHECK-NEXT: mov v1.16b, v3.16b
77+
; CHECK-NEXT: mov v0.16b, v2.16b
8078
; CHECK-NEXT: ret
8179
entry:
8280
%ea = zext <16 x i8> %a to <16 x i16>
@@ -91,18 +89,14 @@ define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
9189
; CHECK: // %bb.0: // %entry
9290
; CHECK-NEXT: ushll v6.8h, v0.8b, #0
9391
; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
94-
; CHECK-NEXT: ushll v7.8h, v1.8b, #0
95-
; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
96-
; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8
97-
; CHECK-NEXT: ext v17.16b, v0.16b, v0.16b, #8
98-
; CHECK-NEXT: ext v18.16b, v7.16b, v7.16b, #8
99-
; CHECK-NEXT: ext v19.16b, v1.16b, v1.16b, #8
100-
; CHECK-NEXT: umlal v4.4s, v0.4h, v1.4h
101-
; CHECK-NEXT: umlal v2.4s, v6.4h, v7.4h
102-
; CHECK-NEXT: umlal v3.4s, v16.4h, v18.4h
103-
; CHECK-NEXT: umlal v5.4s, v17.4h, v19.4h
104-
; CHECK-NEXT: mov v0.16b, v2.16b
92+
; CHECK-NEXT: ushll2 v7.8h, v1.16b, #0
93+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
94+
; CHECK-NEXT: umlal2 v5.4s, v0.8h, v7.8h
95+
; CHECK-NEXT: umlal2 v3.4s, v6.8h, v1.8h
96+
; CHECK-NEXT: umlal v2.4s, v6.4h, v1.4h
97+
; CHECK-NEXT: umlal v4.4s, v0.4h, v7.4h
10598
; CHECK-NEXT: mov v1.16b, v3.16b
99+
; CHECK-NEXT: mov v0.16b, v2.16b
106100
; CHECK-NEXT: mov v2.16b, v4.16b
107101
; CHECK-NEXT: mov v3.16b, v5.16b
108102
; CHECK-NEXT: ret
@@ -117,43 +111,35 @@ entry:
117111
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
118112
; CHECK-LABEL: mla_i64:
119113
; CHECK: // %bb.0: // %entry
114+
; CHECK-NEXT: mov v17.16b, v7.16b
115+
; CHECK-NEXT: mov v16.16b, v6.16b
116+
; CHECK-NEXT: ldp q6, q7, [sp]
120117
; CHECK-NEXT: ushll v18.8h, v0.8b, #0
121118
; CHECK-NEXT: ushll2 v0.8h, v0.16b, #0
122-
; CHECK-NEXT: ushll v25.8h, v1.8b, #0
119+
; CHECK-NEXT: ushll v21.8h, v1.8b, #0
123120
; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0
124121
; CHECK-NEXT: ushll v19.4s, v18.4h, #0
125122
; CHECK-NEXT: ushll v20.4s, v0.4h, #0
126123
; CHECK-NEXT: ushll2 v18.4s, v18.8h, #0
127-
; CHECK-NEXT: ushll v26.4s, v25.4h, #0
128-
; CHECK-NEXT: ushll v27.4s, v1.4h, #0
129-
; CHECK-NEXT: ushll2 v25.4s, v25.8h, #0
130-
; CHECK-NEXT: mov v16.16b, v7.16b
131-
; CHECK-NEXT: mov v17.16b, v6.16b
132-
; CHECK-NEXT: ldp q6, q7, [sp]
124+
; CHECK-NEXT: ushll v22.4s, v21.4h, #0
125+
; CHECK-NEXT: ushll v23.4s, v1.4h, #0
126+
; CHECK-NEXT: ushll2 v21.4s, v21.8h, #0
133127
; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0
134128
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
135-
; CHECK-NEXT: ext v21.16b, v19.16b, v19.16b, #8
136-
; CHECK-NEXT: ext v22.16b, v20.16b, v20.16b, #8
137-
; CHECK-NEXT: ext v23.16b, v18.16b, v18.16b, #8
138-
; CHECK-NEXT: ext v28.16b, v26.16b, v26.16b, #8
139-
; CHECK-NEXT: ext v29.16b, v27.16b, v27.16b, #8
140-
; CHECK-NEXT: ext v30.16b, v25.16b, v25.16b, #8
141-
; CHECK-NEXT: ext v24.16b, v0.16b, v0.16b, #8
142-
; CHECK-NEXT: ext v31.16b, v1.16b, v1.16b, #8
143-
; CHECK-NEXT: umlal v4.2d, v18.2s, v25.2s
144-
; CHECK-NEXT: umlal v17.2d, v20.2s, v27.2s
145-
; CHECK-NEXT: umlal v2.2d, v19.2s, v26.2s
146-
; CHECK-NEXT: umlal v3.2d, v21.2s, v28.2s
147-
; CHECK-NEXT: umlal v5.2d, v23.2s, v30.2s
148-
; CHECK-NEXT: umlal v16.2d, v22.2s, v29.2s
129+
; CHECK-NEXT: umlal2 v5.2d, v18.4s, v21.4s
130+
; CHECK-NEXT: umlal2 v17.2d, v20.4s, v23.4s
131+
; CHECK-NEXT: umlal2 v3.2d, v19.4s, v22.4s
132+
; CHECK-NEXT: umlal v2.2d, v19.2s, v22.2s
133+
; CHECK-NEXT: umlal v4.2d, v18.2s, v21.2s
134+
; CHECK-NEXT: umlal v16.2d, v20.2s, v23.2s
135+
; CHECK-NEXT: umlal2 v7.2d, v0.4s, v1.4s
149136
; CHECK-NEXT: umlal v6.2d, v0.2s, v1.2s
150-
; CHECK-NEXT: umlal v7.2d, v24.2s, v31.2s
151137
; CHECK-NEXT: mov v0.16b, v2.16b
152138
; CHECK-NEXT: mov v1.16b, v3.16b
153139
; CHECK-NEXT: mov v2.16b, v4.16b
154140
; CHECK-NEXT: mov v3.16b, v5.16b
155-
; CHECK-NEXT: mov v4.16b, v17.16b
156-
; CHECK-NEXT: mov v5.16b, v16.16b
141+
; CHECK-NEXT: mov v4.16b, v16.16b
142+
; CHECK-NEXT: mov v5.16b, v17.16b
157143
; CHECK-NEXT: ret
158144
entry:
159145
%ea = zext <16 x i8> %a to <16 x i64>

0 commit comments

Comments
 (0)