Skip to content

Commit 93deac2

Browse files
red1bluelostdavemgreen
authored and committed
[AArch64] Optimize add/sub with immediate through MIPeepholeOpt
Fixes the build issue with D111034, whose goal was to optimize add/sub with long immediates. Optimize ([add|sub] r, imm) -> ([ADD|SUB] ([ADD|SUB] r, #imm0, lsl rust-lang#12), #imm1), if imm == (imm0<<12)+imm1. and both imm0 and imm1 are non-zero 12-bit unsigned integers. Optimize ([add|sub] r, imm) -> ([SUB|ADD] ([SUB|ADD] r, #imm0, lsl rust-lang#12), #imm1), if imm == -(imm0<<12)-imm1, and both imm0 and imm1 are non-zero 12-bit unsigned integers. The change which fixed the build issue in D111034 was the use of new virtual registers so that SSA form is maintained until deleting MI. Differential Revision: https://reviews.llvm.org/D117429
1 parent 4041354 commit 93deac2

File tree

4 files changed

+302
-66
lines changed

4 files changed

+302
-66
lines changed

llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp

+171-33
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,19 @@
1111
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
1212
// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
1313
//
14+
// 2. MOVi32imm + ADDWrr ==> ADDWRi + ADDWRi
15+
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
16+
//
17+
// 3. MOVi32imm + SUBWrr ==> SUBWRi + SUBWRi
18+
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
19+
//
1420
// The mov pseudo instruction could be expanded to multiple mov instructions
1521
// later. In this case, we could try to split the constant operand of mov
16-
// instruction into two bitmask immediates. It makes two AND instructions
17-
// intead of multiple `mov` + `and` instructions.
22+
// instruction into two immediates which can be directly encoded into
23+
// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
24+
// multiple `mov` + `and/add/sub` instructions.
1825
//
19-
// 2. Remove redundant ORRWrs which is generated by zero-extend.
26+
// 4. Remove redundant ORRWrs which is generated by zero-extend.
2027
//
2128
// %3:gpr32 = ORRWrs $wzr, %2, 0
2229
// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
@@ -51,6 +58,12 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
5158
MachineLoopInfo *MLI;
5259
MachineRegisterInfo *MRI;
5360

61+
bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
62+
MachineInstr *&SubregToRegMI);
63+
64+
template <typename T>
65+
bool visitADDSUB(MachineInstr &MI,
66+
SmallSetVector<MachineInstr *, 8> &ToBeRemoved, bool IsAdd);
5467
template <typename T>
5568
bool visitAND(MachineInstr &MI,
5669
SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
@@ -131,36 +144,9 @@ bool AArch64MIPeepholeOpt::visitAND(
131144
assert((RegSize == 32 || RegSize == 64) &&
132145
"Invalid RegSize for AND bitmask peephole optimization");
133146

134-
// Check whether AND's MBB is in loop and the AND is loop invariant.
135-
MachineBasicBlock *MBB = MI.getParent();
136-
MachineLoop *L = MLI->getLoopFor(MBB);
137-
if (L && !L->isLoopInvariant(MI))
138-
return false;
139-
140-
// Check whether AND's operand is MOV with immediate.
141-
MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
142-
if (!MovMI)
143-
return false;
144-
145-
MachineInstr *SubregToRegMI = nullptr;
146-
// If it is SUBREG_TO_REG, check its operand.
147-
if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
148-
SubregToRegMI = MovMI;
149-
MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
150-
if (!MovMI)
151-
return false;
152-
}
153-
154-
if (MovMI->getOpcode() != AArch64::MOVi32imm &&
155-
MovMI->getOpcode() != AArch64::MOVi64imm)
156-
return false;
157-
158-
// If the MOV has multiple uses, do not split the immediate because it causes
159-
// more instructions.
160-
if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
161-
return false;
162-
163-
if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
147+
// Perform several essential checks against current MI.
148+
MachineInstr *MovMI = nullptr, *SubregToRegMI = nullptr;
149+
if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
164150
return false;
165151

166152
// Split the bitmask immediate into two.
@@ -177,6 +163,7 @@ bool AArch64MIPeepholeOpt::visitAND(
177163

178164
// Create new AND MIs.
179165
DebugLoc DL = MI.getDebugLoc();
166+
MachineBasicBlock *MBB = MI.getParent();
180167
const TargetRegisterClass *ANDImmRC =
181168
(RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
182169
Register DstReg = MI.getOperand(0).getReg();
@@ -251,6 +238,145 @@ bool AArch64MIPeepholeOpt::visitORR(
251238
return true;
252239
}
253240

241+
template <typename T>
242+
static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
243+
// The immediate must be in the form of ((imm0 << 12) + imm1), in which both
244+
// imm0 and imm1 are non-zero 12-bit unsigned int.
245+
if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
246+
(Imm & ~static_cast<T>(0xffffff)) != 0)
247+
return false;
248+
249+
// The immediate can not be composed via a single instruction.
250+
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
251+
AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
252+
if (Insn.size() == 1)
253+
return false;
254+
255+
// Split Imm into (Imm0 << 12) + Imm1;
256+
Imm0 = (Imm >> 12) & 0xfff;
257+
Imm1 = Imm & 0xfff;
258+
return true;
259+
}
260+
261+
template <typename T>
262+
bool AArch64MIPeepholeOpt::visitADDSUB(
263+
MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
264+
bool IsAdd) {
265+
// Try below transformation.
266+
//
267+
// MOVi32imm + ADDWrr ==> ADDWri + ADDWri
268+
// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
269+
//
270+
// MOVi32imm + SUBWrr ==> SUBWri + SUBWri
271+
// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
272+
//
273+
// The mov pseudo instruction could be expanded to multiple mov instructions
274+
// later. Let's try to split the constant operand of mov instruction into two
275+
// legal add/sub immediates. It makes only two ADD/SUB instructions instead of
276+
// multiple `mov` + `add/sub` instructions.
277+
278+
unsigned RegSize = sizeof(T) * 8;
279+
assert((RegSize == 32 || RegSize == 64) &&
280+
"Invalid RegSize for legal add/sub immediate peephole optimization");
281+
282+
// Perform several essential checks against current MI.
283+
MachineInstr *MovMI, *SubregToRegMI;
284+
if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
285+
return false;
286+
287+
// Split the immediate to Imm0 and Imm1, and calculate the Opcode.
288+
T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
289+
unsigned Opcode;
290+
if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) {
291+
if (IsAdd)
292+
Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
293+
else
294+
Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
295+
} else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) {
296+
if (IsAdd)
297+
Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
298+
else
299+
Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
300+
} else {
301+
return false;
302+
}
303+
304+
// Create new ADD/SUB MIs.
305+
DebugLoc DL = MI.getDebugLoc();
306+
MachineBasicBlock *MBB = MI.getParent();
307+
const TargetRegisterClass *RC =
308+
(RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
309+
Register DstReg = MI.getOperand(0).getReg();
310+
Register SrcReg = MI.getOperand(1).getReg();
311+
Register NewTmpReg = MRI->createVirtualRegister(RC);
312+
Register NewDstReg = MRI->createVirtualRegister(RC);
313+
314+
MRI->constrainRegClass(SrcReg, RC);
315+
BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
316+
.addReg(SrcReg)
317+
.addImm(Imm0)
318+
.addImm(12);
319+
320+
MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
321+
BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
322+
.addReg(NewTmpReg)
323+
.addImm(Imm1)
324+
.addImm(0);
325+
326+
MRI->replaceRegWith(DstReg, NewDstReg);
327+
// replaceRegWith changes MI's definition register. Keep it for SSA form until
328+
// deleting MI.
329+
MI.getOperand(0).setReg(DstReg);
330+
331+
// Record the MIs need to be removed.
332+
ToBeRemoved.insert(&MI);
333+
if (SubregToRegMI)
334+
ToBeRemoved.insert(SubregToRegMI);
335+
ToBeRemoved.insert(MovMI);
336+
337+
return true;
338+
}
339+
340+
// Checks if the corresponding MOV immediate instruction is applicable for
341+
// this peephole optimization.
342+
bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
343+
MachineInstr *&MovMI,
344+
MachineInstr *&SubregToRegMI) {
345+
// Check whether current MBB is in loop and the AND is loop invariant.
346+
MachineBasicBlock *MBB = MI.getParent();
347+
MachineLoop *L = MLI->getLoopFor(MBB);
348+
if (L && !L->isLoopInvariant(MI))
349+
return false;
350+
351+
// Check whether current MI's operand is MOV with immediate.
352+
MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
353+
if (!MovMI)
354+
return false;
355+
356+
// If it is SUBREG_TO_REG, check its operand.
357+
SubregToRegMI = nullptr;
358+
if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
359+
SubregToRegMI = MovMI;
360+
MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
361+
if (!MovMI)
362+
return false;
363+
}
364+
365+
if (MovMI->getOpcode() != AArch64::MOVi32imm &&
366+
MovMI->getOpcode() != AArch64::MOVi64imm)
367+
return false;
368+
369+
// If the MOV has multiple uses, do not split the immediate because it causes
370+
// more instructions.
371+
if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
372+
return false;
373+
if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
374+
return false;
375+
376+
// It is OK to perform this peephole optimization.
377+
return true;
378+
}
379+
254380
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
255381
if (skipFunction(MF.getFunction()))
256382
return false;
@@ -278,6 +404,18 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
278404
case AArch64::ORRWrs:
279405
Changed = visitORR(MI, ToBeRemoved);
280406
break;
407+
case AArch64::ADDWrr:
408+
Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, true);
409+
break;
410+
case AArch64::SUBWrr:
411+
Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, false);
412+
break;
413+
case AArch64::ADDXrr:
414+
Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, true);
415+
break;
416+
case AArch64::SUBXrr:
417+
Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, false);
418+
break;
281419
}
282420
}
283421
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s
3+
4+
# Main intention is to verify machine instructions have valid register classes.
5+
# Use of UBFM[W|X]ri is used as an arbitrary instruction that requires GPR[32|64]RegClass.
6+
# If the ADD/SUB optimization generates invalid register classes, this test will fail.
7+
---
8+
name: addi
9+
body: |
10+
bb.0.entry:
11+
liveins: $w0
12+
; CHECK-LABEL: name: addi
13+
; CHECK: [[COPY:%[0-9]+]]:gpr32common = COPY $w0
14+
; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[COPY]], 273, 12
15+
; CHECK-NEXT: [[ADDWri1:%[0-9]+]]:gpr32common = ADDWri [[ADDWri]], 3549, 0
16+
; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[ADDWri1]], 28, 31
17+
; CHECK-NEXT: $w0 = COPY [[UBFMWri]]
18+
; CHECK-NEXT: RET_ReallyLR implicit $w0
19+
%0:gpr32 = COPY $w0
20+
%1:gpr32 = MOVi32imm 1121757
21+
%2:gpr32 = ADDWrr %0, %1
22+
%3:gpr32 = UBFMWri %2, 28, 31
23+
$w0 = COPY %3
24+
RET_ReallyLR implicit $w0
25+
...
26+
---
27+
name: addl
28+
body: |
29+
bb.0.entry:
30+
liveins: $x0
31+
; CHECK-LABEL: name: addl
32+
; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
33+
; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri [[COPY]], 273, 12
34+
; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64common = ADDXri [[ADDXri]], 3549, 0
35+
; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[ADDXri1]], 28, 31
36+
; CHECK-NEXT: $x0 = COPY [[UBFMXri]]
37+
; CHECK-NEXT: RET_ReallyLR implicit $x0
38+
%0:gpr64 = COPY $x0
39+
%1:gpr32 = MOVi32imm 1121757
40+
%2:gpr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32
41+
%3:gpr64 = ADDXrr %0, killed %2
42+
%4:gpr64 = UBFMXri %3, 28, 31
43+
$x0 = COPY %4
44+
RET_ReallyLR implicit $x0
45+
...
46+
---
47+
name: addl_negate
48+
body: |
49+
bb.0.entry:
50+
liveins: $x0
51+
; CHECK-LABEL: name: addl_negate
52+
; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
53+
; CHECK-NEXT: [[SUBXri:%[0-9]+]]:gpr64sp = SUBXri [[COPY]], 273, 12
54+
; CHECK-NEXT: [[SUBXri1:%[0-9]+]]:gpr64common = SUBXri [[SUBXri]], 3549, 0
55+
; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[SUBXri1]], 28, 31
56+
; CHECK-NEXT: $x0 = COPY [[UBFMXri]]
57+
; CHECK-NEXT: RET_ReallyLR implicit $x0
58+
%0:gpr64 = COPY $x0
59+
%1:gpr64 = MOVi64imm -1121757
60+
%2:gpr64 = ADDXrr %0, killed %1
61+
%3:gpr64 = UBFMXri %2, 28, 31
62+
$x0 = COPY %3
63+
RET_ReallyLR implicit $x0

0 commit comments

Comments
 (0)