[AVR] Remove faulty stack pushing behavior

aykevl · dylanmckay · commit cfbe205a7e88 · 2020-06-24T01:24:15.000+12:00
An instruction like this will need to allocate some stack space for the last parameter: %x = call addrspace(1) i16 @bar(i64 undef, i64 undef, i16 undef, i16 0) This worked fine when passing an actual value (in this case 0). However, when passing undef, no value was pushed to the stack and therefore no push instructions were created. This caused an unbalanced stack leading to interesting results. This commit fixes that by replacing the push logic with a regular stack adjustment and stack-relative load/stores. This is less efficient but at least it correctly compiles the code. I can think of a few improvements in the future: * The stack should have been adjusted in the function prologue when there are no allocas in the function. * Many (if not most) stack adjustments can be replaced by pushing/popping the values directly. Exactly like the previous code attempted but didn't do correctly. * Small stack adjustments can be done more efficiently with a few push/pop instructions (pushing/popping bogus values), both for code size and for speed. All in all, as long as there are no allocas in the function I think that it is almost always more efficient to emit regular push/pop instructions. This is however left for future optimizations. Differential Revision: https://reviews.llvm.org/D78581
diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -285,15 +285,10 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters(
 }
 
 /// Replace pseudo store instructions that pass arguments through the stack with
-/// real instructions. If insertPushes is true then all instructions are
-/// replaced with push instructions, otherwise regular std instructions are
-/// inserted.
+/// real instructions.
 static void fixStackStores(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
-                           const TargetInstrInfo &TII, bool insertPushes) {
-  const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
-  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-
+                           const TargetInstrInfo &TII, Register FP) {
   // Iterate through the BB until we hit a call instruction or we reach the end.
   for (auto I = MI, E = MBB.end(); I != E && !I->isCall();) {
     MachineBasicBlock::iterator NextMI = std::next(I);
@@ -308,37 +303,14 @@ static void fixStackStores(MachineBasicBlock &MBB,
 
     assert(MI.getOperand(0).getReg() == AVR::SP &&
            "Invalid register, should be SP!");
-    if (insertPushes) {
-      // Replace this instruction with a push.
-      Register SrcReg = MI.getOperand(2).getReg();
-      bool SrcIsKill = MI.getOperand(2).isKill();
-
-      // We can't use PUSHWRr here because when expanded the order of the new
-      // instructions are reversed from what we need. Perform the expansion now.
-      if (Opcode == AVR::STDWSPQRr) {
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(TRI.getSubReg(SrcReg, AVR::sub_hi),
-                    getKillRegState(SrcIsKill));
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(TRI.getSubReg(SrcReg, AVR::sub_lo),
-                    getKillRegState(SrcIsKill));
-      } else {
-        BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
-            .addReg(SrcReg, getKillRegState(SrcIsKill));
-      }
-
-      MI.eraseFromParent();
-      I = NextMI;
-      continue;
-    }
 
     // Replace this instruction with a regular store. Use Y as the base
     // pointer since it is guaranteed to contain a copy of SP.
     unsigned STOpc =
         (Opcode == AVR::STDWSPQRr) ? AVR::STDWPtrQRr : AVR::STDPtrQRr;
 
     MI.setDesc(TII.get(STOpc));
-    MI.getOperand(0).setReg(AVR::R29R28);
+    MI.getOperand(0).setReg(FP);
 
     I = NextMI;
   }
@@ -354,26 +326,45 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
   // function entry. Delete the call frame pseudo and replace all pseudo stores
   // with real store instructions.
   if (hasReservedCallFrame(MF)) {
-    fixStackStores(MBB, MI, TII, false);
+    fixStackStores(MBB, MI, TII, AVR::R29R28);
     return MBB.erase(MI);
   }
 
   DebugLoc DL = MI->getDebugLoc();
   unsigned int Opcode = MI->getOpcode();
   int Amount = TII.getFrameSize(*MI);
 
-  // Adjcallstackup does not need to allocate stack space for the call, instead
-  // we insert push instructions that will allocate the necessary stack.
-  // For adjcallstackdown we convert it into an 'adiw reg, <amt>' handling
-  // the read and write of SP in I/O space.
+  // ADJCALLSTACKUP and ADJCALLSTACKDOWN are converted to adiw/subi
+  // instructions to read and write the stack pointer in I/O space.
   if (Amount != 0) {
     assert(getStackAlignment() == 1 && "Unsupported stack alignment");
 
     if (Opcode == TII.getCallFrameSetupOpcode()) {
-      fixStackStores(MBB, MI, TII, true);
+      // Update the stack pointer.
+      // In many cases this can be done far more efficiently by pushing the
+      // relevant values directly to the stack. However, doing that correctly
+      // (in the right order, possibly skipping some empty space for undef
+      // values, etc) is tricky and thus left to be optimized in the future.
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP);
+
+      MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30)
+                              .addReg(AVR::R31R30, RegState::Kill)
+                              .addImm(Amount);
+      New->getOperand(3).setIsDead();
+
+      BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+          .addReg(AVR::R31R30, RegState::Kill);
+
+      // Make sure the remaining stack stores are converted to real store
+      // instructions.
+      fixStackStores(MBB, MI, TII, AVR::R31R30);
     } else {
       assert(Opcode == TII.getCallFrameDestroyOpcode());
 
+      // Note that small stack changes could be implemented more efficiently
+      // with a few pop instructions instead of the 8-9 instructions now
+      // required.
+
       // Select the best opcode to adjust SP based on the offset size.
       unsigned addOpcode;
       if (isUInt<6>(Amount)) {
diff --git a/llvm/test/CodeGen/AVR/call.ll b/llvm/test/CodeGen/AVR/call.ll
@@ -32,8 +32,8 @@ define i8 @calli8_stack() {
 ; CHECK-LABEL: calli8_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 10
 ; CHECK: ldi [[REG2:r[0-9]+]], 11
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call foo8_3
     %result1 = call i8 @foo8_3(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11)
     ret i8 %result1
@@ -54,12 +54,12 @@ define i16 @calli16_stack() {
 ; CHECK-LABEL: calli16_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 9
 ; CHECK: ldi [[REG2:r[0-9]+]], 2 
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 10
 ; CHECK: ldi [[REG2:r[0-9]+]], 2
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: call foo16_2
     %result1 = call i16 @foo16_2(i16 512, i16 513, i16 514, i16 515, i16 516, i16 517, i16 518, i16 519, i16 520, i16 521, i16 522)
     ret i16 %result1
@@ -84,12 +84,12 @@ define i32 @calli32_stack() {
 ; CHECK-LABEL: calli32_stack:
 ; CHECK: ldi [[REG1:r[0-9]+]], 64
 ; CHECK: ldi [[REG2:r[0-9]+]], 66
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 15
 ; CHECK: ldi [[REG2:r[0-9]+]], 2
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: call foo32_2
     %result1 = call i32 @foo32_2(i32 1, i32 2, i32 3, i32 4, i32 34554432)
     ret i32 %result1
@@ -115,20 +115,20 @@ define i64 @calli64_stack() {
 
 ; CHECK: ldi [[REG1:r[0-9]+]], 76
 ; CHECK: ldi [[REG2:r[0-9]+]], 73
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 31
 ; CHECK: ldi [[REG2:r[0-9]+]], 242
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+7, [[REG1]]
+; CHECK: std Z+8, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 155
 ; CHECK: ldi [[REG2:r[0-9]+]], 88
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 255
 ; CHECK: ldi [[REG2:r[0-9]+]], 255
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call foo64_2
     %result1 = call i64 @foo64_2(i64 1, i64 2, i64 17446744073709551615)
     ret i64 %result1
diff --git a/llvm/test/CodeGen/AVR/dynalloca.ll b/llvm/test/CodeGen/AVR/dynalloca.ll
@@ -53,9 +53,27 @@ define void @dynalloca2(i16 %x) {
 ; CHECK-LABEL: dynalloca2:
 ; CHECK: in [[SPCOPY1:r[0-9]+]], 61
 ; CHECK: in [[SPCOPY2:r[0-9]+]], 62
-; CHECK: push
-; CHECK-NOT: st
-; CHECK-NOT: std
+; Allocate stack space for call
+; CHECK: in {{.*}}, 61
+; CHECK: in {{.*}}, 62
+; CHECK: subi
+; CHECK: sbci
+; CHECK: in r0, 63
+; CHECK-NEXT: cli
+; CHECK-NEXT: out 62, {{.*}}
+; CHECK-NEXT: out 63, r0
+; CHECK-NEXT: out 61, {{.*}}
+; Store values on the stack
+; CHECK: ldi r16, 0
+; CHECK: ldi r17, 0
+; CHECK: std Z+5, r16
+; CHECK: std Z+6, r17
+; CHECK: std Z+7, r16
+; CHECK: std Z+8, r17
+; CHECK: std Z+3, r16
+; CHECK: std Z+4, r17
+; CHECK: std Z+1, r16
+; CHECK: std Z+2, r17
 ; CHECK: call
 ; Call frame restore
 ; CHECK-NEXT: in r30, 61
diff --git a/llvm/test/CodeGen/AVR/varargs.ll b/llvm/test/CodeGen/AVR/varargs.ll
@@ -42,16 +42,16 @@ define void @varargcall() {
 ; CHECK-LABEL: varargcall:
 ; CHECK: ldi [[REG1:r[0-9]+]], 189
 ; CHECK: ldi [[REG2:r[0-9]+]], 205
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+3, [[REG1]]
+; CHECK: std Z+4, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 191
 ; CHECK: ldi [[REG2:r[0-9]+]], 223
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+5, [[REG1]]
+; CHECK: std Z+6, [[REG2]]
 ; CHECK: ldi [[REG1:r[0-9]+]], 205
 ; CHECK: ldi [[REG2:r[0-9]+]], 171
-; CHECK: push [[REG2]]
-; CHECK: push [[REG1]]
+; CHECK: std Z+1, [[REG1]]
+; CHECK: std Z+2, [[REG2]]
 ; CHECK: call
 ; CHECK: adiw r30, 6
   tail call void (i16, ...) @var1223(i16 -21555, i16 -12867, i16 -8257)