[GlobalISel][X86] Add initial scalar G_MUL/G_SMULH/G_UMULH instruction selection handling

RKSimon · RKSimon · commit 8269fd2db50b · 2023-07-02T12:56:41.000+01:00
Reuse the existing div/rem selection code to also handle mul/imul, adding support for G_MUL/G_SMULH/G_UMULH: these follow a similar pattern, using rDX/rAX for the mulh/mul results, plus AH/AL for i8 multiplies.
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -114,8 +114,8 @@ class X86InstructionSelector : public InstructionSelector {
   bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
                      MachineFunction &MF) const;
   bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
-  bool selectDivRem(MachineInstr &I, MachineRegisterInfo &MRI,
-                    MachineFunction &MF) const;
+  bool selectMulDivRem(MachineInstr &I, MachineRegisterInfo &MRI,
+                       MachineFunction &MF) const;
   bool selectIntrinsicWSideEffects(MachineInstr &I, MachineRegisterInfo &MRI,
                                    MachineFunction &MF) const;
 
@@ -421,11 +421,14 @@ bool X86InstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_IMPLICIT_DEF:
   case TargetOpcode::G_PHI:
     return selectImplicitDefOrPHI(I, MRI);
+  case TargetOpcode::G_MUL:
+  case TargetOpcode::G_SMULH:
+  case TargetOpcode::G_UMULH:
   case TargetOpcode::G_SDIV:
   case TargetOpcode::G_UDIV:
   case TargetOpcode::G_SREM:
   case TargetOpcode::G_UREM:
-    return selectDivRem(I, MRI, MF);
+    return selectMulDivRem(I, MRI, MF);
   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
     return selectIntrinsicWSideEffects(I, MRI, MF);
   }
@@ -1558,11 +1561,14 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
   return true;
 }
 
-bool X86InstructionSelector::selectDivRem(MachineInstr &I,
-                                          MachineRegisterInfo &MRI,
-                                          MachineFunction &MF) const {
-  // The implementation of this function is taken from X86FastISel.
-  assert((I.getOpcode() == TargetOpcode::G_SDIV ||
+bool X86InstructionSelector::selectMulDivRem(MachineInstr &I,
+                                             MachineRegisterInfo &MRI,
+                                             MachineFunction &MF) const {
+  // The implementation of this function is adapted from X86FastISel.
+  assert((I.getOpcode() == TargetOpcode::G_MUL ||
+          I.getOpcode() == TargetOpcode::G_SMULH ||
+          I.getOpcode() == TargetOpcode::G_UMULH ||
+          I.getOpcode() == TargetOpcode::G_SDIV ||
           I.getOpcode() == TargetOpcode::G_SREM ||
           I.getOpcode() == TargetOpcode::G_UDIV ||
           I.getOpcode() == TargetOpcode::G_UREM) &&
@@ -1581,10 +1587,11 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
     return false;
 
   const static unsigned NumTypes = 4; // i8, i16, i32, i64
-  const static unsigned NumOps = 4;   // SDiv, SRem, UDiv, URem
+  const static unsigned NumOps = 7;   // SDiv/SRem/UDiv/URem/Mul/SMulH/UMulh
   const static bool S = true;         // IsSigned
   const static bool U = false;        // !IsSigned
   const static unsigned Copy = TargetOpcode::COPY;
+
   // For the X86 IDIV instruction, in most cases the dividend
   // (numerator) must be in a specific register pair highreg:lowreg,
   // producing the quotient in lowreg and the remainder in highreg.
@@ -1593,19 +1600,19 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   // exception is i8, where the dividend is defined as a single register rather
   // than a register pair, and we therefore directly sign-extend the dividend
   // into lowreg, instead of copying, and ignore the highreg.
-  const static struct DivRemEntry {
+  const static struct MulDivRemEntry {
     // The following portion depends only on the data type.
     unsigned SizeInBits;
     unsigned LowInReg;  // low part of the register pair
     unsigned HighInReg; // high part of the register pair
     // The following portion depends on both the data type and the operation.
-    struct DivRemResult {
-      unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
+    struct MulDivRemResult {
+      unsigned OpMulDivRem;     // The specific MUL/DIV opcode to use.
       unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
                                 // highreg, or copying a zero into highreg.
       unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
                                 // zero/sign-extending into lowreg for i8.
-      unsigned DivRemResultReg; // Register containing the desired result.
+      unsigned ResultReg;       // Register containing the desired result.
       bool IsOpSigned;          // Whether to use signed or unsigned form.
     } ResultTable[NumOps];
   } OpTable[NumTypes] = {
@@ -1617,25 +1624,34 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
            {X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S}, // SRem
            {X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U},  // UDiv
            {X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U},  // URem
+           {X86::IMUL8r, 0, X86::MOVSX16rr8, X86::AL, S}, // Mul
+           {X86::IMUL8r, 0, X86::MOVSX16rr8, X86::AH, S}, // SMulH
+           {X86::MUL8r, 0, X86::MOVZX16rr8, X86::AH, U},  // UMulH
        }},                                                // i8
       {16,
        X86::AX,
        X86::DX,
        {
-           {X86::IDIV16r, X86::CWD, Copy, X86::AX, S},    // SDiv
-           {X86::IDIV16r, X86::CWD, Copy, X86::DX, S},    // SRem
-           {X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U}, // UDiv
-           {X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U}, // URem
-       }},                                                // i16
+           {X86::IDIV16r, X86::CWD, Copy, X86::AX, S},     // SDiv
+           {X86::IDIV16r, X86::CWD, Copy, X86::DX, S},     // SRem
+           {X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U},  // UDiv
+           {X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U},  // URem
+           {X86::IMUL16r, X86::MOV32r0, Copy, X86::AX, S}, // Mul
+           {X86::IMUL16r, X86::MOV32r0, Copy, X86::DX, S}, // SMulH
+           {X86::MUL16r, X86::MOV32r0, Copy, X86::DX, U},  // UMulH
+       }},                                                 // i16
       {32,
        X86::EAX,
        X86::EDX,
        {
-           {X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S},    // SDiv
-           {X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S},    // SRem
-           {X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U}, // UDiv
-           {X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U}, // URem
-       }},                                                 // i32
+           {X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S},     // SDiv
+           {X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S},     // SRem
+           {X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U},  // UDiv
+           {X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U},  // URem
+           {X86::IMUL32r, X86::MOV32r0, Copy, X86::EAX, S}, // Mul
+           {X86::IMUL32r, X86::MOV32r0, Copy, X86::EDX, S}, // SMulH
+           {X86::MUL32r, X86::MOV32r0, Copy, X86::EDX, U},  // UMulH
+       }},                                                  // i32
       {64,
        X86::RAX,
        X86::RDX,
@@ -1644,10 +1660,13 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
            {X86::IDIV64r, X86::CQO, Copy, X86::RDX, S},    // SRem
            {X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U}, // UDiv
            {X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U}, // URem
-       }},                                                 // i64
+           {X86::IMUL64r, X86::MOV32r0, Copy, X86::RAX, S}, // Mul
+           {X86::IMUL64r, X86::MOV32r0, Copy, X86::RDX, S}, // SMulH
+           {X86::MUL64r, X86::MOV32r0, Copy, X86::RDX, U},  // UMulH
+       }},                                                  // i64
   };
 
-  auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const DivRemEntry &El) {
+  auto OpEntryIt = llvm::find_if(OpTable, [RegTy](const MulDivRemEntry &El) {
     return El.SizeInBits == RegTy.getSizeInBits();
   });
   if (OpEntryIt == std::end(OpTable))
@@ -1656,7 +1675,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   unsigned OpIndex;
   switch (I.getOpcode()) {
   default:
-    llvm_unreachable("Unexpected div/rem opcode");
+    llvm_unreachable("Unexpected mul/div/rem opcode");
   case TargetOpcode::G_SDIV:
     OpIndex = 0;
     break;
@@ -1669,10 +1688,20 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   case TargetOpcode::G_UREM:
     OpIndex = 3;
     break;
+  case TargetOpcode::G_MUL:
+    OpIndex = 4;
+    break;
+  case TargetOpcode::G_SMULH:
+    OpIndex = 5;
+    break;
+  case TargetOpcode::G_UMULH:
+    OpIndex = 6;
+    break;
   }
 
-  const DivRemEntry &TypeEntry = *OpEntryIt;
-  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
+  const MulDivRemEntry &TypeEntry = *OpEntryIt;
+  const MulDivRemEntry::MulDivRemResult &OpEntry =
+      TypeEntry.ResultTable[OpIndex];
 
   const TargetRegisterClass *RegRC = getRegClass(RegTy, *RegRB);
   if (!RBI.constrainGenericRegister(Op1Reg, *RegRC, MRI) ||
@@ -1687,6 +1716,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpCopy),
           TypeEntry.LowInReg)
       .addReg(Op1Reg);
+
   // Zero-extend or sign-extend into high-order input register.
   if (OpEntry.OpSignExtend) {
     if (OpEntry.IsOpSigned)
@@ -1717,9 +1747,11 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
       }
     }
   }
-  // Generate the DIV/IDIV instruction.
-  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpDivRem))
+
+  // Generate the DIV/IDIV/MUL/IMUL instruction.
+  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpEntry.OpMulDivRem))
       .addReg(Op2Reg);
+
   // For i8 remainder, we can't reference ah directly, as we'll end
   // up with bogus copies like %r9b = COPY %ah. Reference ax
   // instead to prevent ah references in a rex instruction.
@@ -1728,7 +1760,7 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   // won't generate explicit references to the GR8_NOREX registers. If
   // the allocator and/or the backend get enhanced to be more robust in
   // that regard, this can be, and should be, removed.
-  if (OpEntry.DivRemResultReg == X86::AH && STI.is64Bit()) {
+  if (OpEntry.ResultReg == X86::AH && STI.is64Bit()) {
     Register SourceSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
     Register ResultSuperReg = MRI.createVirtualRegister(&X86::GR16RegClass);
     BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Copy), SourceSuperReg)
@@ -1750,9 +1782,10 @@ bool X86InstructionSelector::selectDivRem(MachineInstr &I,
   } else {
     BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
             DstReg)
-        .addReg(OpEntry.DivRemResultReg);
+        .addReg(OpEntry.ResultReg);
   }
   I.eraseFromParent();
+
   return true;
 }
 
diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -196,6 +196,15 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .clampScalar(0, s8, sMaxScalar)
       .scalarize(0);
 
+  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
+      .legalIf([=](const LegalityQuery &Query) -> bool {
+        return typeInSet(0, {s8, s16, s32})(Query) ||
+               (Is64Bit && typeInSet(0, {s64})(Query));
+      })
+      .widenScalarToNextPow2(0, /*Min=*/32)
+      .clampScalar(0, s8, sMaxScalar)
+      .scalarize(0);
+
   // integer divisions
   getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UDIV, G_UREM})
       .legalIf([=](const LegalityQuery &Query) -> bool {
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir
@@ -1,10 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -O0 -mtriple=x86_64-linux-gnu -run-pass=legalizer %s -o - | FileCheck %s --check-prefixes=CHECK,X64
-# RUN: llc -O0 -mtriple=i386-linux-gnu  -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel*'  %s 2>%t -o - | FileCheck %s --check-prefixes=CHECK,X86
-# RUN: FileCheck -check-prefix=ERR32  %s < %t
-
-# ERR32: remark: <unknown>:0:0: unable to legalize instruction: %14:_(s32) = G_UMULH %7:_, %9:_ (in function: test_mul_i42)
-# ERR32: remark: <unknown>:0:0: unable to legalize instruction: %10:_(s32) = G_UMULH %3:_, %5:_ (in function: test_mul_i64)
+# RUN: llc -O0 -mtriple=i386-linux-gnu -run-pass=legalizer %s -o - | FileCheck %s --check-prefixes=CHECK,X86
 
 --- |
   define void @test_mul_i1() { ret void }
@@ -200,21 +196,16 @@ body:             |
     ; X86: liveins: $rdi, $rsi
     ; X86-NEXT: {{  $}}
     ; X86-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $rdx
-    ; X86-NEXT: [[TRUNC:%[0-9]+]]:_(s42) = G_TRUNC [[COPY]](s64)
-    ; X86-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s42)
-    ; X86-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC]](s42)
-    ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT]](s64)
-    ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ANYEXT1]](s64)
+    ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; X86-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
     ; X86-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
     ; X86-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
     ; X86-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
     ; X86-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
     ; X86-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
     ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
-    ; X86-NEXT: [[TRUNC1:%[0-9]+]]:_(s42) = G_TRUNC [[MV]](s64)
-    ; X86-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[TRUNC1]](s42)
-    ; X86-NEXT: $rax = COPY [[ANYEXT2]](s64)
+    ; X86-NEXT: $rax = COPY [[MV]](s64)
     ; X86-NEXT: RET 0
     %0(s64) = COPY $rdx
     %1(s42) = G_TRUNC %0(s64)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/mul-scalar.ll
@@ -1,40 +1,92 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i686-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X86
 
-;TODO: instruction selection not supported yet
-;define i8 @test_mul_i8(i8 %arg1, i8 %arg2) {
-;  %ret = mul i8 %arg1, %arg2
-;  ret i8 %ret
-;}
+define i8 @test_mul_i8(i8 %arg1, i8 %arg2) nounwind {
+; X64-LABEL: test_mul_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    movsbl %dil, %eax
+; X64-NEXT:    imulb %sil
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_mul_i8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cbtw
+; X86-NEXT:    imulb %cl
+; X86-NEXT:    retl
+  %ret = mul i8 %arg1, %arg2
+  ret i8 %ret
+}
 
-define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
+define i16 @test_mul_i16(i16 %arg1, i16 %arg2) nounwind {
 ; X64-LABEL: test_mul_i16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    imulw %di, %ax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
+;
+; X86-LABEL: test_mul_i16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imulw %cx, %ax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
   %ret = mul i16 %arg1, %arg2
   ret i16 %ret
 }
 
-define i32 @test_mul_i32(i32 %arg1, i32 %arg2) {
+define i32 @test_mul_i32(i32 %arg1, i32 %arg2) nounwind {
 ; X64-LABEL: test_mul_i32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    imull %edi, %eax
 ; X64-NEXT:    retq
+;
+; X86-LABEL: test_mul_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
   %ret = mul i32 %arg1, %arg2
   ret i32 %ret
 }
 
-define i64 @test_mul_i64(i64 %arg1, i64 %arg2) {
+define i64 @test_mul_i64(i64 %arg1, i64 %arg2) nounwind {
 ; X64-LABEL: test_mul_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    imulq %rdi, %rax
 ; X64-NEXT:    retq
+;
+; X86-LABEL: test_mul_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %edx, %edi
+; X86-NEXT:    mull %edx
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl
   %ret = mul i64 %arg1, %arg2
   ret i64 %ret
 }
 
+;TODO: instruction selection not supported yet
+;define i128 @test_mul_i128(i128 %arg1, i128 %arg2) nounwind {
+;  %ret = mul i128 %arg1, %arg2
+;  ret i128 %ret
+;}