[DAGCombiner] check uses more strictly on select-of-binop fold

rotateright · rotateright · commit 985b48f18341 · 2021-08-25T14:14:41.000-04:00
There are 2 bugs here:
1. We were not checking uses of operand 2 (the false value of the select).
2. We were not checking for multiple uses of nodes that produce >1 result.

Correcting those is enough to avoid the crash in the reduced test based on:
https://llvm.org/PR51612

The additional use check on operand 0 (the condition value of the select) should not strictly be necessary because we are only replacing one use with another (whether it makes performance sense to do the transform with that pattern is not clear). But as noted in the TODO, changing that uncovers another bug.

Note: there's at least one more bug here - we aren't propagating EVTs correctly, but I plan to fix that in another patch.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22565,7 +22565,11 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
   if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
     return SDValue();
 
-  if (!N->isOnlyUserOf(N0.getNode()) || !N->isOnlyUserOf(N1.getNode()))
+  // The use checks are intentionally on SDNode because we may be dealing
+  // with opcodes that produce more than one SDValue.
+  // TODO: Do we really need to check N0 (the condition operand of the select)?
+  //       But removing that clause could cause an infinite loop...
+  if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
     return SDValue();
 
   // Fold select(cond, binop(x, y), binop(z, y))
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -128,10 +128,10 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, s2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
@@ -165,15 +165,15 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s5, v2
-; GFX10-NEXT:    v_not_b32_e32 v2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX10-NEXT:    v_not_b32_e32 v3, v2
+; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, s2, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
 ; GFX10-NEXT:    s_add_u32 s2, s2, 1
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s4, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
@@ -1544,3 +1544,48 @@ entry:
  %1 = select i1 %cmp10, i32 %A, i32 %0
  ret i32 %1
 }
+
+define i64 @PR51612(i64 %x, i64 %y) {
+; CHECK-LABEL: PR51612:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    incl %esi
+; CHECK-NEXT:    incq %rax
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    andl 10, %eax
+; CHECK-NEXT:    retq
+;
+; ATHLON-LABEL: PR51612:
+; ATHLON:       ## %bb.0:
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; ATHLON-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; ATHLON-NEXT:    incl %edx
+; ATHLON-NEXT:    addl $1, %eax
+; ATHLON-NEXT:    adcl $0, %ecx
+; ATHLON-NEXT:    cmovbl %edx, %eax
+; ATHLON-NEXT:    andl 10, %eax
+; ATHLON-NEXT:    xorl %edx, %edx
+; ATHLON-NEXT:    retl
+;
+; MCU-LABEL: PR51612:
+; MCU:       # %bb.0:
+; MCU-NEXT:    addl $1, %eax
+; MCU-NEXT:    adcl $0, %edx
+; MCU-NEXT:    jae .LBB31_2
+; MCU-NEXT:  # %bb.1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    incl %eax
+; MCU-NEXT:  .LBB31_2:
+; MCU-NEXT:    andl 10, %eax
+; MCU-NEXT:    xorl %edx, %edx
+; MCU-NEXT:    retl
+  %add = add i64 %x, 1
+  %inc = add i64 %y, 1
+  %tobool = icmp eq i64 %add, 0
+  %sel = select i1 %tobool, i64 %inc, i64 %add
+  %i = load i32, i32* inttoptr (i32 10 to i32*), align 4
+  %conv = zext i32 %i to i64
+  %and = and i64 %sel, %conv
+  ret i64 %and
+}