Skip to content

Commit 985b48f

Browse files
committed
[DAGCombiner] check uses more strictly on select-of-binop fold
There are 2 bugs here: 1. We were not checking uses of operand 2 (the false value of the select). 2. We were not checking for multiple uses of nodes that produce >1 result. Correcting those is enough to avoid the crash in the reduced test based on: https://llvm.org/PR51612 The additional use check on operand 0 (the condition value of the select) should not strictly be necessary because we are only replacing one use with another (whether it makes performance sense to do the transform with that pattern is not clear). But as noted in the TODO, changing that uncovers another bug. Note: there's at least one more bug here - we aren't propagating EVTs correctly, but I plan to fix that in another patch.
1 parent 2c062f2 commit 985b48f

File tree

3 files changed

+59
-10
lines changed

3 files changed

+59
-10
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22565,7 +22565,11 @@ SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
2256522565
if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
2256622566
return SDValue();
2256722567

22568-
if (!N->isOnlyUserOf(N0.getNode()) || !N->isOnlyUserOf(N1.getNode()))
22568+
// The use checks are intentionally on SDNode because we may be dealing
22569+
// with opcodes that produce more than one SDValue.
22570+
// TODO: Do we really need to check N0 (the condition operand of the select)?
22571+
// But removing that clause could cause an infinite loop...
22572+
if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
2256922573
return SDValue();
2257022574

2257122575
// Fold select(cond, binop(x, y), binop(z, y))

llvm/test/CodeGen/AMDGPU/idiv-licm.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,10 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
128128
; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2
129129
; GFX9-NEXT: v_not_b32_e32 v2, v2
130130
; GFX9-NEXT: v_mul_lo_u32 v2, s4, v2
131-
; GFX9-NEXT: v_add_u32_e32 v4, s2, v3
132-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v4
133-
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
131+
; GFX9-NEXT: v_add_u32_e32 v3, s2, v3
132+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
134133
; GFX9-NEXT: v_add_u32_e32 v2, s2, v2
134+
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
135135
; GFX9-NEXT: s_add_u32 s2, s2, 1
136136
; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2
137137
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2
@@ -165,15 +165,15 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
165165
; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
166166
; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0
167167
; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2
168-
; GFX10-NEXT: v_mul_lo_u32 v3, s5, v2
169-
; GFX10-NEXT: v_not_b32_e32 v2, v2
170-
; GFX10-NEXT: v_mul_lo_u32 v2, s4, v2
171-
; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v3
172-
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4
173-
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
168+
; GFX10-NEXT: v_not_b32_e32 v3, v2
169+
; GFX10-NEXT: v_mul_lo_u32 v2, s5, v2
170+
; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3
174171
; GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2
172+
; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3
173+
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2
175174
; GFX10-NEXT: s_add_u32 s2, s2, 1
176175
; GFX10-NEXT: s_addc_u32 s3, s3, 0
176+
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
177177
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v2
178178
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2
179179
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo

llvm/test/CodeGen/X86/select.ll

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1544,3 +1544,48 @@ entry:
15441544
%1 = select i1 %cmp10, i32 %A, i32 %0
15451545
ret i32 %1
15461546
}
1547+
1548+
define i64 @PR51612(i64 %x, i64 %y) {
1549+
; CHECK-LABEL: PR51612:
1550+
; CHECK: ## %bb.0:
1551+
; CHECK-NEXT: movq %rdi, %rax
1552+
; CHECK-NEXT: incl %esi
1553+
; CHECK-NEXT: incq %rax
1554+
; CHECK-NEXT: cmovel %esi, %eax
1555+
; CHECK-NEXT: andl 10, %eax
1556+
; CHECK-NEXT: retq
1557+
;
1558+
; ATHLON-LABEL: PR51612:
1559+
; ATHLON: ## %bb.0:
1560+
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %eax
1561+
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %ecx
1562+
; ATHLON-NEXT: movl {{[0-9]+}}(%esp), %edx
1563+
; ATHLON-NEXT: incl %edx
1564+
; ATHLON-NEXT: addl $1, %eax
1565+
; ATHLON-NEXT: adcl $0, %ecx
1566+
; ATHLON-NEXT: cmovbl %edx, %eax
1567+
; ATHLON-NEXT: andl 10, %eax
1568+
; ATHLON-NEXT: xorl %edx, %edx
1569+
; ATHLON-NEXT: retl
1570+
;
1571+
; MCU-LABEL: PR51612:
1572+
; MCU: # %bb.0:
1573+
; MCU-NEXT: addl $1, %eax
1574+
; MCU-NEXT: adcl $0, %edx
1575+
; MCU-NEXT: jae .LBB31_2
1576+
; MCU-NEXT: # %bb.1:
1577+
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
1578+
; MCU-NEXT: incl %eax
1579+
; MCU-NEXT: .LBB31_2:
1580+
; MCU-NEXT: andl 10, %eax
1581+
; MCU-NEXT: xorl %edx, %edx
1582+
; MCU-NEXT: retl
1583+
%add = add i64 %x, 1
1584+
%inc = add i64 %y, 1
1585+
%tobool = icmp eq i64 %add, 0
1586+
%sel = select i1 %tobool, i64 %inc, i64 %add
1587+
%i = load i32, i32* inttoptr (i32 10 to i32*), align 4
1588+
%conv = zext i32 %i to i64
1589+
%and = and i64 %sel, %conv
1590+
ret i64 %and
1591+
}

0 commit comments

Comments
 (0)