Skip to content

Commit a266af7

Browse files
committed
[InstCombine] Canonicalize SPF to min/max intrinsics
Now that integer min/max intrinsics have good support in both InstCombine and other passes, start canonicalizing SPF min/max to intrinsic min/max. Once this sticks, we can stop matching SPF min/max in various places, and can remove the hacks we have for preventing infinite loops and breaking of SPF canonicalization.

Differential Revision: https://reviews.llvm.org/D98152
1 parent aa551ad commit a266af7

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

62 files changed

+1708
-2167
lines changed

clang/test/CodeGen/builtins-wasm.c

+24-36
Original file line numberDiff line numberDiff line change
@@ -262,86 +262,74 @@ i64x2 abs_i64x2(i64x2 v) {
262262

263263
i8x16 min_s_i8x16(i8x16 x, i8x16 y) {
264264
return __builtin_wasm_min_s_i8x16(x, y);
265-
// WEBASSEMBLY: %0 = icmp slt <16 x i8> %x, %y
266-
// WEBASSEMBLY-NEXT: %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> %y
267-
// WEBASSEMBLY-NEXT: ret <16 x i8> %1
265+
// WEBASSEMBLY: call <16 x i8> @llvm.smin.v16i8(<16 x i8> %x, <16 x i8> %y)
266+
// WEBASSEMBLY-NEXT: ret
268267
}
269268

270269
u8x16 min_u_i8x16(u8x16 x, u8x16 y) {
271270
return __builtin_wasm_min_u_i8x16(x, y);
272-
// WEBASSEMBLY: %0 = icmp ult <16 x i8> %x, %y
273-
// WEBASSEMBLY-NEXT: %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> %y
274-
// WEBASSEMBLY-NEXT: ret <16 x i8> %1
271+
// WEBASSEMBLY: call <16 x i8> @llvm.umin.v16i8(<16 x i8> %x, <16 x i8> %y)
272+
// WEBASSEMBLY-NEXT: ret
275273
}
276274

277275
i8x16 max_s_i8x16(i8x16 x, i8x16 y) {
278276
return __builtin_wasm_max_s_i8x16(x, y);
279-
// WEBASSEMBLY: %0 = icmp sgt <16 x i8> %x, %y
280-
// WEBASSEMBLY-NEXT: %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> %y
281-
// WEBASSEMBLY-NEXT: ret <16 x i8> %1
277+
// WEBASSEMBLY: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %x, <16 x i8> %y)
278+
// WEBASSEMBLY-NEXT: ret
282279
}
283280

284281
u8x16 max_u_i8x16(u8x16 x, u8x16 y) {
285282
return __builtin_wasm_max_u_i8x16(x, y);
286-
// WEBASSEMBLY: %0 = icmp ugt <16 x i8> %x, %y
287-
// WEBASSEMBLY-NEXT: %1 = select <16 x i1> %0, <16 x i8> %x, <16 x i8> %y
288-
// WEBASSEMBLY-NEXT: ret <16 x i8> %1
283+
// WEBASSEMBLY: call <16 x i8> @llvm.umax.v16i8(<16 x i8> %x, <16 x i8> %y)
284+
// WEBASSEMBLY-NEXT: ret
289285
}
290286

291287
i16x8 min_s_i16x8(i16x8 x, i16x8 y) {
292288
return __builtin_wasm_min_s_i16x8(x, y);
293-
// WEBASSEMBLY: %0 = icmp slt <8 x i16> %x, %y
294-
// WEBASSEMBLY-NEXT: %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> %y
295-
// WEBASSEMBLY-NEXT: ret <8 x i16> %1
289+
// WEBASSEMBLY: call <8 x i16> @llvm.smin.v8i16(<8 x i16> %x, <8 x i16> %y)
290+
// WEBASSEMBLY-NEXT: ret
296291
}
297292

298293
u16x8 min_u_i16x8(u16x8 x, u16x8 y) {
299294
return __builtin_wasm_min_u_i16x8(x, y);
300-
// WEBASSEMBLY: %0 = icmp ult <8 x i16> %x, %y
301-
// WEBASSEMBLY-NEXT: %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> %y
302-
// WEBASSEMBLY-NEXT: ret <8 x i16> %1
295+
// WEBASSEMBLY: call <8 x i16> @llvm.umin.v8i16(<8 x i16> %x, <8 x i16> %y)
296+
// WEBASSEMBLY-NEXT: ret
303297
}
304298

305299
i16x8 max_s_i16x8(i16x8 x, i16x8 y) {
306300
return __builtin_wasm_max_s_i16x8(x, y);
307-
// WEBASSEMBLY: %0 = icmp sgt <8 x i16> %x, %y
308-
// WEBASSEMBLY-NEXT: %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> %y
309-
// WEBASSEMBLY-NEXT: ret <8 x i16> %1
301+
// WEBASSEMBLY: call <8 x i16> @llvm.smax.v8i16(<8 x i16> %x, <8 x i16> %y)
302+
// WEBASSEMBLY-NEXT: ret
310303
}
311304

312305
u16x8 max_u_i16x8(u16x8 x, u16x8 y) {
313306
return __builtin_wasm_max_u_i16x8(x, y);
314-
// WEBASSEMBLY: %0 = icmp ugt <8 x i16> %x, %y
315-
// WEBASSEMBLY-NEXT: %1 = select <8 x i1> %0, <8 x i16> %x, <8 x i16> %y
316-
// WEBASSEMBLY-NEXT: ret <8 x i16> %1
307+
// WEBASSEMBLY: call <8 x i16> @llvm.umax.v8i16(<8 x i16> %x, <8 x i16> %y)
308+
// WEBASSEMBLY-NEXT: ret
317309
}
318310

319311
i32x4 min_s_i32x4(i32x4 x, i32x4 y) {
320312
return __builtin_wasm_min_s_i32x4(x, y);
321-
// WEBASSEMBLY: %0 = icmp slt <4 x i32> %x, %y
322-
// WEBASSEMBLY-NEXT: %1 = select <4 x i1> %0, <4 x i32> %x, <4 x i32> %y
323-
// WEBASSEMBLY-NEXT: ret <4 x i32> %1
313+
// WEBASSEMBLY: call <4 x i32> @llvm.smin.v4i32(<4 x i32> %x, <4 x i32> %y)
314+
// WEBASSEMBLY-NEXT: ret
324315
}
325316

326317
u32x4 min_u_i32x4(u32x4 x, u32x4 y) {
327318
return __builtin_wasm_min_u_i32x4(x, y);
328-
// WEBASSEMBLY: %0 = icmp ult <4 x i32> %x, %y
329-
// WEBASSEMBLY-NEXT: %1 = select <4 x i1> %0, <4 x i32> %x, <4 x i32> %y
330-
// WEBASSEMBLY-NEXT: ret <4 x i32> %1
319+
// WEBASSEMBLY: call <4 x i32> @llvm.umin.v4i32(<4 x i32> %x, <4 x i32> %y)
320+
// WEBASSEMBLY-NEXT: ret
331321
}
332322

333323
i32x4 max_s_i32x4(i32x4 x, i32x4 y) {
334324
return __builtin_wasm_max_s_i32x4(x, y);
335-
// WEBASSEMBLY: %0 = icmp sgt <4 x i32> %x, %y
336-
// WEBASSEMBLY-NEXT: %1 = select <4 x i1> %0, <4 x i32> %x, <4 x i32> %y
337-
// WEBASSEMBLY-NEXT: ret <4 x i32> %1
325+
// WEBASSEMBLY: call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> %y)
326+
// WEBASSEMBLY-NEXT: ret
338327
}
339328

340329
u32x4 max_u_i32x4(u32x4 x, u32x4 y) {
341330
return __builtin_wasm_max_u_i32x4(x, y);
342-
// WEBASSEMBLY: %0 = icmp ugt <4 x i32> %x, %y
343-
// WEBASSEMBLY-NEXT: %1 = select <4 x i1> %0, <4 x i32> %x, <4 x i32> %y
344-
// WEBASSEMBLY-NEXT: ret <4 x i32> %1
331+
// WEBASSEMBLY: call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x, <4 x i32> %y)
332+
// WEBASSEMBLY-NEXT: ret
345333
}
346334

347335
i16x8 sub_sat_s_i16x8(i16x8 x, i16x8 y) {

clang/test/Headers/wasm.c

+32-44
Original file line numberDiff line numberDiff line change
@@ -1711,10 +1711,9 @@ v128_t test_u8x16_sub_sat(v128_t a, v128_t b) {
17111711
// CHECK-NEXT: entry:
17121712
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
17131713
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
1714-
// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <16 x i8> [[TMP0]], [[TMP1]]
1715-
// CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]
1716-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
1717-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1714+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
1715+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
1716+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
17181717
//
17191718
v128_t test_i8x16_min(v128_t a, v128_t b) {
17201719
return wasm_i8x16_min(a, b);
@@ -1724,10 +1723,9 @@ v128_t test_i8x16_min(v128_t a, v128_t b) {
17241723
// CHECK-NEXT: entry:
17251724
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
17261725
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
1727-
// CHECK-NEXT: [[TMP2:%.*]] = icmp ult <16 x i8> [[TMP0]], [[TMP1]]
1728-
// CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]
1729-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
1730-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1726+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
1727+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
1728+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
17311729
//
17321730
v128_t test_u8x16_min(v128_t a, v128_t b) {
17331731
return wasm_u8x16_min(a, b);
@@ -1737,10 +1735,9 @@ v128_t test_u8x16_min(v128_t a, v128_t b) {
17371735
// CHECK-NEXT: entry:
17381736
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
17391737
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
1740-
// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i8> [[TMP0]], [[TMP1]]
1741-
// CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]
1742-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
1743-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1738+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
1739+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
1740+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
17441741
//
17451742
v128_t test_i8x16_max(v128_t a, v128_t b) {
17461743
return wasm_i8x16_max(a, b);
@@ -1750,10 +1747,9 @@ v128_t test_i8x16_max(v128_t a, v128_t b) {
17501747
// CHECK-NEXT: entry:
17511748
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
17521749
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
1753-
// CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <16 x i8> [[TMP0]], [[TMP1]]
1754-
// CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]
1755-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
1756-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1750+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
1751+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
1752+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
17571753
//
17581754
v128_t test_u8x16_max(v128_t a, v128_t b) {
17591755
return wasm_u8x16_max(a, b);
@@ -1944,10 +1940,9 @@ v128_t test_i16x8_mul(v128_t a, v128_t b) {
19441940
// CHECK-NEXT: entry:
19451941
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
19461942
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
1947-
// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP0]], [[TMP1]]
1948-
// CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]]
1949-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
1950-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1943+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
1944+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
1945+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
19511946
//
19521947
v128_t test_i16x8_min(v128_t a, v128_t b) {
19531948
return wasm_i16x8_min(a, b);
@@ -1957,10 +1952,9 @@ v128_t test_i16x8_min(v128_t a, v128_t b) {
19571952
// CHECK-NEXT: entry:
19581953
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
19591954
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
1960-
// CHECK-NEXT: [[TMP2:%.*]] = icmp ult <8 x i16> [[TMP0]], [[TMP1]]
1961-
// CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]]
1962-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
1963-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1955+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
1956+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
1957+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
19641958
//
19651959
v128_t test_u16x8_min(v128_t a, v128_t b) {
19661960
return wasm_u16x8_min(a, b);
@@ -1970,10 +1964,9 @@ v128_t test_u16x8_min(v128_t a, v128_t b) {
19701964
// CHECK-NEXT: entry:
19711965
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
19721966
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
1973-
// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP0]], [[TMP1]]
1974-
// CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]]
1975-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
1976-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1967+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
1968+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
1969+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
19771970
//
19781971
v128_t test_i16x8_max(v128_t a, v128_t b) {
19791972
return wasm_i16x8_max(a, b);
@@ -1983,10 +1976,9 @@ v128_t test_i16x8_max(v128_t a, v128_t b) {
19831976
// CHECK-NEXT: entry:
19841977
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
19851978
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
1986-
// CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <8 x i16> [[TMP0]], [[TMP1]]
1987-
// CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]]
1988-
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32>
1989-
// CHECK-NEXT: ret <4 x i32> [[TMP4]]
1979+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
1980+
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
1981+
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
19901982
//
19911983
v128_t test_u16x8_max(v128_t a, v128_t b) {
19921984
return wasm_u16x8_max(a, b);
@@ -2103,39 +2095,35 @@ v128_t test_i32x4_mul(v128_t a, v128_t b) {
21032095

21042096
// CHECK-LABEL: @test_i32x4_min(
21052097
// CHECK-NEXT: entry:
2106-
// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[A:%.*]], [[B:%.*]]
2107-
// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[A]], <4 x i32> [[B]]
2108-
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
2098+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
2099+
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
21092100
//
21102101
v128_t test_i32x4_min(v128_t a, v128_t b) {
21112102
return wasm_i32x4_min(a, b);
21122103
}
21132104

21142105
// CHECK-LABEL: @test_u32x4_min(
21152106
// CHECK-NEXT: entry:
2116-
// CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[A:%.*]], [[B:%.*]]
2117-
// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[A]], <4 x i32> [[B]]
2118-
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
2107+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
2108+
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
21192109
//
21202110
v128_t test_u32x4_min(v128_t a, v128_t b) {
21212111
return wasm_u32x4_min(a, b);
21222112
}
21232113

21242114
// CHECK-LABEL: @test_i32x4_max(
21252115
// CHECK-NEXT: entry:
2126-
// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[A:%.*]], [[B:%.*]]
2127-
// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[A]], <4 x i32> [[B]]
2128-
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
2116+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
2117+
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
21292118
//
21302119
v128_t test_i32x4_max(v128_t a, v128_t b) {
21312120
return wasm_i32x4_max(a, b);
21322121
}
21332122

21342123
// CHECK-LABEL: @test_u32x4_max(
21352124
// CHECK-NEXT: entry:
2136-
// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[A:%.*]], [[B:%.*]]
2137-
// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[A]], <4 x i32> [[B]]
2138-
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
2125+
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
2126+
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
21392127
//
21402128
v128_t test_u32x4_max(v128_t a, v128_t b) {
21412129
return wasm_u32x4_max(a, b);

llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp

+47-21
Original file line numberDiff line numberDiff line change
@@ -1142,29 +1142,55 @@ static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel,
11421142
return &Sel;
11431143
}
11441144

1145-
static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
1146-
InstCombinerImpl &IC) {
1147-
if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
1148-
return nullptr;
1149-
1145+
static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp,
1146+
InstCombinerImpl &IC) {
11501147
Value *LHS, *RHS;
1151-
SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
1152-
if (SPF != SelectPatternFlavor::SPF_ABS &&
1153-
SPF != SelectPatternFlavor::SPF_NABS)
1148+
// TODO: What to do with pointer min/max patterns?
1149+
if (!Sel.getType()->isIntOrIntVectorTy())
11541150
return nullptr;
11551151

1156-
// Note that NSW flag can only be propagated for normal, non-negated abs!
1157-
bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS &&
1158-
match(RHS, m_NSWNeg(m_Specific(LHS)));
1159-
Constant *IntMinIsPoisonC =
1160-
ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison);
1161-
Instruction *Abs =
1162-
IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
1163-
1164-
if (SPF == SelectPatternFlavor::SPF_NABS)
1165-
return BinaryOperator::CreateNeg(Abs); // Always without NSW flag!
1152+
SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
1153+
if (SPF == SelectPatternFlavor::SPF_ABS ||
1154+
SPF == SelectPatternFlavor::SPF_NABS) {
1155+
if (!Cmp.hasOneUse())
1156+
return nullptr; // TODO: Relax this restriction.
1157+
1158+
// Note that NSW flag can only be propagated for normal, non-negated abs!
1159+
bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS &&
1160+
match(RHS, m_NSWNeg(m_Specific(LHS)));
1161+
Constant *IntMinIsPoisonC =
1162+
ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison);
1163+
Instruction *Abs =
1164+
IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
1165+
1166+
if (SPF == SelectPatternFlavor::SPF_NABS)
1167+
return BinaryOperator::CreateNeg(Abs); // Always without NSW flag!
1168+
return IC.replaceInstUsesWith(Sel, Abs);
1169+
}
1170+
1171+
if (SelectPatternResult::isMinOrMax(SPF)) {
1172+
Intrinsic::ID IntrinsicID;
1173+
switch (SPF) {
1174+
case SelectPatternFlavor::SPF_UMIN:
1175+
IntrinsicID = Intrinsic::umin;
1176+
break;
1177+
case SelectPatternFlavor::SPF_UMAX:
1178+
IntrinsicID = Intrinsic::umax;
1179+
break;
1180+
case SelectPatternFlavor::SPF_SMIN:
1181+
IntrinsicID = Intrinsic::smin;
1182+
break;
1183+
case SelectPatternFlavor::SPF_SMAX:
1184+
IntrinsicID = Intrinsic::smax;
1185+
break;
1186+
default:
1187+
llvm_unreachable("Unexpected SPF");
1188+
}
1189+
return IC.replaceInstUsesWith(
1190+
Sel, IC.Builder.CreateBinaryIntrinsic(IntrinsicID, LHS, RHS));
1191+
}
11661192

1167-
return IC.replaceInstUsesWith(Sel, Abs);
1193+
return nullptr;
11681194
}
11691195

11701196
/// If we have a select with an equality comparison, then we know the value in
@@ -1540,8 +1566,8 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
15401566
if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
15411567
return NewSel;
15421568

1543-
if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
1544-
return NewAbs;
1569+
if (Instruction *NewSPF = canonicalizeSPF(SI, *ICI, *this))
1570+
return NewSPF;
15451571

15461572
if (Value *V = canonicalizeClampLike(SI, *ICI, Builder))
15471573
return replaceInstUsesWith(SI, V);

llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll

+4-7
Original file line numberDiff line numberDiff line change
@@ -198,8 +198,7 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)*
198198
; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
199199
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
200200
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
201-
; CHECK: %cmp = icmp slt i32 %sub, 8
202-
; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
201+
; CHECK: %1 = call i32 @llvm.smin.i32(i32 %sub, i32 8)
203202
define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
204203
%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
205204
%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
@@ -222,9 +221,8 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)*
222221
; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
223222
; CHECK: %group.id_x_group.size.x.neg = mul i32 %group.id, -8
224223
; CHECK: %sub = add i32 %group.id_x_group.size.x.neg, %grid.size.x
225-
; CHECK: %1 = icmp ugt i32 %sub, 8
226-
; CHECK: %select = select i1 %1, i32 %sub, i32 8
227-
; CHECK: %zext = zext i32 %select to i64
224+
; CHECK: %1 = call i32 @llvm.umax.i32(i32 %sub, i32 8)
225+
; CHECK: %zext = zext i32 %1 to i64
228226
define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
229227
%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
230228
%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
@@ -472,8 +470,7 @@ define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspac
472470
}
473471

474472
; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
475-
; CHECK: icmp ult
476-
; CHECK: select
473+
; CHECK: call i32 @llvm.umin
477474
define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
478475
%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
479476
%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4

0 commit comments

Comments (0)