Skip to content

Commit cba9dc6

Browse files
[libc][nfc] Use common implementation of read_first_lane_u64 (llvm#131027)
No codegen regression on either target. The two __builtin_ffs calls implied on nvptx CSE away. ``` define internal i64 @__gpu_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 { entry: %shr = lshr i64 %__x, 32 %conv = trunc nuw i64 %shr to i32 %conv1 = trunc i64 %__x to i32 %conv2 = trunc i64 %__lane_mask to i32 %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 true) %iszero = icmp eq i32 %conv2, 0 %sub = select i1 %iszero, i32 -1, i32 %0 %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv, i32 %sub, i32 31) %conv4 = sext i32 %1 to i64 %shl = shl nsw i64 %conv4, 32 %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv1, i32 %sub, i32 31) %conv7 = zext i32 %2 to i64 %or = or disjoint i64 %shl, %conv7 ret i64 %or } ; becomes define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 { entry: %shr = lshr i64 %__x, 32 %conv = trunc nuw i64 %shr to i32 %conv1 = trunc i64 %__x to i32 %conv.i = trunc i64 %__lane_mask to i32 %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 true) %iszero = icmp eq i32 %conv.i, 0 %sub.i = select i1 %iszero, i32 -1, i32 %0 %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv, i32 %sub.i, i32 31) %conv4 = zext i32 %1 to i64 %shl = shl nuw i64 %conv4, 32 %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv1, i32 %sub.i, i32 31) %conv7 = zext i32 %2 to i64 %or = or disjoint i64 %shl, %conv7 ret i64 %or } ``` The sext vs zext difference is vaguely interesting, but since the bits are immediately discarded in either case it makes no odds. The amdgcn one doesn't need CSE; the readfirstlane function is a single call to an intrinsic. Drive-by fix to __gpu_match_all_u32: it was calling first_lane_u64 and could use first_lane_u32 instead. Added the missing call to the gpuintrin.c test case, and a stray missing static as well.
1 parent c476a4a commit cba9dc6

File tree

4 files changed

+99
-34
lines changed

4 files changed

+99
-34
lines changed

clang/lib/Headers/amdgpuintrin.h

+5-10
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
3333
// Attribute to declare a function as a kernel.
3434
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
3535

36+
// Defined in gpuintrin.h, used later in this file.
37+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
38+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
39+
3640
// Returns the number of workgroups in the 'x' dimension of the grid.
3741
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
3842
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115119
return __builtin_amdgcn_readfirstlane(__x);
116120
}
117121

118-
// Copies the value from the first active thread in the wavefront to the rest.
119-
_DEFAULT_FN_ATTRS __inline__ uint64_t
120-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121-
uint32_t __hi = (uint32_t)(__x >> 32ull);
122-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123-
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124-
((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
125-
}
126-
127122
// Returns a bitmask of threads in the current lane for which \p x is true.
128123
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129124
bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
203198
// Returns the current lane mask if every lane contains __x.
204199
_DEFAULT_FN_ATTRS static __inline__ uint64_t
205200
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
206-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
201+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
207202
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
208203
__gpu_sync_lane(__lane_mask);
209204
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;

clang/lib/Headers/gpuintrin.h

+10
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,16 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
115115
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
116116
}
117117

118+
// Copies the value from the first active thread in the wavefront to the rest.
119+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
120+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121+
uint32_t __hi = (uint32_t)(__x >> 32ull);
122+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
123+
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
124+
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
125+
0xFFFFFFFFull);
126+
}
127+
118128
// Gets the first floating point value from the active lanes.
119129
_DEFAULT_FN_ATTRS static __inline__ float
120130
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {

clang/lib/Headers/nvptxintrin.h

+5-16
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
3737
// Attribute to declare a function as a kernel.
3838
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
3939

40+
// Defined in gpuintrin.h, used later in this file.
41+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
42+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
43+
4044
// Returns the number of CUDA blocks in the 'x' dimension.
4145
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
4246
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
120124
return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
121125
}
122126

123-
// Copies the value from the first active thread in the warp to the rest.
124-
_DEFAULT_FN_ATTRS static __inline__ uint64_t
125-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
126-
uint32_t __hi = (uint32_t)(__x >> 32ull);
127-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
128-
uint32_t __mask = (uint32_t)__lane_mask;
129-
uint32_t __id = __builtin_ffs(__mask) - 1;
130-
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
131-
__gpu_num_lanes() - 1)
132-
<< 32ull) |
133-
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
134-
__gpu_num_lanes() - 1) &
135-
0xFFFFFFFF);
136-
}
137-
138127
// Returns a bitmask of threads in the current lane for which \p x is true.
139128
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
140129
bool __x) {
@@ -231,7 +220,7 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
231220
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
232221
#endif
233222

234-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
223+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
235224
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
236225
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
237226
}

clang/test/Headers/gpuintrin.c

+79-8
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
3333
__gpu_lane_id();
3434
__gpu_lane_mask();
3535
__gpu_read_first_lane_u32(-1, -1);
36+
__gpu_read_first_lane_u64(-1, -1);
3637
__gpu_ballot(-1, 1);
3738
__gpu_sync_threads();
3839
__gpu_sync_lane(-1);
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
6465
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
6566
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
6667
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
67-
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68+
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
69+
// AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
6870
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
6971
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
70-
// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
71-
// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
72-
// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
72+
// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
73+
// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
74+
// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
7375
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
7476
// AMDGPU-NEXT: unreachable
7577
//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
388390
// AMDGPU-NEXT: ret i32 [[TMP1]]
389391
//
390392
//
393+
// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
394+
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
395+
// AMDGPU-NEXT: [[ENTRY:.*:]]
396+
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
397+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
398+
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
399+
// AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
400+
// AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
401+
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
402+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
403+
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
404+
// AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
405+
// AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
406+
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
407+
// AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
408+
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
409+
// AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
410+
// AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
411+
// AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
412+
// AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
413+
// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
414+
// AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
415+
// AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
416+
// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
417+
// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
418+
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
419+
// AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
420+
// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
421+
// AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
422+
// AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
423+
// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
424+
// AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
425+
// AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
426+
// AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
427+
// AMDGPU-NEXT: ret i64 [[OR]]
428+
//
429+
//
391430
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
392431
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
393432
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
525564
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
526565
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
527566
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
528-
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
567+
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
568+
// NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
529569
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
530570
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
531-
// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
532-
// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
533-
// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
571+
// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
572+
// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
573+
// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
534574
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
535575
// NVPTX-NEXT: unreachable
536576
//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
793833
// NVPTX-NEXT: ret i32 [[TMP7]]
794834
//
795835
//
836+
// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
837+
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
838+
// NVPTX-NEXT: [[ENTRY:.*:]]
839+
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
840+
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
841+
// NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
842+
// NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
843+
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
844+
// NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
845+
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
846+
// NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
847+
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
848+
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
849+
// NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
850+
// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
851+
// NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
852+
// NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
853+
// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
854+
// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
855+
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
856+
// NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
857+
// NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
858+
// NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
859+
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
860+
// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
861+
// NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
862+
// NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
863+
// NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
864+
// NVPTX-NEXT: ret i64 [[OR]]
865+
//
866+
//
796867
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
797868
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
798869
// NVPTX-NEXT: [[ENTRY:.*:]]

0 commit comments

Comments
 (0)