Skip to content

Commit cba9dc6

Browse files
[libc][nfc] Use common implementation of read_first_lane_u64 (llvm#131027)
No codegen regression on either target. The two __builtin_ffs calls implied on nvptx CSE away. ``` define internal i64 @__gpu_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 { entry: %shr = lshr i64 %__x, 32 %conv = trunc nuw i64 %shr to i32 %conv1 = trunc i64 %__x to i32 %conv2 = trunc i64 %__lane_mask to i32 %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 true) %iszero = icmp eq i32 %conv2, 0 %sub = select i1 %iszero, i32 -1, i32 %0 %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv, i32 %sub, i32 31) %conv4 = sext i32 %1 to i64 %shl = shl nsw i64 %conv4, 32 %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv1, i32 %sub, i32 31) %conv7 = zext i32 %2 to i64 %or = or disjoint i64 %shl, %conv7 ret i64 %or } ; becomes define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 { entry: %shr = lshr i64 %__x, 32 %conv = trunc nuw i64 %shr to i32 %conv1 = trunc i64 %__x to i32 %conv.i = trunc i64 %__lane_mask to i32 %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 true) %iszero = icmp eq i32 %conv.i, 0 %sub.i = select i1 %iszero, i32 -1, i32 %0 %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv, i32 %sub.i, i32 31) %conv4 = zext i32 %1 to i64 %shl = shl nuw i64 %conv4, 32 %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv1, i32 %sub.i, i32 31) %conv7 = zext i32 %2 to i64 %or = or disjoint i64 %shl, %conv7 ret i64 %or } ``` The sext vs zext difference is vaguely interesting, but since the bits are immediately discarded in either case it makes no odds. The amdgcn one doesn't need CSE; the readfirstlane function is a single call to an intrinsic. Drive-by fix to __gpu_match_all_u32: it was calling first_lane_u64 and could use first_lane_u32 instead. Added the missing call to the gpuintrin.c test case, and a stray missing static as well.
1 parent c476a4a commit cba9dc6

File tree

4 files changed

+99
-34
lines changed

4 files changed

+99
-34
lines changed

clang/lib/Headers/amdgpuintrin.h

+5-10
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
3333
// Attribute to declare a function as a kernel.
3434
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
3535

36+
// Defined in gpuintrin.h, used later in this file.
37+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
38+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
39+
3640
// Returns the number of workgroups in the 'x' dimension of the grid.
3741
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
3842
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115119
return __builtin_amdgcn_readfirstlane(__x);
116120
}
117121

118-
// Copies the value from the first active thread in the wavefront to the rest.
119-
_DEFAULT_FN_ATTRS __inline__ uint64_t
120-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121-
uint32_t __hi = (uint32_t)(__x >> 32ull);
122-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123-
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124-
((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
125-
}
126-
127122
// Returns a bitmask of threads in the current lane for which \p x is true.
128123
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129124
bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
203198
// Returns the current lane mask if every lane contains __x.
204199
_DEFAULT_FN_ATTRS static __inline__ uint64_t
205200
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
206-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
201+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
207202
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
208203
__gpu_sync_lane(__lane_mask);
209204
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;

clang/lib/Headers/gpuintrin.h

+10
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,16 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
115115
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
116116
}
117117

118+
// Copies the value from the first active thread in the wavefront to the rest.
119+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
120+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121+
uint32_t __hi = (uint32_t)(__x >> 32ull);
122+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
123+
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
124+
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
125+
0xFFFFFFFFull);
126+
}
127+
118128
// Gets the first floating point value from the active lanes.
119129
_DEFAULT_FN_ATTRS static __inline__ float
120130
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {

clang/lib/Headers/nvptxintrin.h

+5-16
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
3737
// Attribute to declare a function as a kernel.
3838
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
3939

40+
// Defined in gpuintrin.h, used later in this file.
41+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
42+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
43+
4044
// Returns the number of CUDA blocks in the 'x' dimension.
4145
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
4246
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
120124
return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
121125
}
122126

123-
// Copies the value from the first active thread in the warp to the rest.
124-
_DEFAULT_FN_ATTRS static __inline__ uint64_t
125-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
126-
uint32_t __hi = (uint32_t)(__x >> 32ull);
127-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
128-
uint32_t __mask = (uint32_t)__lane_mask;
129-
uint32_t __id = __builtin_ffs(__mask) - 1;
130-
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
131-
__gpu_num_lanes() - 1)
132-
<< 32ull) |
133-
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
134-
__gpu_num_lanes() - 1) &
135-
0xFFFFFFFF);
136-
}
137-
138127
// Returns a bitmask of threads in the current lane for which \p x is true.
139128
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
140129
bool __x) {
@@ -231,7 +220,7 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
231220
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
232221
#endif
233222

234-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
223+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
235224
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
236225
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
237226
}

clang/test/Headers/gpuintrin.c

+79-8
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
3333
__gpu_lane_id();
3434
__gpu_lane_mask();
3535
__gpu_read_first_lane_u32(-1, -1);
36+
__gpu_read_first_lane_u64(-1, -1);
3637
__gpu_ballot(-1, 1);
3738
__gpu_sync_threads();
3839
__gpu_sync_lane(-1);
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
6465
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
6566
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
6667
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
67-
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68+
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
69+
// AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
6870
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
6971
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
70-
// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
71-
// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
72-
// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
72+
// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
73+
// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
74+
// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
7375
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
7476
// AMDGPU-NEXT: unreachable
7577
//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
388390
// AMDGPU-NEXT: ret i32 [[TMP1]]
389391
//
390392
//
393+
// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
394+
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
395+
// AMDGPU-NEXT: [[ENTRY:.*:]]
396+
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
397+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
398+
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
399+
// AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
400+
// AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
401+
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
402+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
403+
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
404+
// AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
405+
// AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
406+
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
407+
// AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
408+
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
409+
// AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
410+
// AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
411+
// AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
412+
// AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
413+
// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
414+
// AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
415+
// AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
416+
// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
417+
// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
418+
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
419+
// AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
420+
// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
421+
// AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
422+
// AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
423+
// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
424+
// AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
425+
// AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
426+
// AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
427+
// AMDGPU-NEXT: ret i64 [[OR]]
428+
//
429+
//
391430
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
392431
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
393432
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
525564
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
526565
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
527566
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
528-
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
567+
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
568+
// NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
529569
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
530570
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
531-
// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
532-
// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
533-
// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
571+
// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
572+
// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
573+
// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
534574
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
535575
// NVPTX-NEXT: unreachable
536576
//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
793833
// NVPTX-NEXT: ret i32 [[TMP7]]
794834
//
795835
//
836+
// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
837+
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
838+
// NVPTX-NEXT: [[ENTRY:.*:]]
839+
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
840+
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
841+
// NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
842+
// NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
843+
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
844+
// NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
845+
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
846+
// NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
847+
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
848+
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
849+
// NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
850+
// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
851+
// NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
852+
// NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
853+
// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
854+
// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
855+
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
856+
// NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
857+
// NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
858+
// NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
859+
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
860+
// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
861+
// NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
862+
// NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
863+
// NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
864+
// NVPTX-NEXT: ret i64 [[OR]]
865+
//
866+
//
796867
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
797868
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
798869
// NVPTX-NEXT: [[ENTRY:.*:]]

0 commit comments

Comments
 (0)