diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 03cd45d7de6f2..4ff761ec19b3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &); void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -Pass *createAMDGPUAnnotateKernelFeaturesPass(); Pass *createAMDGPUAttributorLegacyPass(); void initializeAMDGPUAttributorLegacyPass(PassRegistry &); -void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); -extern char &AMDGPUAnnotateKernelFeaturesID; // DPP/Iterative option enables the atomic optimizer with given strategy // whereas None disables the atomic optimizer. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index a9bd41382c255..9c9fa5c6e2f0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { char AMDGPUAnnotateKernelFeatures::ID = 0; -char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; - -INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, - "Add AMDGPU function attributes", false, false) - bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { bool HaveStackObjects = false; bool Changed = false; @@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { TM = &TPC->getTM(); return false; } - -Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { - return new AMDGPUAnnotateKernelFeatures(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 538b1b181f643..60d27a7fbef29 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -149,9 +149,3 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbanklegalize", AMDGPURegBankLegalizePass( DUMMY_MACHINE_FUNCTION_PASS("amdgpu-regbank-combiner", AMDGPURegBankCombinerPass()) #undef DUMMY_MACHINE_FUNCTION_PASS - - -#define DUMMY_CGSCC_PASS(NAME, CREATE_PASS) -DUMMY_CGSCC_PASS("amdgpu-annotate-kernel-features", AMDGPUAnnotateKernelFeaturesPass()) - -#undef DUMMY_CGSCC_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b59e940852724..98d321eeda694 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -515,7 +515,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUSwLowerLDSLegacyPass(*PR); initializeAMDGPUAttributorLegacyPass(*PR); - initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPUAtomicOptimizerPass(*PR); @@ -1311,12 +1310,6 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { - if (TM->getTargetTriple().isAMDGCN()) { - // FIXME: This pass adds 2 hacky attributes that can be replaced with an - // analysis, and should be removed. - addPass(createAMDGPUAnnotateKernelFeaturesPass()); - } - if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index 53f5c1efd14eb..d6153ce93b451 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, const CallingConv::ID CC = F.getCallingConv(); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) KernargSegmentPtr = true; @@ -629,12 +623,13 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, DispatchID = true; } - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + // FlatScratchInit cannot be true for graphics CC if enableFlatScratch() + // is false. + (ST.enableFlatScratch() || + (!AMDGPU::isGraphics(CC) && + !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) && !ST.flatScratchIsArchitected()) { FlatScratchInit = true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index d9be677a0e58d..aeb301939e986 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -1079,6 +1169,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1093,6 +1185,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1132,6 +1228,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1144,6 +1243,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1156,6 +1258,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1167,6 +1271,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1199,6 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1213,6 +1324,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1227,6 +1341,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1238,6 +1354,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1393,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -1287,6 +1410,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -1301,6 +1427,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1312,6 +1440,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1348,7 +1480,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1356,6 +1490,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -1370,7 +1505,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1378,6 +1515,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -1392,6 +1530,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -1410,6 +1550,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -1466,6 +1610,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -1483,6 +1630,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1500,6 +1650,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1513,6 +1665,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1559,10 +1715,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1580,10 +1739,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1601,7 +1763,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1616,6 +1780,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1654,12 +1822,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1677,12 +1848,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1700,7 +1874,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -1715,6 +1891,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1756,10 +1936,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1769,10 +1952,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1782,7 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1794,6 +1982,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1828,12 +2020,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1843,12 +2038,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1858,7 +2056,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1870,6 +2070,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1907,12 +2111,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1922,12 +2129,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1937,7 +2147,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -1949,6 +2161,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1987,6 +2203,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2013,6 +2232,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2039,12 +2261,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2058,6 +2282,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2116,6 +2344,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2134,6 +2365,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2152,12 +2386,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2166,6 +2402,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2219,8 +2459,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2237,8 +2480,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2312,7 +2558,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2328,7 +2577,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2394,7 +2646,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -2410,7 +2665,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2594,10 +2852,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_dec_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2610,10 +2871,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_dec_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2671,12 +2935,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_dec_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2689,12 +2956,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_dec_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2753,12 +3023,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_dec_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2771,12 +3044,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2835,10 +3111,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_dec_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2848,10 +3127,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_dec_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2902,12 +3184,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_dec_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2917,12 +3202,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_dec_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2974,12 +3262,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_dec_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2989,12 +3280,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3047,6 +3341,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3070,6 +3367,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3144,6 +3444,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3162,6 +3465,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3232,7 +3538,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3251,7 +3560,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 92a7de9aaefd2..1d401a4ee33d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -21,11 +21,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -36,11 +39,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -110,11 +116,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -125,11 +134,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -332,6 +344,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -347,6 +362,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -415,6 +433,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -432,6 +453,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -503,6 +527,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -520,6 +547,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -592,6 +622,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -604,6 +637,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -664,6 +700,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -678,6 +717,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -741,6 +783,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -755,6 +800,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -820,7 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -828,6 +878,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -842,7 +893,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -850,6 +903,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -925,6 +979,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -942,6 +999,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1019,8 +1079,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1037,8 +1100,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1129,7 +1195,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1145,7 +1214,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1224,7 +1296,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1240,7 +1315,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1459,10 +1537,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CI-LABEL: global_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1475,10 +1556,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; VI-LABEL: global_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1548,12 +1632,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; CI-LABEL: global_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1566,12 +1653,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou ; VI-LABEL: global_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1642,12 +1732,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-LABEL: global_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1660,12 +1753,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1737,10 +1833,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; CI-LABEL: global_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1750,10 +1849,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 ; VI-LABEL: global_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1815,12 +1917,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; CI-LABEL: global_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1830,12 +1935,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI-LABEL: global_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1898,12 +2006,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-LABEL: global_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1913,12 +2024,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1983,6 +2097,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2006,6 +2123,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2094,6 +2214,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2112,6 +2235,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2188,6 +2314,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -2203,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -2218,6 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2232,6 +2366,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2281,6 +2419,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2298,6 +2439,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2315,6 +2459,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2329,6 +2475,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2381,6 +2531,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 @@ -2398,6 +2551,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 @@ -2415,6 +2571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2429,6 +2587,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2482,6 +2644,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2494,6 +2659,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2506,6 +2674,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2517,6 +2687,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2560,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2574,6 +2751,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i32_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2588,6 +2768,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2599,6 +2781,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2645,6 +2831,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 @@ -2659,6 +2848,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -2673,6 +2865,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -2684,6 +2878,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2732,7 +2930,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 42 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2740,6 +2940,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 42 ; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol @@ -2754,7 +2955,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 42 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2762,6 +2965,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 42 ; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -2776,6 +2980,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -2794,6 +3000,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 42 @@ -2871,6 +3081,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -2888,6 +3101,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -2905,6 +3121,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2918,6 +3136,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2988,7 +3210,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; CI-NEXT: v_mov_b32_e32 v3, s2 ; CI-NEXT: flat_store_dword v[3:4], v0 @@ -3007,7 +3232,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_mov_b32_e32 v3, s2 ; VI-NEXT: flat_store_dword v[3:4], v0 @@ -3097,10 +3325,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_ret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3118,10 +3349,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_ret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3139,7 +3373,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_ret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3154,6 +3390,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_ret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3206,12 +3446,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_ret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3229,12 +3472,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_ret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3252,7 +3498,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3267,6 +3515,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3322,12 +3574,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 32 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3345,12 +3600,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3368,7 +3626,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 @@ -3383,6 +3643,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3439,10 +3703,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3452,10 +3719,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3465,7 +3735,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3477,6 +3749,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3523,12 +3799,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3538,12 +3817,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; VI-LABEL: flat_atomic_inc_noret_i64_offset: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3553,7 +3835,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3565,6 +3849,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3614,12 +3902,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 32 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3629,12 +3920,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3644,7 +3938,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -3656,6 +3952,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3707,6 +4007,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -3733,6 +4036,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -3759,12 +4065,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3778,6 +4086,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3858,6 +4170,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -3876,6 +4191,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -3894,12 +4212,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3908,6 +4228,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3975,6 +4299,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s4 ; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -3982,6 +4307,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: v_mov_b32_e32 v0, s2 @@ -3995,6 +4322,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 @@ -4002,6 +4330,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 31a229a908142..9ef16aef0dd16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 15 +; GPRIDX-NEXT: wavefront_sgpr_count = 17 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -3168,21 +3168,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; MOVREL-NEXT: s_mov_b32 s4, 0 ; MOVREL-NEXT: s_mov_b32 s5, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_mov_b32 s2, 0 ; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_mov_b32 s3, 0x40140000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -3210,7 +3213,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -3225,7 +3228,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4042,7 +4045,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4057,7 +4060,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4074,7 +4077,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 14 +; GPRIDX-NEXT: wavefront_sgpr_count = 16 ; GPRIDX-NEXT: workitem_vgpr_count = 2 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4115,7 +4118,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4126,7 +4129,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4141,7 +4144,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4158,7 +4161,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 3 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4176,6 +4179,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s2, 1 ; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0 @@ -4211,7 +4217,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4226,7 +4232,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4387,7 +4393,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_mem_ordered = 0 ; GPRIDX-NEXT: enable_fwd_progress = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GPRIDX-NEXT: user_sgpr_count = 12 +; GPRIDX-NEXT: user_sgpr_count = 14 ; GPRIDX-NEXT: enable_trap_handler = 0 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1 ; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4402,7 +4408,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 -; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 +; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1 ; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4419,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 28 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 14 +; GPRIDX-NEXT: wavefront_sgpr_count = 16 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -4463,7 +4469,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: kernel_code_entry_byte_offset = 256 ; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0 ; MOVREL-NEXT: granulated_workitem_vgpr_count = 0 -; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1 +; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2 ; MOVREL-NEXT: priority = 0 ; MOVREL-NEXT: float_mode = 240 ; MOVREL-NEXT: priv = 0 @@ -4474,7 +4480,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_mem_ordered = 0 ; MOVREL-NEXT: enable_fwd_progress = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; MOVREL-NEXT: user_sgpr_count = 12 +; MOVREL-NEXT: user_sgpr_count = 14 ; MOVREL-NEXT: enable_trap_handler = 0 ; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1 ; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4489,7 +4495,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 -; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 +; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1 ; MOVREL-NEXT: enable_sgpr_private_segment_size = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0 @@ -4506,7 +4512,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: gds_segment_byte_size = 0 ; MOVREL-NEXT: kernarg_segment_byte_size = 28 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 -; MOVREL-NEXT: wavefront_sgpr_count = 10 +; MOVREL-NEXT: wavefront_sgpr_count = 24 ; MOVREL-NEXT: workitem_vgpr_count = 4 ; MOVREL-NEXT: reserved_vgpr_first = 0 ; MOVREL-NEXT: reserved_vgpr_count = 0 @@ -4524,10 +4530,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: ; %bb.0: ; %entry ; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8 ; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; MOVREL-NEXT: s_add_i32 s12, s12, s17 +; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; MOVREL-NEXT: s_mov_b32 s2, 0 -; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 +; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 ; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 ; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] @@ -4535,6 +4543,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; MOVREL-NEXT: v_mov_b32_e32 v1, s3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -4562,7 +4571,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_mem_ordered = 1 ; GFX10-NEXT: enable_fwd_progress = 0 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; GFX10-NEXT: user_sgpr_count = 12 +; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 ; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1 ; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1 @@ -4577,7 +4586,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 -; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1 ; GFX10-NEXT: enable_sgpr_private_segment_size = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0 ; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll index 00c44c27257bb..e207d95287783 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll @@ -35,7 +35,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; RO-FLAT: scratch_store_dword ; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer -; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1 +; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init ; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; RW-FLAT-NOT: .amdhsa_enable_private_segment @@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { ; RO-FLAT: .amdhsa_enable_private_segment 1 ; RW-FLAT: .amdhsa_reserve_flat_scratch 0 ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 -; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4 ; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0 define amdgpu_kernel void @stack_object_in_kernel_no_calls() { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll index 676035735d0af..86766e2904619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40 -; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V4-NEXT: s_add_i32 s12, s12, s17 +; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_mov_b32 s4, s0 ; GFX8V4-NEXT: s_mov_b32 s5, s3 @@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V4-NEXT: flat_store_dword v[0:1], v2 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) @@ -37,7 +40,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8 -; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: s_add_i32 s12, s12, s17 +; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_mov_b32 s4, s0 ; GFX8V5-NEXT: s_mov_b32 s5, s2 @@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 ; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) @@ -60,9 +66,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-LABEL: addrspacecast: ; GFX9V4: ; %bb.0: ; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V4-NEXT: s_mov_b32 s2, s0 ; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 @@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V4-NEXT: flat_store_dword v[0:1], v2 ; GFX9V4-NEXT: s_waitcnt vmcnt(0) @@ -84,9 +92,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-LABEL: addrspacecast: ; GFX9V5: ; %bb.0: ; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base -; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX9V5-NEXT: s_mov_b32 s2, s0 ; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 @@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 ; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 ; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 ; GFX9V5-NEXT: flat_store_dword v[0:1], v2 ; GFX9V5-NEXT: s_waitcnt vmcnt(0) @@ -111,7 +121,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ret void } -define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -167,7 +177,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ret void } -define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -223,7 +233,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { ret void } -define amdgpu_kernel void @llvm_trap() { +define amdgpu_kernel void @llvm_trap() #0 { ; GFX8V4-LABEL: llvm_trap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_mov_b64 s[0:1], s[6:7] @@ -246,7 +256,7 @@ define amdgpu_kernel void @llvm_trap() { unreachable } -define amdgpu_kernel void @llvm_debugtrap() { +define amdgpu_kernel void @llvm_debugtrap() #0 { ; GFX8V4-LABEL: llvm_debugtrap: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_trap 3 @@ -266,7 +276,7 @@ define amdgpu_kernel void @llvm_debugtrap() { unreachable } -define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -374,3 +384,5 @@ declare void @llvm.debugtrap() !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 378c6312c52be..94853767ccfac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10 -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll index a6a7f35a774db..859f7ef16e395 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace( ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v0, 4 ; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_add_i32 s12, s12, s17 ; CHECK-NEXT: ds_read_b32 v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v3, 9 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v3, 9 ; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x200 ; CHECK-NEXT: ds_write_b32 v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll index dcc2c23cae046..a5a75f74833f1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) { ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s2, s[8:9], 0x0 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s2, 0 ; GCN-NEXT: s_cselect_b32 s2, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll index ad588ebee2f9e..1deee215e522b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll @@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0 @@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 @@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; CI-LABEL: s_trig_preop_f64_imm: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; CI-NEXT: s_add_u32 s0, s0, 4 @@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; VI-LABEL: s_trig_preop_f64_imm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; VI-NEXT: s_add_u32 s0, s0, 4 @@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; GFX9-LABEL: s_trig_preop_f64_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1] @@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) { ; ; GFX10-LABEL: s_trig_preop_f64_imm: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s12, s12, s17 +; GFX10-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 40f29c56c8f12..b59f85b2dfa38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s6, s5, 31 ; GFX8-NEXT: s_add_i32 s0, s5, s6 @@ -146,6 +149,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; GFX8-NEXT: s_ashr_i32 s12, s11, 31 @@ -617,6 +623,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 @@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2187,6 +2202,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: sdiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2332,6 +2350,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010 ; GFX8-NEXT: s_ashr_i32 s3, s0, 31 @@ -2596,6 +2617,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: sdiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -2741,6 +2765,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s10, s0, 31 @@ -3002,6 +3029,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: sdivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 @@ -3153,6 +3183,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: sdivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000 ; GFX8-NEXT: s_ashr_i32 s5, s0, 31 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index e3c1a52696b47..ff0114cfc3ddb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX8-NEXT: s_sub_i32 s0, 0, s5 @@ -113,6 +116,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10 @@ -523,6 +529,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -980,6 +992,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: udivrem_v2i64: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20 ; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out ; GFX8-LABEL: udiv_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 @@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou ; GFX8-LABEL: udiv_i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s5, s4, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 @@ -2194,6 +2218,9 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s2, s1, 0xffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) % ; GFX8-LABEL: udivrem_i3: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 @@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 29fb320bf1283..c78f0a4eb61e9 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workitem_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1 ; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0 @@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr) ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id: ; FIXEDABI: ; %bb.0: ; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6 +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 ; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) -; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8 +; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10 ; FIXEDABI-NEXT: flat_store_dword v[0:1], v2 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_endpgm @@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 { define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 { ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s6, s6, s11 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 @@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) # define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 { ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr: ; FIXEDABI: ; %bb.0: +; FIXEDABI-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0 +; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5 +; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 ; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc ; FIXEDABI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 59bd4e9ac8ce6..3eba47d7d7852 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0 @@ -27,11 +26,6 @@ define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { } define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4 @@ -42,11 +36,6 @@ define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { } define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4 @@ -57,11 +46,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { } define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 @@ -92,12 +76,6 @@ define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { } define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4 @@ -110,12 +88,6 @@ define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace } define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4 @@ -128,13 +100,6 @@ define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addr } define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 -; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 -; AKF_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 @@ -149,11 +114,6 @@ define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrsp } define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false) @@ -165,11 +125,6 @@ define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspa ; Can't just search the pointer value define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8 @@ -181,11 +136,6 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr ; Can't just search pointer types define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat -; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8 @@ -197,11 +147,6 @@ define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat ; Cast group to flat, do GEP, cast back to group define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4 @@ -212,10 +157,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() # } define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)) @@ -229,14 +170,11 @@ attributes #1 = { nounwind } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll index b6c0271e5f56f..4e7022710c671 100644 --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-LABEL: readfirstlane_uniform: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s0, s0, s4 @@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt ; GCN-NEXT: s_add_u32 s0, s2, 40 ; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 7fdc012d4f1b5..e71bf15384727 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -393,6 +393,9 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) { ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s0, 0 ; GCN-NEXT: s_movk_i32 s0, 0x80 diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 3e19ee5567929..85b5c7c870b23 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -34,7 +34,7 @@ define amdgpu_kernel void @amdhsa_trap_num_sgprs( ptr addrspace(1) %out26, i32 %in26, ptr addrspace(1) %out27, i32 %in27, ptr addrspace(1) %out28, i32 %in28, - ptr addrspace(1) %out29, i32 %in29) { + ptr addrspace(1) %out29, i32 %in29) #0 { entry: store i32 %in0, ptr addrspace(1) %out0 store i32 %in1, ptr addrspace(1) %out1 @@ -68,3 +68,5 @@ entry: store i32 %in29, ptr addrspace(1) %out29 ret void } + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index 6d205921923d3..8389a8e86cb44 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s ; TODO: The test contains UB which is refined by the Attributor and should be removed. @@ -19,12 +18,6 @@ declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 define void @use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -37,12 +30,6 @@ define void @use_workitem_id_x() #1 { } define void @use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -55,12 +42,6 @@ define void @use_workitem_id_y() #1 { } define void @use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() @@ -73,12 +54,6 @@ define void @use_workitem_id_z() #1 { } define void @use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -91,12 +66,6 @@ define void @use_workgroup_id_x() #1 { } define void @use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -109,12 +78,6 @@ define void @use_workgroup_id_y() #1 { } define void @use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() @@ -127,12 +90,6 @@ define void @use_workgroup_id_z() #1 { } define void @use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -145,12 +102,6 @@ define void @use_dispatch_ptr() #1 { } define void @use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() @@ -163,12 +114,6 @@ define void @use_queue_ptr() #1 { } define void @use_dispatch_id() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() -; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() @@ -181,14 +126,6 @@ define void @use_dispatch_id() #1 { } define void @use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -205,11 +142,6 @@ define void @use_workgroup_id_y_workgroup_id_z() #1 { } define void @func_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -220,11 +152,6 @@ define void @func_indirect_use_workitem_id_x() #1 { } define void @kernel_indirect_use_workitem_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x() @@ -235,11 +162,6 @@ define void @kernel_indirect_use_workitem_id_x() #1 { } define void @func_indirect_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_y() @@ -250,11 +172,6 @@ define void @func_indirect_use_workitem_id_y() #1 { } define void @func_indirect_use_workitem_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workitem_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR3]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_z() @@ -265,11 +182,6 @@ define void @func_indirect_use_workitem_id_z() #1 { } define void @func_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -280,11 +192,6 @@ define void @func_indirect_use_workgroup_id_x() #1 { } define void @kernel_indirect_use_workgroup_id_x() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_x() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x ; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x() @@ -295,11 +202,6 @@ define void @kernel_indirect_use_workgroup_id_x() #1 { } define void @func_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_y() @@ -310,11 +212,6 @@ define void @func_indirect_use_workgroup_id_y() #1 { } define void @func_indirect_use_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_z() @@ -325,11 +222,6 @@ define void @func_indirect_use_workgroup_id_z() #1 { } define void @func_indirect_indirect_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() @@ -340,11 +232,6 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 { } define void @indirect_x2_use_workgroup_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y() @@ -355,11 +242,6 @@ define void @indirect_x2_use_workgroup_id_y() #1 { } define void @func_indirect_use_dispatch_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_ptr() @@ -370,11 +252,6 @@ define void @func_indirect_use_dispatch_ptr() #1 { } define void @func_indirect_use_queue_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_queue_ptr() @@ -385,11 +262,6 @@ define void @func_indirect_use_queue_ptr() #1 { } define void @func_indirect_use_dispatch_id() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_dispatch_id() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_id() @@ -400,11 +272,6 @@ define void @func_indirect_use_dispatch_id() #1 { } define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() @@ -415,13 +282,6 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { } define void @recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -436,11 +296,6 @@ define void @recursive_use_workitem_id_y() #1 { } define void @call_recursive_use_workitem_id_y() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y ; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y() @@ -451,12 +306,6 @@ define void @call_recursive_use_workitem_id_y() #1 { } define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -470,12 +319,6 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -488,13 +331,6 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 { } define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %ptr) #2 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4 -; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4) @@ -509,11 +345,6 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt } define void @indirect_use_group_to_flat_addrspacecast() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null) @@ -524,11 +355,6 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 { } define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null) @@ -539,11 +365,6 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { } define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null) @@ -554,12 +375,6 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { } define void @use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() @@ -571,11 +386,6 @@ define void @use_kernarg_segment_ptr() #1 { ret void } define void @func_indirect_use_kernarg_segment_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_kernarg_segment_ptr() @@ -586,12 +396,6 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 { } define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -604,12 +408,6 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { } define void @use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -622,11 +420,6 @@ define void @use_implicitarg_ptr() #1 { } define void @func_indirect_use_implicitarg_ptr() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: call void @use_implicitarg_ptr() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr ; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr() @@ -640,10 +433,6 @@ declare void @external.func() #3 ; This function gets deleted. define internal void @defined.func() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@defined.func -; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -652,11 +441,6 @@ define internal void @defined.func() #3 { } define void @func_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ -667,11 +451,6 @@ define void @func_call_external() #3 { } define void @func_call_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() @@ -681,11 +460,6 @@ define void @func_call_defined() #3 { ret void } define void @func_call_asm() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR3]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm ; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]] @@ -696,11 +470,6 @@ define void @func_call_asm() #3 { } define amdgpu_kernel void @kern_call_external() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external -; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] { -; AKF_HSA-NEXT: call void @external.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @external.func() @@ -711,11 +480,6 @@ define amdgpu_kernel void @kern_call_external() #3 { } define amdgpu_kernel void @func_kern_defined() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @defined.func() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined ; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @defined.func() @@ -726,12 +490,6 @@ define amdgpu_kernel void @func_kern_defined() #3 { } define i32 @use_dispatch_ptr_ret_type() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) poison, align 8 -; AKF_HSA-NEXT: ret i32 0 -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -744,12 +502,6 @@ define i32 @use_dispatch_ptr_ret_type() #1 { } define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; AKF_HSA-SAME: () #[[ATTR1]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type() @@ -762,12 +514,6 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { } define float @func_indirect_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() @@ -781,12 +527,6 @@ define float @func_indirect_call(ptr %fptr) #3 { declare float @extern() #3 define float @func_extern_call() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @extern() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern() @@ -799,12 +539,6 @@ define float @func_extern_call() #3 { } define float @func_null_call(ptr %fptr) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call -; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float null() -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call ; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null() @@ -820,12 +554,6 @@ declare float @llvm.amdgcn.rcp.f32(float) #0 ; Calls some other recognized intrinsic define float @func_other_intrinsic_call(float %arg) #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR3]] { -; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) -; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 -; AKF_HSA-NEXT: ret float [[FADD]] -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call ; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] { ; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) @@ -839,11 +567,6 @@ define float @func_other_intrinsic_call(float %arg) #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 @@ -855,11 +578,6 @@ define amdgpu_kernel void @kern_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void @func_sanitize_address() #4 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_sanitize_address -; AKF_HSA-SAME: () #[[ATTR5]] { -; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] { ; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4 @@ -871,11 +589,6 @@ define void @func_sanitize_address() #4 { ; Hostcall needs to be enabled for sanitizers define void @func_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR3]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() @@ -887,11 +600,6 @@ define void @func_indirect_sanitize_address() #3 { ; Hostcall needs to be enabled for sanitizers define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] { ; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address() @@ -906,11 +614,6 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 { declare void @extern_func_sanitize_address() #5 define amdgpu_kernel void @kern_decl_sanitize_address() #3 { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address -; AKF_HSA-SAME: () #[[ATTR4]] { -; AKF_HSA-NEXT: call void @extern_func_sanitize_address() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address ; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] { ; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address() @@ -923,10 +626,6 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 { declare void @enqueue_block_decl() #6 define internal void @enqueue_block_def() #6 { -; AKF_HSA-LABEL: define {{[^@]+}}@enqueue_block_def -; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def ; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -935,11 +634,6 @@ define internal void @enqueue_block_def() #6 { } define amdgpu_kernel void @kern_call_enqueued_block_decl() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl -; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] { -; AKF_HSA-NEXT: call void @enqueue_block_decl() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl ; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl() @@ -950,11 +644,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() { } define amdgpu_kernel void @kern_call_enqueued_block_def() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @enqueue_block_def() -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def ; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def() @@ -965,9 +654,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() { } define void @unused_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -976,9 +662,6 @@ define void @unused_enqueue_block() { } define internal void @known_func() { -; AKF_HSA-LABEL: define {{[^@]+}}@known_func() { -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] { ; ATTRIBUTOR_HSA-NEXT: ret void @@ -988,11 +671,6 @@ define internal void @known_func() { ; Should never happen define amdgpu_kernel void @kern_callsite_enqueue_block() { -; AKF_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block -; AKF_HSA-SAME: () #[[ATTR8]] { -; AKF_HSA-NEXT: call void @known_func() #[[ATTR7]] -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] { ; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]] @@ -1014,15 +692,6 @@ attributes #6 = { "enqueued-block" } !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="gfx900" } -; AKF_HSA: attributes #[[ATTR3]] = { nounwind } -; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-calls" } -; AKF_HSA: attributes #[[ATTR5]] = { nounwind sanitize_address } -; AKF_HSA: attributes #[[ATTR6:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" } -; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" } -; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 2809f0957462a..32bb22b699b61 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -33,12 +32,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -51,14 +44,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -75,14 +60,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -99,12 +76,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() @@ -117,14 +88,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -141,14 +104,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() @@ -165,16 +120,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() @@ -207,12 +152,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -225,12 +164,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() @@ -259,14 +192,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() @@ -283,16 +208,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -313,22 +228,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() @@ -361,13 +260,6 @@ define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -382,13 +274,6 @@ define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() -; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4 -; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr() @@ -417,12 +302,6 @@ define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr @@ -435,12 +314,6 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr } define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast -; AKF_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr -; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { ; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr @@ -526,13 +399,6 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 { } define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared ; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) @@ -547,13 +413,6 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 { } define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private -; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] { -; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) -; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 -; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) poison, align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private ; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] { ; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) @@ -568,12 +427,6 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 { } define amdgpu_kernel void @use_alloca() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca -; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] { -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) @@ -586,15 +439,6 @@ define amdgpu_kernel void @use_alloca() #1 { } define amdgpu_kernel void @use_alloca_non_entry_block() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: entry: -; AKF_HSA-NEXT: br label [[BB:%.*]] -; AKF_HSA: bb: -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: entry: @@ -614,12 +458,6 @@ bb: } define void @use_alloca_func() #1 { -; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; AKF_HSA-SAME: () #[[ATTR2]] { -; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) -; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4 -; AKF_HSA-NEXT: ret void -; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func ; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] { ; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) @@ -638,9 +476,6 @@ attributes #1 = { nounwind } !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 20ce05278d213..15dc1a0529254 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s declare i32 @llvm.r600.read.tgid.x() #0 @@ -27,12 +26,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -45,14 +38,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -69,14 +54,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -93,12 +70,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() @@ -111,14 +82,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -135,14 +98,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() @@ -159,16 +114,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() @@ -201,12 +146,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() @@ -219,12 +158,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() @@ -253,14 +186,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() @@ -277,16 +202,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() @@ -307,22 +222,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 { } define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 { -; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems -; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] { -; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() -; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() -; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() -; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4 -; AKF_CHECK-NEXT: ret void -; ; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems ; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() @@ -394,8 +293,6 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind } ;. -; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index fc13b86566f76..22cc5af30da66 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -35,9 +35,9 @@ entry: attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK-LABEL: {{^}}min_1024_max_1024 -; CHECK: SGPRBlocks: 0 +; CHECK: SGPRBlocks: 2 ; CHECK: VGPRBlocks: 10 -; CHECK: NumSGPRsForWavesPerEU: 2{{$}} +; CHECK: NumSGPRsForWavesPerEU: 24{{$}} ; CHECK: NumVGPRsForWavesPerEU: 43 @var = addrspace(1) global float 0.0 define amdgpu_kernel void @min_1024_max_1024() #3 { diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 46edf06c3b62c..d0107eb3ade27 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -4,8 +4,8 @@ ; ALL-LABEL: {{^}}max_10_sgprs: -; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 10 +; ALL: SGPRBlocks: 2 +; ALL: NumSGPRsForWavesPerEU: 24 define amdgpu_kernel void @max_10_sgprs() #0 { %one = load volatile i32, ptr addrspace(4) poison %two = load volatile i32, ptr addrspace(4) poison diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index 14519f5a5e77c..4507fd5865989 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; Exactly 10 waves per execution unit. ; CHECK-LABEL: {{^}}exactly_10: -; CHECK: SGPRBlocks: 2 +; CHECK: SGPRBlocks: 3 ; CHECK: VGPRBlocks: 5 -; CHECK: NumSGPRsForWavesPerEU: 20 +; CHECK: NumSGPRsForWavesPerEU: 30 ; CHECK: NumVGPRsForWavesPerEU: 24 define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, ptr addrspace(1) @var diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll index 682a57571d11e..35f0ccf5ba62f 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll @@ -392,7 +392,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -420,7 +421,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -434,7 +436,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -462,7 +465,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -476,7 +480,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -490,7 +495,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast ; GFX10: argumentInfo: @@ -504,7 +510,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() { ; GFX10: argumentInfo: ; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } -; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' } +; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' } +; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' } ; ; GFX10: name: with_indirect_call ; GFX10: argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll new file mode 100644 index 0000000000000..1b422252573db --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s + +; +; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests +; we manually set the attribute for the functions. The purpose is to test how the amdgpu-attributor pass +; handles this situation. +; +;; tests of addrspacecast + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 { + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of intrinsics + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 { + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { + call void @calls_intrin_ascast(ptr addrspace(3) %ptr) + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } + +; GFX9: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; GFX10: attributes #0 = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll new file mode 100644 index 0000000000000..51caa84450ff3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll @@ -0,0 +1,870 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX8-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX9-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GFX942-ARCH-FLAT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s + +; +; None of these functions should have the attribute amdgpu-no-flat-scratch-init. In these tests +; we manually set the attribute for the functions. The purpose is to test how llc handles this. +; + +;; tests of addrspacecast + +define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: with_private_to_flat_addrspacecast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: with_private_to_flat_addrspacecast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: s_nop 0 +; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX942-ARCH-FLAT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: with_private_to_flat_addrspacecast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 +; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX8-NEXT: s_load_dword s1, s[8:9], 0xc8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8-NEXT: s_cselect_b32 s0, s0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xc8 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s1, s1, 0 +; GFX8-ARCH-FLAT-NEXT: s_cselect_b32 s0, s0, 0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cmp_lg_u32 s2, -1 +; GFX9-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 +; GFX9-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_cmp_lg_u32 s2, -1 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s0, s1, 0 +; GFX942-ARCH-FLAT-NEXT: s_cselect_b32 s1, s2, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: with_private_to_flat_addrspacecast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s2, -1 +; GFX10-NEXT: s_cselect_b32 s0, s2, 0 +; GFX10-NEXT: s_cselect_b32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: call_with_private_to_flat_addrspacecast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s18, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[16:17] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_mov_b32 s33, s18 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8 +; GFX8-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s3 ; 4-byte Folded Spill +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX8-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX8-ARCH-FLAT-NEXT: s_add_i32 s3, s33, 8 +; GFX8-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s3 ; 4-byte Folded Reload +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: call_with_private_to_flat_addrspacecast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s18, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[16:17] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b32 s33, s18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX9-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX9-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX9-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s2, s33 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s32 +; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-ARCH-FLAT-NEXT: scratch_store_dword off, v3, s33 ; 4-byte Folded Spill +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_add_i32 s32, s32, 16 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s0, s0, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s1, s1, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s30, 0 +; GFX942-ARCH-FLAT-NEXT: v_writelane_b32 v3, s31, 1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s31, v3, 1 +; GFX942-ARCH-FLAT-NEXT: v_readlane_b32 s30, v3, 0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, s33 +; GFX942-ARCH-FLAT-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX942-ARCH-FLAT-NEXT: scratch_load_dword v3, off, s33 ; 4-byte Folded Reload +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 exec, s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s33, s2 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: call_with_private_to_flat_addrspacecast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s18, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s16, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s16 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 s33, s18 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) #0 { +; GFX8-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s0, s15 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX8-NEXT: s_add_u32 s8, s8, 8 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s8, s8, 8 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: call_with_private_to_flat_addrspacecast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, 8 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, with_private_to_flat_addrspacecast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, with_private_to_flat_addrspacecast@gotpcrel32@hi+12 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_endpgm + call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) + ret void +} + +;; tests of addrspacecast in a constant + +define amdgpu_kernel void @private_constant_expression_use(ptr addrspace(1) nocapture %out) #0 { +; GFX8-LABEL: private_constant_expression_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s2, s[8:9], 0xc8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: private_constant_expression_use: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0xc8 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-ARCH-FLAT-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: private_constant_expression_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: private_constant_expression_use: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: private_constant_expression_use: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: private_constant_expression_use: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + store volatile ptr addrspacecast (ptr addrspace(5) inttoptr (i32 123 to ptr addrspace(5)) to ptr), ptr addrspace(1) %out, align 8 + ret void +} + +;; tests of intrinsics + +define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: calls_intrin_ascast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX8-NEXT: s_load_dword s1, s[8:9], 0xcc +; GFX8-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s1, s[4:5], 0xcc +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: calls_intrin_ascast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s2 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: calls_intrin_ascast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX10-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define void @calls_intrin_ascast(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: calls_intrin_ascast: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b64 s[4:5], 0xc4 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], 0xc4 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX8-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: calls_intrin_ascast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 +; GFX9-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-ARCH-FLAT-LABEL: calls_intrin_ascast: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v2, 7 +; GFX942-ARCH-FLAT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: calls_intrin_ascast: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX10-NEXT: v_mov_b32_e32 v2, 7 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: flat_store_dword v[0:1], v2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] + %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr) + store volatile i32 7, ptr %1, align 4 + ret void +} + +define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) #0 { +; GFX8-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_add_u32 s0, s0, s15 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX8-NEXT: s_add_u32 s8, s8, 8 +; GFX8-NEXT: s_addc_u32 s9, s9, 0 +; GFX8-NEXT: s_getpc_b64 s[16:17] +; GFX8-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-NEXT: s_mov_b32 s32, 0 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-NEXT: s_endpgm +; +; GFX8-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX8-ARCH-FLAT: ; %bb.0: +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX8-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX8-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX8-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX8-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX8-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX8-ARCH-FLAT-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX8-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX8-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX8-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX8-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX8-ARCH-FLAT-NEXT: s_endpgm +; +; GFX9-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX9-NEXT: s_add_u32 s8, s8, 8 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: s_endpgm +; +; GFX9-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX9-ARCH-FLAT: ; %bb.0: +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX9-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX9-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX9-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX9-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-ARCH-FLAT-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX9-ARCH-FLAT-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX9-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX9-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX9-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-ARCH-FLAT-NEXT: s_endpgm +; +; GFX942-ARCH-FLAT-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX942-ARCH-FLAT: ; %bb.0: +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s12, s8 +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s8, s4, 8 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s13, s9 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s9, s5, 0 +; GFX942-ARCH-FLAT-NEXT: s_load_dword s15, s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_getpc_b64 s[4:5] +; GFX942-ARCH-FLAT-NEXT: s_add_u32 s4, s4, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX942-ARCH-FLAT-NEXT: s_addc_u32 s5, s5, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX942-ARCH-FLAT-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s14, s10 +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX942-ARCH-FLAT-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v31, v0 +; GFX942-ARCH-FLAT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-ARCH-FLAT-NEXT: v_mov_b32_e32 v0, s15 +; GFX942-ARCH-FLAT-NEXT: s_mov_b32 s32, 0 +; GFX942-ARCH-FLAT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX942-ARCH-FLAT-NEXT: s_endpgm +; +; GFX10-LABEL: call_calls_intrin_ascast_cc_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s15 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_load_dword s15, s[8:9], 0x0 +; GFX10-NEXT: s_add_u32 s8, s8, 8 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_getpc_b64 s[16:17] +; GFX10-NEXT: s_add_u32 s16, s16, calls_intrin_ascast@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s17, s17, calls_intrin_ascast@gotpcrel32@hi+12 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX10-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX10-NEXT: s_mov_b32 s32, 0 +; GFX10-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s15 +; GFX10-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX10-NEXT: s_endpgm + call void @calls_intrin_ascast(ptr addrspace(3) %ptr) + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 55ed11ac62972..4f341fa71cf68 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -30,9 +30,11 @@ ; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1 ; NOOPT: .amdhsa_system_sgpr_workgroup_info 0 ; NOOPT: .amdhsa_system_vgpr_workitem_id 2 -define amdgpu_kernel void @foo() { +define amdgpu_kernel void @foo() #0 { ret void } +attributes #0 = { "amdgpu-no-flat-scratch-init" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index d5da3e00df1a6..10ca3c9d5f2c8 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN-NOT: s6 -; GCN: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12 +; GCN: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12 ; GCN-NOT: s6 -; GCN: s_mov_b32 s12, s6 +; GCN: s_mov_b32 s12, s4 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm @@ -217,7 +217,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: ; GCN-NOT: s12 -; GCN: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s13, s5 ; GCN-NOT: s12 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -233,7 +233,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: ; GCN-NOT: s12 ; GCN-NOT: s13 -; GCN: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s14, s5 ; GCN-NOT: s12 ; GCN-NOT: s13 @@ -250,8 +250,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: ; GCN-NOT: s14 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s13, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s13, s5 ; GCN-NOT: s14 ; GCN: s_mov_b32 s32, 0 @@ -266,9 +266,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { } ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz: -; GCN: s_mov_b32 s12, s6 -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s12, s4 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -283,8 +283,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: ; GCN-NOT: s13 -; GCN: s_mov_b32 s12, s6 -; GCN-NEXT: s_mov_b32 s14, s7 +; GCN: s_mov_b32 s12, s4 +; GCN-NEXT: s_mov_b32 s14, s5 ; GCN-NOT: s13 ; GCN: s_mov_b32 s32, 0 @@ -300,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: -; GCN: s_mov_b32 s13, s7 -; GCN: s_mov_b32 s14, s8 +; GCN: s_mov_b32 s13, s5 +; GCN: s_mov_b32 s14, s6 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -382,7 +382,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN-DAG: s_mov_b32 s12, s6 +; GCN-DAG: s_mov_b32 s12, s4 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-NOT: s13 ; GCN-NOT: s14 @@ -400,7 +400,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s13, s7 +; GCN-DAG: s_mov_b32 s13, s5 ; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -415,7 +415,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN-DAG: s_mov_b32 s14, s7 +; GCN-DAG: s_mov_b32 s14, s5 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 @@ -474,7 +474,7 @@ define hidden void @use_every_sgpr_input() #1 { ; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 ; GCN: .amdhsa_user_sgpr_private_segment_size 0 ; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 @@ -499,7 +499,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { ; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 -; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 ; GCN: .amdhsa_user_sgpr_private_segment_size 0 ; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll index ee4a2ed883b63..3fe3cafd729a7 100644 --- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll @@ -68,7 +68,7 @@ define amdgpu_kernel void @fadd( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: %a.val = load float, ptr addrspace(1) %a %b.val = load float, ptr addrspace(1) %b @@ -80,7 +80,7 @@ entry: define amdgpu_kernel void @fsub( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: %a.val = load float, ptr addrspace(1) %a %b.val = load float, ptr addrspace(1) %b @@ -99,7 +99,9 @@ define amdgpu_kernel void @empty( i32 %i, ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) { + ptr addrspace(1) %b) #0 { entry: ret void } + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index c17cf1cd6bca4..c167834470e3b 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,6 +5,9 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 { ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: s_add_i32 s12, s12, s17 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 3 diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll index fcb8fa5997b7e..fc17d9288bf40 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,6 +6,8 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 39554e05c96b4..f964170ccdda5 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -1,11 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -14,14 +10,6 @@ define internal void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() #0 { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -40,7 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 { attributes #0 = { "amdgpu-no-dispatch-id" } ;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 9104dc68eb9b4..72913d2596ebf 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s15 +; CHECK-NEXT: s_add_u32 s24, s24, s17 ; CHECK-NEXT: s_addc_u32 s25, s25, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index f3aec696abdee..e6f02295e67d5 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-LABEL: s_add_co_br_user: ; GFX7: ; %bb.0: ; %bb ; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_i32 s0, s2, s2 ; GFX7-NEXT: s_cmp_lt_u32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 0c25ca5076790..fac9f5bf826a6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -5,6 +5,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v1i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -18,6 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -32,6 +38,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v2i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -54,6 +63,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,6 +92,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v3i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -102,6 +117,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -128,6 +146,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v4i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -150,6 +171,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -176,6 +200,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v8i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 @@ -192,10 +219,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { ; VI-LABEL: extract_vector_elt_v8i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v3 @@ -213,6 +243,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v16i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -235,6 +268,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -261,6 +297,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v32i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s0, s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s0, 16 @@ -277,10 +316,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { ; VI-LABEL: extract_vector_elt_v32i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v3 @@ -298,6 +340,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 { define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 { ; SI-LABEL: extract_vector_elt_v64i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; SI-NEXT: s_load_dword s2, s[8:9], 0x10 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -320,6 +365,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -351,6 +399,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v2i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s2, s[8:9], 0xa ; SI-NEXT: s_load_dword s3, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -370,11 +421,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out ; VI-NEXT: s_load_dword s2, s[8:9], 0x4c ; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_lshr_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -388,6 +442,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v3i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dword s2, s[8:9], 0x13 ; SI-NEXT: s_load_dword s3, s[8:9], 0xa ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -406,10 +463,13 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out ; VI-NEXT: s_load_dword s2, s[8:9], 0x4c ; VI-NEXT: s_load_dword s3, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s2, s2, 3 ; VI-NEXT: s_lshr_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -424,6 +484,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v4i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_load_dword s4, s[8:9], 0xc ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -442,6 +505,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -463,6 +529,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { ; SI-LABEL: dynamic_extract_vector_elt_v8i8: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: s_load_dword s4, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -481,6 +550,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -502,6 +574,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_0123: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -526,6 +601,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -558,6 +636,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 { define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_0145: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -581,6 +662,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -612,6 +696,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 { define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { ; SI-LABEL: reduce_load_vector_v8i8_extract_45: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 4 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -628,6 +715,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 4 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -649,6 +739,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 { define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 { ; SI-LABEL: reduce_load_vector_v16i8_extract_0145: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -672,6 +765,9 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 { ; VI: ; %bb.0: ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s2, s0, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 32f75f3835226..7b6a363c42708 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -14,6 +14,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -26,6 +29,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -80,6 +86,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -92,6 +101,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -145,6 +157,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -157,6 +172,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -196,6 +214,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -209,6 +230,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff @@ -251,6 +275,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; CI-LABEL: fabs_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -268,6 +295,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -325,6 +355,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -339,6 +372,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -386,6 +422,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -398,6 +437,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -441,6 +483,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -467,6 +512,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -523,9 +571,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_lshr_b32 s2, s4, 16 @@ -551,9 +602,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -610,6 +664,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -633,6 +690,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -718,6 +778,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -736,6 +799,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll index 6496b70b4d697..60334e46a4454 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -216,8 +222,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -227,6 +235,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 @@ -243,6 +252,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -351,6 +363,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 @@ -361,6 +376,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 +; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17 +; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 4e12a30c6f6f4..9919497acea73 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -76,6 +79,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -87,6 +93,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -132,6 +141,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -184,6 +196,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -237,6 +252,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -289,6 +307,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou ; GFX678-LABEL: test_fold_canonicalize_undef_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -328,6 +349,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -367,6 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -409,6 +436,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -449,6 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, -1.0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -489,6 +522,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -529,6 +565,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -568,10 +607,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -612,10 +654,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -656,10 +701,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: s_mov_b32 s2, 0x7fffff -; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 +; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 ; GFX678-NEXT: flat_store_dword v[0:1], v2 ; GFX678-NEXT: s_endpgm @@ -700,6 +748,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -740,6 +791,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -782,6 +836,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -822,6 +879,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -862,6 +922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -902,6 +965,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -942,6 +1008,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -982,6 +1051,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1022,6 +1094,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1062,6 +1137,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 @@ -1102,6 +1180,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1 ; GFX678-LABEL: v_test_canonicalize_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1153,6 +1234,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX6-LABEL: s_test_canonicalize_var_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3] ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1163,6 +1247,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do ; GFX8-LABEL: s_test_canonicalize_var_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -1205,6 +1292,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1257,6 +1347,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1310,6 +1403,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v0, s0 ; GFX678-NEXT: v_mov_b32_e32 v1, s1 @@ -1362,10 +1458,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1407,10 +1506,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1450,10 +1552,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_p1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1491,10 +1596,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out) ; GFX678-LABEL: test_fold_canonicalize_n1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1532,10 +1640,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) % ; GFX678-LABEL: test_fold_canonicalize_literal_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1573,10 +1684,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, v0 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, v0 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1618,10 +1732,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1662,10 +1779,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1705,10 +1825,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, -1 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1749,10 +1872,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out ; GFX678-LABEL: test_fold_canonicalize_qnan_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1790,10 +1916,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1831,10 +1960,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1872,10 +2004,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1913,10 +2048,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1954,10 +2092,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -1995,10 +2136,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace( ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX678-NEXT: s_add_i32 s12, s12, s17 +; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX678-NEXT: v_mov_b32_e32 v0, 0 -; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: s_waitcnt lgkmcnt(0) ; GFX678-NEXT: v_mov_b32_e32 v3, s1 +; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000 ; GFX678-NEXT: v_mov_b32_e32 v2, s0 ; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX678-NEXT: s_endpgm @@ -2037,6 +2181,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2054,6 +2201,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2117,6 +2267,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2134,6 +2287,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2197,6 +2353,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2215,6 +2374,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2279,6 +2441,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2302,6 +2467,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2368,6 +2536,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2385,6 +2556,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2448,6 +2622,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2465,6 +2642,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2529,6 +2709,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2547,6 +2730,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2612,6 +2798,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -2635,6 +2824,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2700,6 +2892,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: s_add_i32 s12, s12, s17 +; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -2717,6 +2912,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index fee6540f43c64..fc316b736d5f1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -22,12 +22,14 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define amdgpu_kernel void @test(ptr addrspace(1) %out) { +define amdgpu_kernel void @test(ptr addrspace(1) %out) #0 { entry: store i32 0, ptr addrspace(1) %out ret void } +; ALL-LABEL: {{^}}test_addr64: + ; HSA-DEFAULT: flat_store_dword ; HSA-NODEFAULT: buffer_store_dword ; HSA-NOADDR64: flat_store_dword @@ -35,7 +37,7 @@ entry: ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) { +define amdgpu_kernel void @test_addr64(ptr addrspace(1) %out) #0 { entry: %out.addr = alloca ptr addrspace(1), align 4, addrspace(5) @@ -51,5 +53,7 @@ entry: ret void } +attributes #0 = { "amdgpu-no-flat-scratch-init" } + !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 45223a24e021a..a59382ba20dc5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -8,28 +8,34 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s ; GCN-LABEL: {{^}}no_vcc_no_flat: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 8 ; VI-NOXNACK: ; TotalNumSgprs: 8 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 8 ; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 +; HSA-VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{s7}"() @@ -41,12 +47,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 10 ; VI-NOXNACK: ; TotalNumSgprs: 10 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 10 ; VI-XNACK: ; TotalNumSgprs: 12 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; HSA-VI-XNACK: ; TotalNumSgprs: 12 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc}"() @@ -58,12 +70,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 12 ; VI-NOXNACK: ; TotalNumSgprs: 14 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 ; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8 +; HSA-VI-XNACK: ; TotalNumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{flat_scratch}"() @@ -75,12 +93,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: ; TotalNumSgprs: 12 ; VI-NOXNACK: ; TotalNumSgprs: 14 +; HSA-VI-NOXNACK: ; TotalNumSgprs: 24 ; VI-XNACK: ; TotalNumSgprs: 14 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10 +; HSA-VI-XNACK: ; TotalNumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"() @@ -95,12 +119,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch}"() @@ -115,9 +145,13 @@ entry: ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_lo() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"() @@ -129,12 +163,18 @@ entry: ; NOT-HSA-CI: .amdhsa_reserve_xnack_mask ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 +; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1 ; CI: NumSgprs: 4 ; VI-NOXNACK: NumSgprs: 6 +; HSA-VI-NOXNACK: NumSgprs: 24 ; VI-XNACK: NumSgprs: 6 -; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6 -; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0 +; HSA-VI-XNACK: NumSgprs: 24 +; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6 +; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6 +; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0 +; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0 define amdgpu_kernel void @use_flat_scr_hi() #0 { entry: call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"() diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 64be9cb72a6ee..fb2448fb80744 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_fadd_use_test_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0 @@ -80,8 +83,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-NEXT: s_load_dword s3, s[8:9], 0x2c +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: v_add_f32_e64 v2, s4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -139,6 +145,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; VI-LABEL: multiple_use_fadd_fmad_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s4, s0, 4 @@ -194,6 +203,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s6, s4, 4 ; VI-NEXT: v_mov_b32_e32 v0, s1 @@ -255,6 +267,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 @@ -303,10 +318,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f32_e32 v0, s2, v0 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -350,6 +368,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -368,6 +389,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0 @@ -482,6 +506,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -503,6 +530,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -599,6 +629,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-DENORM: ; %bb.0: ; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3 @@ -620,6 +653,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 ; VI-FLUSH: ; %bb.0: ; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3 @@ -718,6 +754,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8 +; VI-DENORM-NEXT: s_add_i32 s12, s12, s17 +; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0 @@ -725,6 +763,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2 +; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3 ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0 @@ -741,6 +780,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8 +; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17 +; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0 @@ -748,6 +789,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2 +; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0 @@ -847,6 +889,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 @@ -898,10 +943,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mul_f16_e32 v0, s2, v0 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 67bec43078803..eca8c2837b0fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0| ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -26,6 +29,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -83,6 +89,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; CI-LABEL: fneg_fabs_fmul_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s1, s0, 0x7fff ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -101,6 +110,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -162,6 +174,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -174,6 +189,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -229,6 +247,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -241,6 +262,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitset1_b32 s2, 15 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -294,6 +318,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace( ; CIVI-LABEL: v_fneg_fabs_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -348,6 +375,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 @@ -370,7 +400,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -379,6 +411,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, < ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -421,6 +454,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -433,6 +469,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_or_b32 s2, s2, 0x80008000 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -473,6 +512,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; CIVI-LABEL: fneg_fabs_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000 @@ -516,6 +558,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; CI-LABEL: fold_user_fneg_fabs_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1| @@ -537,7 +582,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -545,6 +592,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -584,6 +632,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -601,6 +652,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff @@ -655,6 +709,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010 @@ -679,7 +736,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s1, s4, 16 @@ -688,6 +747,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 781a2ca3146f5..058c273a65d99 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1477,6 +1477,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bitcmp1_b32 s6, 0 ; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 @@ -1488,6 +1490,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i ; GFX7-NEXT: s_cselect_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 23e4ba9fd4ed7..98e0b27cd955d 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -11,6 +11,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -23,6 +26,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -78,6 +84,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -92,6 +101,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -152,6 +164,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -164,6 +179,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -217,6 +235,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; CI-LABEL: v_fneg_fold_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -234,6 +255,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX8-LABEL: v_fneg_fold_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -289,6 +313,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -301,6 +328,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) # ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -340,14 +370,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 { ; CIVI-LABEL: s_fneg_v2f16_nonload: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 ; CIVI-NEXT: ;;#ASMSTART ; CIVI-NEXT: ; def s2 ; CIVI-NEXT: ;;#ASMEND ; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000 -; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm ; @@ -388,6 +421,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 @@ -402,6 +438,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -449,6 +488,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_xor_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -461,6 +503,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -501,6 +546,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; CI-LABEL: v_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -527,6 +575,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac ; GFX8-LABEL: v_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 @@ -572,6 +623,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; CI-LABEL: v_extract_fneg_fold_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -593,6 +647,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 { ; GFX8-LABEL: v_extract_fneg_fold_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -672,6 +729,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index a2fca33af1046..10573aad38a51 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -46,6 +52,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -57,6 +66,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -81,6 +93,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; CIVI-LABEL: load_v3f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_add_u32 s4, s0, 4 ; CIVI-NEXT: s_addc_u32 s5, s1, 0 @@ -114,6 +129,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; CIVI-LABEL: load_v4f16_arg: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v2, s2 @@ -139,6 +157,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -153,6 +174,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -183,6 +207,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -196,6 +223,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -227,6 +257,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -238,6 +271,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -265,6 +301,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s3, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -278,6 +317,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 @@ -308,6 +350,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 @@ -321,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -351,6 +399,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: s_lshr_b32 s5, s2, 16 @@ -366,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -401,6 +455,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s1, 16 ; CI-NEXT: s_lshr_b32 s7, s0, 16 @@ -429,6 +486,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s1, 16 ; VI-NEXT: s_lshr_b32 s7, s0, 16 @@ -485,6 +545,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; CI-LABEL: extload_f16_to_f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -498,6 +561,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a ; VI-LABEL: extload_f16_to_f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -529,6 +595,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; CI-LABEL: extload_v2f16_to_v2f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -545,6 +614,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 ; VI-LABEL: extload_v2f16_to_v2f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s0, s[8:9], 0x8 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 @@ -582,6 +654,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; CI-NEXT: s_lshr_b32 s4, s2, 16 @@ -603,6 +678,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 @@ -648,6 +726,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -673,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 @@ -726,6 +810,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s6, s3, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 @@ -773,6 +860,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s6, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 @@ -858,6 +948,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr ; CIVI-LABEL: global_load_store_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -886,6 +979,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v2f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -914,6 +1010,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add ; CIVI-LABEL: global_load_store_v4f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -942,6 +1041,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: global_load_store_v8f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -970,6 +1072,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f32: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1001,6 +1106,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1017,6 +1125,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1052,6 +1163,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1069,6 +1183,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1106,6 +1223,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1125,6 +1245,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1165,6 +1288,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1195,6 +1321,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1251,6 +1380,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 16 ; CI-NEXT: v_mov_b32_e32 v5, s3 @@ -1309,6 +1441,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1406,6 +1541,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr ; CIVI-LABEL: global_extload_f16_to_f64: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -1440,6 +1578,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v2f16_to_v2f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1458,6 +1599,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v2f16_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1498,6 +1642,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v3f16_to_v3f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1523,6 +1670,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v3f16_to_v3f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1574,6 +1724,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v4f16_to_v4f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1602,6 +1755,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v4f16_to_v4f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1659,6 +1815,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; CI-LABEL: global_extload_v8f16_to_v8f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1707,6 +1866,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ; VI-LABEL: global_extload_v8f16_to_v8f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1791,6 +1953,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; CI-LABEL: global_extload_v16f16_to_v16f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1885,6 +2050,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out ; VI-LABEL: global_extload_v16f16_to_v16f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2039,6 +2207,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p ; CIVI-LABEL: global_truncstore_f32_to_f16: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 @@ -2070,6 +2241,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v2f32_to_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2087,6 +2261,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v2f32_to_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2123,6 +2300,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v3f32_to_v3f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2146,6 +2326,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v3f32_to_v3f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2191,6 +2374,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v4f32_to_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2212,6 +2398,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v4f32_to_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2254,6 +2443,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; CI-LABEL: global_truncstore_v8f32_to_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2289,6 +2481,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou ; VI-LABEL: global_truncstore_v8f32_to_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2352,6 +2547,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; CI-LABEL: global_truncstore_v16f32_to_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s4, s2, 32 ; CI-NEXT: s_addc_u32 s5, s3, 0 @@ -2420,6 +2618,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) % ; VI-LABEL: global_truncstore_v16f32_to_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -2530,6 +2731,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; CI-LABEL: fadd_f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s0, s[8:9], 0x2 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s0, 16 @@ -2547,6 +2751,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 @@ -2577,6 +2784,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s4, s2, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 @@ -2598,6 +2808,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s3, 16 ; VI-NEXT: s_lshr_b32 s5, s2, 16 @@ -2629,6 +2842,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; CI-LABEL: fadd_v4f16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -2666,6 +2882,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-LABEL: fadd_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -2706,6 +2925,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s10, s0, 16 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 @@ -2764,6 +2986,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s10, s7, 16 ; VI-NEXT: s_lshr_b32 s11, s3, 16 @@ -2824,6 +3049,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr ; CIVI-LABEL: test_bitcast_from_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 @@ -2853,6 +3081,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs ; CIVI-LABEL: test_bitcast_to_half: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: v_mov_b32_e32 v0, s2 ; CIVI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 8c017fa5ec263..cd89a36fe538b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -23,7 +23,7 @@ define amdgpu_kernel void @test( ptr addrspace(1) %r, ptr addrspace(1) %a, - ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" { + ptr addrspace(1) %b) "amdgpu-no-implicitarg-ptr" "amdgpu-no-flat-scratch-init" { entry: %a.val = load half, ptr addrspace(1) %a %b.val = load half, ptr addrspace(1) %b @@ -170,7 +170,7 @@ define amdgpu_kernel void @num_spilled_vgprs() #1 { ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 1 -attributes #0 = { "amdgpu-num-sgpr"="20" } +attributes #0 = { "amdgpu-num-sgpr"="20" "amdgpu-no-flat-scratch-init" } attributes #1 = { "amdgpu-num-vgpr"="20" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll index 5a2a976e23846..024593c49dba1 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa.ll @@ -43,7 +43,7 @@ ; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269 ; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978 ; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63 -; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C +; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C ; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C ; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072 ; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370 @@ -59,7 +59,7 @@ ; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172 ; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D ; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB -; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367 +; ELF: 0210: 2E736770 725F636F 756E740C B12E7367 ; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7 ; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E ; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76 diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll index b4b6bef7a7ed3..ec80efc5f0362 100644 --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -7,7 +7,7 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck --check-prefixes=GFX9V5 %s -define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) { +define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) #0 { ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -109,7 +109,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr ret void } -define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_shared: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40 @@ -163,7 +163,7 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) { ret void } -define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) { +define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_is_private: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44 @@ -283,7 +283,7 @@ define amdgpu_kernel void @llvm_ubsantrap() { unreachable } -define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) { +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 { ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 @@ -389,3 +389,5 @@ declare void @llvm.debugtrap() !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 CODE_OBJECT_VERSION} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 9eb966b4e2a94..0ca180ed6e105 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %14 ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7208970 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7208969 /* reguse:SGPR_128 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %14 ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:VReg_128_Align2 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -46,16 +46,17 @@ define amdgpu_kernel void @v_input_output_i128() { define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %12 - ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6029322 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6029321 /* reguse:AReg_128 */, %14 + ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %10 - ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:AReg_128_Align2 */, %12 ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll index 75db7571444bc..b51cb9df8d784 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_0: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI-LABEL: s_insertelement_v2bf16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -531,14 +552,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -611,14 +635,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -689,14 +716,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -769,14 +799,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -853,9 +886,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -948,9 +984,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -1065,9 +1104,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 97c97ac8a7ad3..e11900ac0ca68 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -21,6 +21,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -68,6 +71,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -84,6 +90,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -152,6 +161,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -172,6 +184,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -251,6 +266,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -266,6 +284,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -320,6 +341,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -339,6 +363,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -407,6 +434,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -429,6 +459,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -498,6 +531,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2i16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -544,6 +580,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -560,6 +599,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -623,6 +665,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_0: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -668,6 +713,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CIVI-LABEL: s_insertelement_v2f16_1: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 @@ -714,6 +762,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -732,6 +783,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -788,9 +842,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -807,9 +864,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -880,6 +940,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -898,6 +961,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -953,6 +1019,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -971,6 +1040,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1038,6 +1110,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1056,6 +1131,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1123,6 +1201,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1141,6 +1222,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1195,6 +1279,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1213,6 +1300,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1267,6 +1357,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1285,6 +1378,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1353,6 +1449,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1371,6 +1470,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 @@ -1445,6 +1547,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1464,6 +1569,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1526,9 +1634,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1547,9 +1658,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 @@ -1612,11 +1726,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1639,11 +1756,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1712,14 +1832,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, s4, v0, v4 @@ -1731,9 +1854,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1790,14 +1916,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v0, v0, s4, v4 @@ -1809,9 +1938,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1883,14 +2015,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -1902,9 +2037,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -1961,14 +2099,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, v1, s4, v4 @@ -1980,9 +2121,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2054,14 +2198,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_perm_b32 v1, s4, v1, v4 @@ -2073,9 +2220,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2138,6 +2288,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: flat_load_dword v4, v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2165,6 +2318,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) % ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: flat_load_dword v4, v[0:1] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 @@ -2268,9 +2424,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -2294,9 +2453,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -2363,9 +2525,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 @@ -2383,9 +2548,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2457,9 +2625,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2477,9 +2648,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2568,9 +2742,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v5, s1 @@ -2622,9 +2799,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s1 @@ -2803,11 +2983,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2830,9 +3013,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s3 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4 @@ -2923,12 +3109,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2936,6 +3124,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_perm_b32 v3, s4, v3, v12 @@ -2949,11 +3138,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3087,11 +3279,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3184,11 +3379,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3] ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll index f0609f62a9024..5dff7372ab561 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -7,6 +7,9 @@ define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) { ; CHECK-LABEL: use_group_to_global_addrspacecast: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-NEXT: s_add_i32 s12, s12, s17 +; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v0 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll index 621187100f323..55a5d50f06bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll @@ -6,6 +6,8 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_lshr_b32 s4, s6, 16 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll index 496a1c652da25..1a32953305bbc 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -7,7 +7,7 @@ declare void @llvm.trap() #0 ; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 ; DOORBELL-NEXT: .amdhsa_kernarg_size 8 -; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14 ; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; DOORBELL: .end_amdhsa_kernel diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 4b6cc32522f5b..7179f687c70f2 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -47,11 +47,7 @@ ; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O0-NEXT: Expand reduction intrinsics -; GCN-O0-NEXT: CallGraph Construction -; GCN-O0-NEXT: Call Graph SCC Pass Manager -; GCN-O0-NEXT: AMDGPU Annotate Kernel Features -; GCN-O0-NEXT: FunctionPass Manager -; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O0-NEXT: CallGraph Construction ; GCN-O0-NEXT: Call Graph SCC Pass Manager @@ -232,11 +228,7 @@ ; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining) ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-NEXT: Expand reduction intrinsics -; GCN-O1-NEXT: CallGraph Construction -; GCN-O1-NEXT: Call Graph SCC Pass Manager -; GCN-O1-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager @@ -531,11 +523,7 @@ ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics ; GCN-O1-OPTS-NEXT: Early CSE -; GCN-O1-OPTS-NEXT: CallGraph Construction -; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features -; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager @@ -848,11 +836,7 @@ ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics ; GCN-O2-NEXT: Expand reduction intrinsics ; GCN-O2-NEXT: Early CSE -; GCN-O2-NEXT: CallGraph Construction -; GCN-O2-NEXT: Call Graph SCC Pass Manager -; GCN-O2-NEXT: AMDGPU Annotate Kernel Features -; GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager @@ -1180,11 +1164,7 @@ ; GCN-O3-NEXT: Lazy Block Frequency Analysis ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Global Value Numbering -; GCN-O3-NEXT: CallGraph Construction -; GCN-O3-NEXT: Call Graph SCC Pass Manager -; GCN-O3-NEXT: AMDGPU Annotate Kernel Features -; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index f93d80cc7adf8..4edd0357c6e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -30,9 +30,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -59,10 +62,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -133,6 +139,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-SDAG: ; %bb.0: ; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 ; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -166,6 +175,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 637d8388cddf1..9d078f7906b4d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -63,9 +63,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -92,10 +95,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33 ; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc ; CI-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -200,6 +206,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-SDAG: ; %bb.0: ; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1 ; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33 +; CI-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1 ; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -233,6 +242,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { ; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33 +; CI-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0 ; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll index 97219a8f143ce..0fe371c1b51fe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) { define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: kernel_lds_id: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GCN-NEXT: s_add_i32 s2, s12, 42 +; GCN-NEXT: s_add_i32 s2, s14, 42 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -74,6 +77,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 { ; GCN-LABEL: doesnt_use_it: ; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v2, 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 55fa02a0c582c..cc9e34be209b4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -284,6 +284,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -294,6 +297,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -309,10 +315,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -321,10 +330,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -337,10 +349,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -349,11 +364,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -366,12 +384,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-SDAG-LABEL: test_readfirstlane_m0: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -379,12 +400,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) { ; CHECK-GISEL-LABEL: test_readfirstlane_m0: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -398,25 +422,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -430,13 +460,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -444,13 +477,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -464,13 +500,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -478,13 +517,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index edb6ebcee1325..f2b0959cc706e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32 ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-SDAG-LABEL: test_readlane_m0_sreg: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 m0, -1 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm @@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src ; CHECK-GISEL-LABEL: test_readlane_m0_sreg: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 m0, -1 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v0 ; CHECK-SDAG-NEXT: ;;#ASMEND +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; @@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: ; def v0 ; CHECK-GISEL-NEXT: ;;#ASMEND +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: ; def v[0:1] ; CHECK-SDAG-NEXT: ;;#ASMEND ; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32 ; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1 ; CHECK-GISEL-NEXT: ; def v[0:1] ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b32 s2, 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2 ; CHECK-SDAG-NEXT: s_endpgm ; ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b32 s2, 0 ; CHECK-GISEL-NEXT: ;;#ASMEND -; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2 ; CHECK-GISEL-NEXT: s_endpgm @@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm @@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-SDAG: ; %bb.0: ; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17 +; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CHECK-SDAG-NEXT: ;;#ASMSTART ; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-SDAG-NEXT: ;;#ASMEND -; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-SDAG-NEXT: s_endpgm @@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64: ; CHECK-GISEL: ; %bb.0: ; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17 ; CHECK-GISEL-NEXT: ;;#ASMSTART ; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-GISEL-NEXT: ;;#ASMEND ; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CHECK-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 04d179478590b..4ac2cc98970b5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-SDAG-LABEL: test_writelane_sreg_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -53,6 +56,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s ; GFX802-GISEL-LABEL: test_writelane_sreg_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 @@ -98,6 +104,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -147,6 +156,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -202,6 +214,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 @@ -251,6 +266,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 @@ -306,6 +324,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -348,6 +369,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -396,6 +420,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -444,6 +471,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -498,11 +528,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -551,11 +584,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -609,6 +645,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -668,6 +707,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -738,6 +780,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -803,6 +848,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -877,7 +925,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -886,6 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1] ; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 @@ -946,7 +997,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -956,6 +1009,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p ; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1028,15 +1082,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-SDAG-NEXT: ;;#ASMSTART ; GFX802-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX802-SDAG-NEXT: ;;#ASMEND +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: s_mov_b32 s4, m0 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2 ; GFX802-SDAG-NEXT: s_endpgm ; @@ -1081,15 +1138,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32 ; GFX802-GISEL-NEXT: ;;#ASMSTART ; GFX802-GISEL-NEXT: s_mov_b32 m0, -1 ; GFX802-GISEL-NEXT: ;;#ASMEND +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: s_mov_b32 s4, m0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s2 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3 ; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0 +; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX802-GISEL-NEXT: s_endpgm ; @@ -1138,6 +1198,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1180,6 +1243,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1227,6 +1293,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-SDAG-LABEL: test_writelane_imm_i64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1270,6 +1339,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr ; GFX802-GISEL-LABEL: test_writelane_imm_i64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1319,6 +1391,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-SDAG-LABEL: test_writelane_imm_f64: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 @@ -1362,6 +1437,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double ; GFX802-GISEL-LABEL: test_writelane_imm_f64: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1412,6 +1490,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 @@ -1449,6 +1530,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 @@ -1492,10 +1576,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1538,11 +1625,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1589,10 +1679,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s6 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3 ; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0 @@ -1635,11 +1728,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval, ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18 ; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX802-GISEL-NEXT: s_mov_b32 m0, s6 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3 @@ -1684,7 +1780,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32: ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-SDAG-NEXT: s_mov_b32 m0, s3 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0 @@ -1716,7 +1815,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out, ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32: ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s3 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -1754,11 +1856,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1797,11 +1902,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1845,11 +1953,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-SDAG: ; %bb.0: ; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10 +; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0 @@ -1888,11 +1999,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out, ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10 ; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000 ; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX802-GISEL-NEXT: s_mov_b32 m0, s4 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0 ; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0 ; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 6f95364ac3644..919c1dfd4694e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_f64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -90,7 +93,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu ; ; GFX7-HSA-LABEL: constant_load_2v4f64: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 51dfbda53ad4c..817c5def5614f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: constant_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -117,6 +120,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -188,6 +194,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 @@ -286,6 +295,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -360,6 +372,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp ; GCN-HSA-LABEL: constant_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -445,6 +460,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs ; GCN-HSA-LABEL: constant_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -584,6 +602,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GCN-HSA-LABEL: constant_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 @@ -831,6 +852,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -906,6 +930,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -982,6 +1009,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1057,6 +1087,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1131,6 +1164,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1216,6 +1252,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1305,6 +1344,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1402,6 +1444,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1504,6 +1549,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1610,6 +1658,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1727,6 +1778,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1885,6 +1939,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2062,6 +2119,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2324,6 +2384,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2631,7 +2694,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3112,7 +3178,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3680,7 +3749,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4596,7 +4668,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5383,6 +5458,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5480,6 +5558,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; GCN-HSA-LABEL: constant_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5578,6 +5659,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5670,6 +5754,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5767,12 +5854,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 ; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff @@ -5877,6 +5967,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -5980,10 +6073,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 @@ -6136,6 +6232,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6292,10 +6391,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 @@ -6510,6 +6612,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6771,10 +6876,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 @@ -7156,6 +7264,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7631,7 +7742,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8354,7 +8468,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 120f47a277ee6..68a6a148819e8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac ; GFX7-HSA-LABEL: constant_load_i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -103,6 +106,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v2i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -190,6 +196,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -284,6 +293,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v4i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -383,6 +395,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v8i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16 @@ -517,6 +532,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v9i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -678,6 +696,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v10i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -847,6 +868,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v11i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1023,6 +1047,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs ; GFX7-HSA-LABEL: constant_load_v12i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8 ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 @@ -1202,7 +1229,10 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v16i32: ; GFX7-HSA: ; %bb.0: ; %entry +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48 @@ -1389,6 +1419,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1473,6 +1506,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1563,6 +1599,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 @@ -1647,6 +1686,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -1739,12 +1781,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1837,6 +1882,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1949,13 +1997,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2082,6 +2133,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2244,8 +2298,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -2253,6 +2309,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11 @@ -2452,6 +2509,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2748,7 +2808,10 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3196,7 +3259,10 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3628,7 +3694,10 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) % ; ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4479,8 +4548,10 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 @@ -4509,6 +4580,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) % ; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90 ; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 @@ -5097,7 +5169,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs ; ; GFX7-HSA-LABEL: constant_load_v32i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index b3e75e767ae64..2219ceea7ec9b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac ; GFX7-LABEL: constant_load_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -95,6 +98,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v2i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v4, s0 @@ -179,6 +185,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v3i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 @@ -294,6 +303,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp ; GFX7-LABEL: constant_load_v4i64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-NEXT: s_add_u32 s10, s8, 16 @@ -421,7 +433,10 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp ; ; GFX7-LABEL: constant_load_v8i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-NEXT: s_add_u32 s18, s16, 48 @@ -638,7 +653,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs ; ; GFX7-LABEL: constant_load_v16i64: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_add_i32 s12, s12, s17 ; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index c608bef3f726e..4031be65fab61 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace ; GFX7-HSA-LABEL: constant_load_i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -112,6 +115,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v2i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -195,6 +201,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v3i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -305,6 +314,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v4i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -374,6 +386,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa ; GFX7-HSA-LABEL: constant_load_v8i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -448,6 +463,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp ; GFX7-HSA-LABEL: constant_load_v16i8: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -529,6 +547,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -604,6 +625,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -680,6 +704,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -755,6 +782,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -834,6 +864,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -933,6 +966,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1030,6 +1066,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1131,6 +1170,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32: ; GFX7-HSA: ; %bb.0: ; %entry ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 @@ -1232,6 +1274,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1336,6 +1381,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1453,6 +1501,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1612,6 +1663,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,6 +1848,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2060,6 +2117,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2374,6 +2434,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -2856,6 +2919,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -3437,7 +3503,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4353,7 +4422,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o ; ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32: ; GFX7-HSA: ; %bb.0: +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 ; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -5161,6 +5233,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5243,6 +5318,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5328,6 +5406,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5408,6 +5489,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5496,6 +5580,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5603,6 +5690,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5716,10 +5806,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24 @@ -5854,6 +5947,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6013,10 +6109,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 @@ -6235,6 +6334,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -6504,10 +6606,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24 @@ -6898,6 +7003,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7387,10 +7495,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24 ; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24 @@ -8128,6 +8239,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -8898,6 +9012,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -8982,6 +9099,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9068,6 +9188,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9152,6 +9275,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9241,6 +9367,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9340,6 +9469,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -9452,6 +9584,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9560,6 +9695,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 @@ -9683,6 +9821,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -9832,6 +9973,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -10014,6 +10158,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10261,6 +10408,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -10574,6 +10724,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -11018,6 +11171,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX7-HSA: ; %bb.0: ; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17 +; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index c5771bc73b945..9054e509cde8e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace( ; GCN-HSA-LABEL: global_load_i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v2i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v3i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -339,6 +348,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v4i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -424,6 +436,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac ; GCN-HSA-LABEL: global_load_v8i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -512,6 +527,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa ; GCN-HSA-LABEL: global_load_v16i16: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -662,6 +680,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a ; GCN-HSA-LABEL: global_load_v16i16_align2: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -811,6 +832,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -896,6 +920,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -984,6 +1011,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1069,6 +1099,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1159,6 +1192,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1258,6 +1294,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1359,6 +1398,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1469,6 +1511,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1586,6 +1631,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1701,6 +1749,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1823,6 +1874,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1972,6 +2026,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2136,6 +2193,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2372,6 +2432,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2643,6 +2706,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3054,6 +3120,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3573,6 +3642,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -4377,6 +4449,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5142,6 +5217,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_zextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5239,6 +5317,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr ; GCN-HSA-LABEL: global_sextload_i16_to_i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5334,6 +5415,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5426,6 +5510,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5524,6 +5611,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5633,6 +5723,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5751,6 +5844,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -5896,6 +5992,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6056,10 +6155,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6074,8 +6173,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4 @@ -6275,6 +6377,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6525,10 +6630,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -6545,7 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8 @@ -6905,6 +7013,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -7376,6 +7487,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -8078,6 +8192,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 033a66abcedb9..e8c862a3cb93c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace( ; GCNX3-HSA-LABEL: global_load_i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v2i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v3i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v4i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v8i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac ; GCNX3-HSA-LABEL: global_load_v9i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v10i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v11i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v12i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 @@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v16i32: ; GCNX3-HSA: ; %bb.0: ; %entry ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1569,6 +1614,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1674,8 +1722,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1683,6 +1733,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 @@ -1800,6 +1851,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1941,8 +1995,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1957,6 +2013,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -2134,6 +2191,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2370,6 +2430,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -2731,8 +2794,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 -; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -2766,6 +2831,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) @@ -3122,6 +3188,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -3589,12 +3658,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; ; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: ; GCN-GFX900-HSA: ; %bb.0: -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 -; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 -; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 +; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17 +; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0 ; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 ; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 @@ -3620,11 +3689,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill -; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill +; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 @@ -3667,11 +3736,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 -; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload -; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload +; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 @@ -3913,6 +3982,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -4437,6 +4509,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa ; GCNX3-HSA-LABEL: global_load_v32i32: ; GCNX3-HSA: ; %bb.0: ; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17 +; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll index 4dfc773d615e4..1a6fa3c518ca7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll @@ -13,7 +13,8 @@ ; GCN: s_cselect_b32 ; GCN-NOT: load_dword -; GCN: flat_load_dwordx2 +; GCN: flat_load_dword +; GCN: flat_load_dword ; GCN-NOT: load_dword ; GCN: flat_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 245a2775d9f2f..07b5e1610cfc0 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff -; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]] +; GCN: s_mul_i32 [[MUL:s[0-9]+]], s14, [[WGSIZEX]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 { %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index b3b529d4e5e5b..4896e504cfdf4 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: s_mul_i32 s12, s12, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s12 +; GFX9-NEXT: s_mul_i32 s14, s14, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s14 ; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] @@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff -; GFX10-NEXT: s_mul_i32 s12, s12, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0 +; GFX10-NEXT: s_mul_i32 s14, s14, s4 +; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 88ee2a34dd49f..8d020b9e1a603 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK-LABEL: memcpy_p0_p0_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -94,12 +96,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { ; CHECK-LABEL: memcpy_p5_p4_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -107,50 +109,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -160,55 +162,57 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { ; CHECK-LABEL: memcpy_p0_p5_minsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s15 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 ; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 @@ -268,6 +272,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 ; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v21, s1 ; CHECK-NEXT: v_mov_b32_e32 v20, s0 @@ -294,6 +300,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK-LABEL: memcpy_p0_p0_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v12, s3 ; CHECK-NEXT: v_mov_b32_e32 v11, s2 @@ -379,12 +387,12 @@ entry: define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { ; CHECK-LABEL: memcpy_p5_p4_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v24, 0 -; CHECK-NEXT: s_add_u32 s16, s16, s15 +; CHECK-NEXT: s_add_u32 s20, s20, s17 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 @@ -392,50 +400,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 ; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 ; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: v_mov_b32_e32 v25, s2 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112 ; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96 ; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 -; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -445,55 +453,57 @@ entry: define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { ; CHECK-LABEL: memcpy_p0_p5_optsize: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3] -; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] +; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] +; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 -; CHECK-NEXT: s_add_u32 s16, s16, s15 -; CHECK-NEXT: s_addc_u32 s17, s17, 0 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s20, s20, s17 +; CHECK-NEXT: s_addc_u32 s21, s21, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v26, s0 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 -; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 -; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 -; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 -; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 -; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 -; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 -; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 -; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 -; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 -; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 -; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 -; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v25, s1 ; CHECK-NEXT: v_mov_b32_e32 v24, s0 ; CHECK-NEXT: s_waitcnt vmcnt(20) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 -; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 -; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 -; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 +; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 -; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12 ; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 ; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 @@ -553,6 +563,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 ; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 ; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v21, s1 ; CHECK-NEXT: v_mov_b32_e32 v20, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 5af37809443e0..07ad8cb0c4a3d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX7-LABEL: flat_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_load( ; GFX7-LABEL: flat_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -423,6 +466,8 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -531,6 +576,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX7-LABEL: flat_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -547,6 +595,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -565,6 +617,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -598,6 +654,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -612,6 +670,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -739,6 +799,9 @@ entry: define amdgpu_kernel void @flat_agent_unordered_store( ; GFX7-LABEL: flat_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -750,6 +813,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -761,6 +828,10 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -783,6 +854,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -793,6 +866,8 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -873,6 +948,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX7-LABEL: flat_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -884,6 +962,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -895,6 +977,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -917,6 +1003,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -927,6 +1015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1007,6 +1097,9 @@ entry: define amdgpu_kernel void @flat_agent_release_store( ; GFX7-LABEL: flat_agent_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1019,6 +1112,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-WGP-LABEL: flat_agent_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1032,6 +1129,10 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX10-CU-LABEL: flat_agent_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1057,6 +1158,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1068,6 +1171,8 @@ define amdgpu_kernel void @flat_agent_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1165,6 +1270,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX7-LABEL: flat_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1177,6 +1285,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1190,6 +1302,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1215,6 +1331,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1226,6 +1344,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1323,6 +1443,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1334,6 +1457,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1345,6 +1472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1367,6 +1498,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1457,6 +1592,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,6 +1608,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1485,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1512,6 +1658,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,6 +1672,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1622,6 +1772,9 @@ entry: define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX7-LABEL: flat_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1634,6 +1787,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1647,6 +1804,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1672,6 +1833,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1683,6 +1846,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1780,6 +1945,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1794,6 +1962,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1811,6 +1983,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1841,6 +2017,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1854,6 +2032,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1969,6 +2149,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1983,6 +2166,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2000,6 +2187,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2030,6 +2221,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2043,6 +2236,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2158,6 +2353,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2174,6 +2372,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2191,6 +2393,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2223,6 +2429,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2237,6 +2445,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2352,6 +2562,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2369,6 +2582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2388,6 +2605,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2423,6 +2644,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2438,6 +2661,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2574,6 +2799,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2591,6 +2819,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2610,6 +2842,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2645,6 +2881,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2660,6 +2898,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2796,6 +3036,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2821,6 +3064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2846,6 +3093,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2896,6 +3147,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2910,6 +3163,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3019,6 +3274,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3046,6 +3304,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3075,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3130,6 +3396,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3146,6 +3414,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3273,6 +3543,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3299,6 +3572,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3326,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3379,6 +3660,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3394,6 +3677,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3520,6 +3805,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3548,6 +3836,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3579,6 +3871,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3637,6 +3933,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3654,6 +3952,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3798,6 +4098,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3826,6 +4129,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3857,6 +4164,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3915,6 +4226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3932,6 +4245,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4076,6 +4391,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4103,6 +4421,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4132,6 +4454,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4187,6 +4513,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4203,6 +4531,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4330,6 +4660,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4357,6 +4690,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4386,6 +4723,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4441,6 +4782,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4457,6 +4800,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4584,6 +4929,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4612,6 +4960,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4643,6 +4995,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4701,6 +5057,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4718,6 +5076,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4862,6 +5222,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4890,6 +5253,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4921,6 +5288,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5350,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4996,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5140,6 +5515,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5168,6 +5546,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5199,6 +5581,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5257,6 +5643,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5274,6 +5662,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5418,6 +5808,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5446,6 +5839,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5477,6 +5874,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5535,6 +5936,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5552,6 +5955,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5696,6 +6101,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5724,6 +6132,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5755,6 +6167,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5813,6 +6229,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5830,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5974,6 +6394,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6002,6 +6425,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6033,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6091,6 +6522,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6108,6 +6541,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6252,6 +6687,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6280,6 +6718,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6311,6 +6753,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6369,6 +6815,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6386,6 +6834,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6530,6 +6980,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6558,6 +7011,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6589,6 +7046,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6647,6 +7108,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6664,6 +7127,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6808,6 +7273,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6837,6 +7305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6866,6 +7338,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6924,6 +7400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6941,6 +7419,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7077,6 +7557,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7107,6 +7590,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7138,6 +7625,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7198,6 +7689,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7216,6 +7709,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7361,6 +7856,9 @@ entry: define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7391,6 +7889,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7422,6 +7924,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7483,6 +7989,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7501,6 +8009,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7654,6 +8164,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7685,6 +8198,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7718,6 +8235,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7781,6 +8302,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7800,6 +8323,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7966,6 +8491,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7997,6 +8525,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8030,6 +8562,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8093,6 +8629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8112,6 +8650,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8278,6 +8818,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8308,6 +8851,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8339,6 +8886,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8399,6 +8950,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8417,6 +8970,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8566,6 +9121,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8596,6 +9154,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8627,6 +9189,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8687,6 +9253,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8705,6 +9273,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8850,6 +9420,9 @@ entry: define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8881,6 +9454,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8914,6 +9491,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8977,6 +9558,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8996,6 +9579,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9162,6 +9747,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9193,6 +9781,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9226,6 +9818,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9289,6 +9885,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9308,6 +9906,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9474,6 +10074,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9505,6 +10108,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9538,6 +10145,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9601,6 +10212,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9620,6 +10233,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9786,6 +10401,9 @@ entry: define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9817,6 +10435,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9850,6 +10472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9913,6 +10539,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9932,6 +10560,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10098,6 +10728,9 @@ entry: define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10129,6 +10762,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10162,6 +10799,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10225,6 +10866,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10244,6 +10887,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10406,6 +11051,9 @@ entry: define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10437,6 +11085,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10470,6 +11122,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10533,6 +11189,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10552,6 +11210,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10718,6 +11378,9 @@ entry: define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10749,6 +11412,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10782,6 +11449,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10845,6 +11516,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10864,6 +11537,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11030,6 +11705,9 @@ entry: define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11061,6 +11739,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11094,6 +11776,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11157,6 +11843,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11176,6 +11864,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11342,6 +12032,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX7-LABEL: flat_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11356,6 +12049,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +12067,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11398,6 +12099,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11410,6 +12113,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11509,6 +12214,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX7-LABEL: flat_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11523,6 +12231,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11537,6 +12249,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11565,6 +12281,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11577,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11676,6 +12396,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX7-LABEL: flat_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11692,6 +12415,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11709,6 +12436,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11741,6 +12472,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11755,6 +12488,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11868,6 +12603,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX7-LABEL: flat_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11885,6 +12623,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11904,6 +12646,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11939,6 +12685,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11954,6 +12702,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12086,6 +12836,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX7-LABEL: flat_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12097,6 +12850,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12108,6 +12865,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12130,6 +12891,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12140,6 +12903,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12220,6 +12985,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX7-LABEL: flat_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12231,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12242,6 +13014,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12264,6 +13040,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12274,6 +13052,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12354,6 +13134,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX7-LABEL: flat_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12366,6 +13149,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12379,6 +13166,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12404,6 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12415,6 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12512,6 +13307,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX7-LABEL: flat_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12524,6 +13322,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12537,6 +13339,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12562,6 +13368,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12573,6 +13381,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12670,6 +13480,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12681,6 +13494,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12692,6 +13509,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12714,6 +13535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12724,6 +13547,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12804,6 +13629,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12817,6 +13645,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12831,6 +13663,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12857,6 +13693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12869,6 +13707,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12965,6 +13805,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12977,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12990,6 +13837,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13015,6 +13866,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13026,6 +13879,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13123,6 +13978,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13137,6 +13995,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13153,6 +14015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13182,6 +14048,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13195,6 +14063,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13308,6 +14178,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13322,6 +14195,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13338,6 +14215,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13367,6 +14248,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13380,6 +14263,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13493,6 +14378,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13510,6 +14398,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13528,6 +14420,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13562,6 +14458,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13577,6 +14475,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13697,6 +14597,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13715,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,6 +14642,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13772,6 +14683,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13788,6 +14701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13929,6 +14844,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13947,6 +14865,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13967,6 +14889,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14004,6 +14930,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14020,6 +14948,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14161,6 +15091,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14186,6 +15119,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14211,6 +15148,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14261,6 +15202,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14275,6 +15218,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14384,6 +15329,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14411,6 +15359,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14439,6 +15391,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14493,6 +15449,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14509,6 +15467,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14634,6 +15594,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14660,6 +15623,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14687,6 +15654,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14740,6 +15711,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14755,6 +15728,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14881,6 +15856,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14909,6 +15887,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14939,6 +15921,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14996,6 +15982,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15013,6 +16001,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15155,6 +16145,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15183,6 +16176,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15213,6 +16210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15270,6 +16271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15287,6 +16290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15429,6 +16434,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15456,6 +16464,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15484,6 +16496,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15538,6 +16554,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15554,6 +16572,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15679,6 +16699,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15706,6 +16729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15734,6 +16761,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15788,6 +16819,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15804,6 +16837,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15929,6 +16964,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15957,6 +16995,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15987,6 +17029,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16044,6 +17090,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16061,6 +17109,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16203,6 +17253,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16231,6 +17284,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16261,6 +17318,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16318,6 +17379,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16335,6 +17398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16477,6 +17542,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16505,6 +17573,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16535,6 +17607,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16592,6 +17668,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16609,6 +17687,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16751,6 +17831,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16779,6 +17862,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16809,6 +17896,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16866,6 +17957,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16883,6 +17976,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17025,6 +18120,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17053,6 +18151,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17083,6 +18185,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17140,6 +18246,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17157,6 +18265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17299,6 +18409,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17327,6 +18440,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17357,6 +18474,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,6 +18535,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17431,6 +18554,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17573,6 +18698,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17601,6 +18729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17631,6 +18763,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17688,6 +18824,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17705,6 +18843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +18987,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17875,6 +19018,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17905,6 +19052,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17962,6 +19113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17979,6 +19132,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18121,6 +19276,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18150,6 +19308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18179,6 +19341,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18237,6 +19403,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18254,6 +19422,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18390,6 +19560,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18421,6 +19594,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18453,6 +19630,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18515,6 +19696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18534,6 +19717,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18684,6 +19869,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18714,6 +19902,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18745,6 +19937,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18806,6 +20002,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18824,6 +20022,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18977,6 +20177,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19009,6 +20212,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19043,6 +20250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19108,6 +20319,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19128,6 +20341,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19299,6 +20514,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19331,6 +20549,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19365,6 +20587,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19430,6 +20656,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19450,6 +20678,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19621,6 +20851,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19652,6 +20885,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19684,6 +20921,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19746,6 +20987,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19765,6 +21008,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19919,6 +21164,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19950,6 +21198,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19982,6 +21234,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20044,6 +21300,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20063,6 +21321,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20213,6 +21473,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20245,6 +21508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20279,6 +21546,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20344,6 +21615,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20364,6 +21637,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20535,6 +21810,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20567,6 +21845,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20601,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20666,6 +21952,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20686,6 +21974,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20857,6 +22147,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20889,6 +22182,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20923,6 +22220,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20988,6 +22289,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21008,6 +22311,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21179,6 +22484,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21211,6 +22519,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21245,6 +22557,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21310,6 +22626,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21330,6 +22648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21501,6 +22821,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21533,6 +22856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21567,6 +22894,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21632,6 +22963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21652,6 +22985,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21819,6 +23154,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21851,6 +23189,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21885,6 +23227,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21950,6 +23296,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21970,6 +23318,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22141,6 +23491,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22173,6 +23526,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22207,6 +23564,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22272,6 +23633,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22292,6 +23655,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22463,6 +23828,9 @@ entry: define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22495,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22529,6 +23901,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22594,6 +23970,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22614,6 +23992,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 30c0a322d7ddc..3c24c36ec547d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -240,6 +262,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -298,6 +324,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_nop 0 @@ -329,6 +357,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_nop 0 @@ -537,6 +567,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -551,6 +584,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -565,6 +602,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -593,6 +634,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -605,6 +648,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -704,6 +749,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -732,6 +780,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -759,6 +811,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -814,6 +870,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -843,6 +901,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1047,6 +1107,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX7-LABEL: flat_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,6 +1125,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1077,6 +1144,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: flat_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1178,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1120,6 +1193,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b80dfaea01653..b88a10ab24a98 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX7-LABEL: flat_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX7-LABEL: flat_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -516,6 +561,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX7-LABEL: flat_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -683,6 +743,9 @@ entry: define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX7-LABEL: flat_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -817,6 +892,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX7-LABEL: flat_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 +1041,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_store( ; GFX7-LABEL: flat_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,6 +1190,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX7-LABEL: flat_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1219,6 +1339,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,6 +1488,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1487,6 +1637,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1621,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1755,6 +1935,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2068,6 +2278,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2247,6 +2472,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,6 +2666,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2649,6 +2904,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2872,6 +3142,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3095,6 +3380,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3318,6 +3618,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3541,6 +3856,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3764,6 +4094,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3987,6 +4332,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4210,6 +4570,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4433,6 +4808,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4656,6 +5046,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4879,6 +5284,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5102,6 +5522,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5325,6 +5760,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5548,6 +5998,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5771,6 +6236,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,6 +6520,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6309,6 +6804,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6578,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6847,6 +7372,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,6 +7656,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7385,6 +7940,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7654,6 +8224,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,6 +8508,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8192,6 +8792,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8461,6 +9076,9 @@ entry: define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8730,6 +9360,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8999,6 +9644,9 @@ entry: define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9268,6 +9928,9 @@ entry: define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9537,6 +10212,9 @@ entry: define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9806,6 +10496,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX7-LABEL: flat_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +10678,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10140,6 +10860,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX7-LABEL: flat_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10307,6 +11042,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10474,6 +11224,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX7-LABEL: flat_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10608,6 +11373,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10742,6 +11522,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX7-LABEL: flat_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10876,6 +11671,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11010,6 +11820,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11144,6 +11969,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11278,6 +12118,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12267,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11546,6 +12416,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12565,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11859,6 +12759,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,6 +12953,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12217,6 +13147,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12440,6 +13385,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12663,6 +13623,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12886,6 +13861,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13109,6 +14099,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13332,6 +14337,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13555,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13778,6 +14813,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14001,6 +15051,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14224,6 +15289,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14447,6 +15527,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14670,6 +15765,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14893,6 +16003,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15116,6 +16241,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,6 +16479,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15562,6 +16717,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15831,6 +17001,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16100,6 +17285,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16369,6 +17569,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16638,6 +17853,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16907,6 +18137,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17176,6 +18421,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17445,6 +18705,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17714,6 +18989,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17983,6 +19273,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18252,6 +19557,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18521,6 +19841,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18790,6 +20125,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19059,6 +20409,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19328,6 +20693,9 @@ entry: define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19357,6 +20725,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19386,6 +20758,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19444,6 +20820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19461,6 +20839,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 1ec942ea5f47b..919fc3e8f4e4f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX10-CU-LABEL: flat_system_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_system_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_load( ; GFX7-LABEL: flat_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_load( ; GFX7-LABEL: flat_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX10-CU-LABEL: flat_system_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -424,6 +467,8 @@ define amdgpu_kernel void @flat_system_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -533,6 +578,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX7-LABEL: flat_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -549,6 +597,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -567,6 +619,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -600,6 +656,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -615,6 +673,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -743,6 +803,9 @@ entry: define amdgpu_kernel void @flat_system_unordered_store( ; GFX7-LABEL: flat_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -754,6 +817,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -765,6 +832,10 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX10-CU-LABEL: flat_system_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -787,6 +858,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -797,6 +870,8 @@ define amdgpu_kernel void @flat_system_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -877,6 +952,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_store( ; GFX7-LABEL: flat_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -888,6 +966,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -899,6 +981,10 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -921,6 +1007,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -931,6 +1019,8 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1011,6 +1101,9 @@ entry: define amdgpu_kernel void @flat_system_release_store( ; GFX7-LABEL: flat_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1023,6 +1116,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-WGP-LABEL: flat_system_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1036,6 +1133,10 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX10-CU-LABEL: flat_system_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1061,6 +1162,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1073,6 +1176,8 @@ define amdgpu_kernel void @flat_system_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1173,6 +1278,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX7-LABEL: flat_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1185,6 +1293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1223,6 +1339,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1235,6 +1353,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1335,6 +1455,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1346,6 +1469,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1357,6 +1484,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1379,6 +1510,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1389,6 +1522,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1469,6 +1604,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX7-LABEL: flat_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1482,6 +1620,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1497,6 +1639,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1524,6 +1670,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1537,6 +1685,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1636,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX7-LABEL: flat_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1648,6 +1801,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1661,6 +1818,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1686,6 +1847,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1698,6 +1861,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1798,6 +1963,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1812,6 +1980,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1829,6 +2001,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1859,6 +2035,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1874,6 +2052,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1993,6 +2173,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2007,6 +2190,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2024,6 +2211,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2054,6 +2245,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2069,6 +2262,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2188,6 +2383,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2204,6 +2402,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2221,6 +2423,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2253,6 +2459,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2268,6 +2476,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2384,6 +2594,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2401,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2420,6 +2637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2455,6 +2676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2472,6 +2695,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2612,6 +2837,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2629,6 +2857,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2648,6 +2880,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2683,6 +2919,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2700,6 +2938,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2840,6 +3080,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2865,6 +3108,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2890,6 +3137,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2940,6 +3191,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2954,6 +3207,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3063,6 +3318,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3090,6 +3348,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3119,6 +3381,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3174,6 +3440,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3191,6 +3459,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3319,6 +3589,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3345,6 +3618,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3372,6 +3649,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3425,6 +3706,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3441,6 +3724,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3570,6 +3855,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3598,6 +3886,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3629,6 +3921,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3687,6 +3983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3706,6 +4004,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3854,6 +4154,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3882,6 +4185,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3913,6 +4220,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3971,6 +4282,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3990,6 +4303,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4138,6 +4453,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4165,6 +4483,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4194,6 +4516,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4249,6 +4575,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4266,6 +4594,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4394,6 +4724,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4421,6 +4754,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4450,6 +4787,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4505,6 +4846,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4522,6 +4865,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4650,6 +4995,9 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4678,6 +5026,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4709,6 +5061,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4786,6 +5144,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4934,6 +5294,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4962,6 +5325,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4993,6 +5360,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5051,6 +5422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5070,6 +5443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5218,6 +5593,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5246,6 +5624,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5277,6 +5659,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5335,6 +5721,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5354,6 +5742,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5502,6 +5892,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5530,6 +5923,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5619,6 +6020,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5638,6 +6041,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5786,6 +6191,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5814,6 +6222,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5845,6 +6257,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5903,6 +6319,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5922,6 +6340,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6070,6 +6490,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6098,6 +6521,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6129,6 +6556,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6187,6 +6618,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6206,6 +6639,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6354,6 +6789,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6382,6 +6820,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6413,6 +6855,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6471,6 +6917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6490,6 +6938,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6638,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6666,6 +7119,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6697,6 +7154,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -6755,6 +7216,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6774,6 +7237,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6922,6 +7387,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6951,6 +7419,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6980,6 +7452,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7038,6 +7514,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7055,6 +7533,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7191,6 +7671,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7221,6 +7704,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7252,6 +7739,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7312,6 +7803,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7331,6 +7824,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7477,6 +7972,9 @@ entry: define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7507,6 +8005,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7538,6 +8040,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7599,6 +8105,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7618,6 +8126,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7774,6 +8284,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7805,6 +8318,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7838,6 +8355,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7901,6 +8422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7922,6 +8445,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8092,6 +8617,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8123,6 +8651,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8156,6 +8688,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8219,6 +8755,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8240,6 +8778,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8410,6 +8950,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8440,6 +8983,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8471,6 +9018,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8531,6 +9082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8550,6 +9103,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8700,6 +9255,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8730,6 +9288,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8761,6 +9323,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8821,6 +9387,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8840,6 +9408,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8986,6 +9556,9 @@ entry: define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9017,6 +9590,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9050,6 +9627,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9113,6 +9694,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9134,6 +9717,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9304,6 +9889,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9335,6 +9923,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9368,6 +9960,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9431,6 +10027,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9452,6 +10050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9622,6 +10222,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9653,6 +10256,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9686,6 +10293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9749,6 +10360,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9770,6 +10383,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9940,6 +10555,9 @@ entry: define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9971,6 +10589,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10004,6 +10626,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10067,6 +10693,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10088,6 +10716,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10258,6 +10888,9 @@ entry: define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10289,6 +10922,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10322,6 +10959,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10385,6 +11026,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10406,6 +11049,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10572,6 +11217,9 @@ entry: define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10603,6 +11251,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10636,6 +11288,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10699,6 +11355,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10720,6 +11378,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -10890,6 +11550,9 @@ entry: define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10921,6 +11584,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -10954,6 +11621,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11017,6 +11688,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11038,6 +11711,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11208,6 +11883,9 @@ entry: define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11239,6 +11917,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11272,6 +11954,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -11335,6 +12021,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11356,6 +12044,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -11526,6 +12216,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX7-LABEL: flat_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11540,6 +12233,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11554,6 +12251,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11582,6 +12283,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11594,6 +12297,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11693,6 +12398,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX7-LABEL: flat_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11707,6 +12415,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11721,6 +12433,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11749,6 +12465,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11761,6 +12479,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11860,6 +12580,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX7-LABEL: flat_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11876,6 +12599,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11893,6 +12620,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11925,6 +12656,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11940,6 +12673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12054,6 +12789,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX7-LABEL: flat_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12071,6 +12809,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12090,6 +12832,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12125,6 +12871,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12141,6 +12889,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12274,6 +13024,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX7-LABEL: flat_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12285,6 +13038,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12296,6 +13053,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_system_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12318,6 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12328,6 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12408,6 +13173,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX7-LABEL: flat_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12419,6 +13187,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12430,6 +13202,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12452,6 +13228,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12462,6 +13240,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12542,6 +13322,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX7-LABEL: flat_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12554,6 +13337,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12567,6 +13354,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX10-CU-LABEL: flat_system_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12592,6 +13383,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12604,6 +13397,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12704,6 +13499,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX7-LABEL: flat_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12716,6 +13514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12729,6 +13531,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12754,6 +13560,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12766,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12866,6 +13676,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12877,6 +13690,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12888,6 +13705,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12910,6 +13731,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12920,6 +13743,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13000,6 +13825,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13013,6 +13841,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13027,6 +13859,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13053,6 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13066,6 +13904,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13163,6 +14003,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX7-LABEL: flat_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13175,6 +14018,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13188,6 +14035,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13213,6 +14064,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13225,6 +14078,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13325,6 +14180,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13339,6 +14197,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13355,6 +14217,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13384,6 +14250,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13399,6 +14267,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13516,6 +14386,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13530,6 +14403,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13546,6 +14423,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13575,6 +14456,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13590,6 +14473,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13707,6 +14592,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13724,6 +14612,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13742,6 +14634,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13776,6 +14672,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13792,6 +14690,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -13913,6 +14813,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13931,6 +14834,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -13951,6 +14858,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -13988,6 +14899,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14006,6 +14919,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14151,6 +15066,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14169,6 +15087,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -14189,6 +15111,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -14226,6 +15152,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14244,6 +15172,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -14389,6 +15319,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14414,6 +15347,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14439,6 +15376,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14489,6 +15430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14503,6 +15446,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14612,6 +15557,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14639,6 +15587,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14667,6 +15619,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14721,6 +15677,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14738,6 +15696,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14864,6 +15824,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14890,6 +15853,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14917,6 +15884,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14970,6 +15941,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14986,6 +15959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15115,6 +16090,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15143,6 +16121,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15173,6 +16155,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15230,6 +16216,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15249,6 +16237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15395,6 +16385,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15423,6 +16416,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15453,6 +16450,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15510,6 +16511,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15529,6 +16532,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15675,6 +16680,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15702,6 +16710,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15730,6 +16742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15784,6 +16800,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15801,6 +16819,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15927,6 +16947,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15954,6 +16977,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15982,6 +17009,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16036,6 +17067,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16053,6 +17086,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16179,6 +17214,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16207,6 +17245,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16237,6 +17279,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16294,6 +17340,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16313,6 +17361,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16459,6 +17509,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16487,6 +17540,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16517,6 +17574,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16574,6 +17635,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16593,6 +17656,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16739,6 +17804,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16767,6 +17835,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16797,6 +17869,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -16854,6 +17930,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16873,6 +17951,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17019,6 +18099,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17047,6 +18130,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17077,6 +18164,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17134,6 +18225,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17153,6 +18246,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17299,6 +18394,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17327,6 +18425,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17357,6 +18459,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17414,6 +18520,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17433,6 +18541,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17579,6 +18689,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17607,6 +18720,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17637,6 +18754,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17694,6 +18815,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17713,6 +18836,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17859,6 +18984,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17887,6 +19015,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17917,6 +19049,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -17974,6 +19110,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17993,6 +19131,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18139,6 +19279,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18167,6 +19310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18197,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -18254,6 +19405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18273,6 +19426,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18419,6 +19574,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18448,6 +19606,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18477,6 +19639,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18535,6 +19701,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18552,6 +19720,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18688,6 +19858,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18719,6 +19892,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18751,6 +19928,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18813,6 +19994,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18833,6 +20016,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18984,6 +20169,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19014,6 +20202,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19045,6 +20237,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19106,6 +20302,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19125,6 +20323,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19281,6 +20481,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19313,6 +20516,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19347,6 +20554,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19412,6 +20623,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19434,6 +20647,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19609,6 +20824,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19641,6 +20859,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19675,6 +20897,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19740,6 +20966,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19762,6 +20990,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19937,6 +21167,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19968,6 +21201,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20000,6 +21237,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20062,6 +21303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20082,6 +21325,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20237,6 +21482,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20268,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20300,6 +21552,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20362,6 +21618,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20382,6 +21640,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20533,6 +21793,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20565,6 +21828,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20599,6 +21866,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20664,6 +21935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20686,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20861,6 +22136,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20893,6 +22171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20927,6 +22209,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20992,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21014,6 +22302,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21189,6 +22479,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21221,6 +22514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21255,6 +22552,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21320,6 +22621,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21342,6 +22645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21517,6 +22822,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21549,6 +22857,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21583,6 +22895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21648,6 +22964,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21670,6 +22988,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21845,6 +23165,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21877,6 +23200,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21911,6 +23238,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -21976,6 +23307,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -21998,6 +23331,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22169,6 +23504,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22201,6 +23539,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22235,6 +23577,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22300,6 +23646,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22322,6 +23670,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22497,6 +23847,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22529,6 +23882,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22563,6 +23920,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22628,6 +23989,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22650,6 +24013,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22825,6 +24190,9 @@ entry: define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -22857,6 +24225,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22891,6 +24263,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -22956,6 +24332,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -22978,6 +24356,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index e1f82a70b4c0a..a88e0e217fdb4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -11,6 +11,9 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -26,6 +29,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -41,6 +48,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; ; GFX10-CU-LABEL: flat_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -142,6 +153,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX7-LABEL: flat_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -172,6 +186,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_nop 0 @@ -202,6 +220,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; ; GFX10-CU-LABEL: flat_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_nop 0 @@ -405,6 +427,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX7-LABEL: flat_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -420,6 +445,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -435,6 +464,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; ; GFX10-CU-LABEL: flat_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -540,6 +573,9 @@ entry: define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX7-LABEL: flat_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -569,6 +605,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-WGP-LABEL: flat_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -597,6 +637,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; ; GFX10-CU-LABEL: flat_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -799,6 +843,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX7-LABEL: flat_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -814,6 +861,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -829,6 +880,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -926,6 +981,9 @@ entry: define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX7-LABEL: flat_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -938,6 +996,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 +1013,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 588f06f1be054..7c637a20ab47b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX7-LABEL: flat_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX7-LABEL: flat_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -516,6 +561,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX7-LABEL: flat_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -683,6 +743,9 @@ entry: define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX7-LABEL: flat_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -817,6 +892,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX7-LABEL: flat_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -951,6 +1041,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_store( ; GFX7-LABEL: flat_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1085,6 +1190,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX7-LABEL: flat_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1219,6 +1339,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,6 +1488,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1487,6 +1637,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1621,6 +1786,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1755,6 +1935,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1889,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2068,6 +2278,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2247,6 +2472,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2426,6 +2666,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2649,6 +2904,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2872,6 +3142,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3095,6 +3380,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3318,6 +3618,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3541,6 +3856,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3764,6 +4094,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3987,6 +4332,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4210,6 +4570,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4433,6 +4808,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4656,6 +5046,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4879,6 +5284,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5102,6 +5522,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5325,6 +5760,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5548,6 +5998,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5771,6 +6236,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6040,6 +6520,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6309,6 +6804,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6578,6 +7088,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6847,6 +7372,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7116,6 +7656,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7385,6 +7940,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7654,6 +8224,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7923,6 +8508,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8192,6 +8792,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8461,6 +9076,9 @@ entry: define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8730,6 +9360,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8999,6 +9644,9 @@ entry: define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9268,6 +9928,9 @@ entry: define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9537,6 +10212,9 @@ entry: define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9806,6 +10496,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX7-LABEL: flat_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9973,6 +10678,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10140,6 +10860,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX7-LABEL: flat_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10307,6 +11042,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10474,6 +11224,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX7-LABEL: flat_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10608,6 +11373,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10742,6 +11522,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX7-LABEL: flat_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10876,6 +11671,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11010,6 +11820,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11144,6 +11969,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11278,6 +12118,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12267,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11546,6 +12416,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12565,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11859,6 +12759,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12038,6 +12953,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12217,6 +13147,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12440,6 +13385,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12663,6 +13623,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12886,6 +13861,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13109,6 +14099,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13332,6 +14337,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13555,6 +14575,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13778,6 +14813,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14001,6 +15051,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14224,6 +15289,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14447,6 +15527,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14670,6 +15765,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14893,6 +16003,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15116,6 +16241,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15339,6 +16479,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15562,6 +16717,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15831,6 +17001,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16100,6 +17285,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16369,6 +17569,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16638,6 +17853,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16907,6 +18137,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17176,6 +18421,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17445,6 +18705,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17714,6 +18989,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17983,6 +19273,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18252,6 +19557,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18521,6 +19841,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18790,6 +20125,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19059,6 +20409,9 @@ entry: define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index ee7d79a8a8cbb..0fd4aa4a7a93f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -15,6 +15,9 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -182,6 +197,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX7-LABEL: flat_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -349,6 +379,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX7-LABEL: flat_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -379,6 +416,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -409,6 +450,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -422,6 +465,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -528,6 +573,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX7-LABEL: flat_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -544,6 +592,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -561,6 +613,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -593,6 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -607,6 +665,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -726,6 +786,9 @@ entry: define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX7-LABEL: flat_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -737,6 +800,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -748,6 +815,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -770,6 +841,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -780,6 +853,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -860,6 +935,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX7-LABEL: flat_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -871,6 +949,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -882,6 +964,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -904,6 +990,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -914,6 +1002,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -994,6 +1084,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_store( ; GFX7-LABEL: flat_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1006,6 +1099,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1019,6 +1116,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1043,6 +1144,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1054,6 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1145,6 +1250,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX7-LABEL: flat_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1157,6 +1265,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1170,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1194,6 +1310,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1205,6 +1323,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1296,6 +1416,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1307,6 +1430,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1318,6 +1445,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1340,6 +1471,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1350,6 +1483,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1430,6 +1565,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1442,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1456,6 +1598,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1480,6 +1626,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1491,6 +1639,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1583,6 +1733,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,6 +1748,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1608,6 +1765,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1632,6 +1793,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1643,6 +1806,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1734,6 +1899,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1747,6 +1915,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1763,6 +1935,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1789,6 +1965,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,6 +1979,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1904,6 +2084,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1917,6 +2100,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1933,6 +2120,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1959,6 +2150,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -1971,6 +2164,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2074,6 +2269,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2138,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2152,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2265,6 +2475,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2282,6 +2495,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2300,6 +2517,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2334,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2349,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2475,6 +2700,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2492,6 +2720,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2510,6 +2742,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2544,6 +2780,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2559,6 +2797,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2685,6 +2925,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2710,6 +2953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2735,6 +2982,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2785,6 +3036,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2799,6 +3052,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -2908,6 +3163,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2934,6 +3192,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -2962,6 +3224,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3014,6 +3280,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3029,6 +3297,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3150,6 +3420,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3176,6 +3449,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3203,6 +3480,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3255,6 +3536,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3270,6 +3553,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3390,6 +3675,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3417,6 +3705,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3447,6 +3739,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3501,6 +3797,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3517,6 +3815,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3649,6 +3949,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3676,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3706,6 +4013,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3760,6 +4071,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3776,6 +4089,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -3908,6 +4223,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3934,6 +4252,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -3962,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4014,6 +4340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4029,6 +4357,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4150,6 +4480,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4176,6 +4509,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4204,6 +4541,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4256,6 +4597,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4271,6 +4614,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4392,6 +4737,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4419,6 +4767,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4449,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4503,6 +4859,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4519,6 +4877,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4651,6 +5011,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4678,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4708,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4762,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4778,6 +5151,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -4910,6 +5285,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4937,6 +5315,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -4967,6 +5349,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5021,6 +5407,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5037,6 +5425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5169,6 +5559,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5196,6 +5589,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5226,6 +5623,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -5280,6 +5681,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5296,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5428,6 +5833,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5457,6 +5865,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5486,6 +5898,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5544,6 +5960,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5561,6 +5979,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5697,6 +6117,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -5727,6 +6150,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5757,6 +6184,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -5817,6 +6248,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5835,6 +6268,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -5978,6 +6413,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6008,6 +6446,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6039,6 +6481,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6099,6 +6545,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6117,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6264,6 +6714,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6295,6 +6748,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6327,6 +6784,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6389,6 +6850,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6408,6 +6871,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6564,6 +7029,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6595,6 +7063,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6627,6 +7099,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6689,6 +7165,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6708,6 +7186,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -6864,6 +7344,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6894,6 +7377,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6924,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -6984,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7002,6 +7495,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7147,6 +7642,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7177,6 +7675,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7207,6 +7709,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7267,6 +7773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7285,6 +7793,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7428,6 +7938,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7459,6 +7972,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7491,6 +8008,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7553,6 +8074,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7572,6 +8095,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7728,6 +8253,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7759,6 +8287,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7791,6 +8323,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -7853,6 +8389,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -7872,6 +8410,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8028,6 +8568,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8059,6 +8602,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8091,6 +8638,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8153,6 +8704,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8172,6 +8725,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8328,6 +8883,9 @@ entry: define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8359,6 +8917,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8391,6 +8953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8453,6 +9019,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8472,6 +9040,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8628,6 +9198,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8659,6 +9232,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8691,6 +9268,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8753,6 +9334,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8772,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -8926,6 +9511,9 @@ entry: define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8957,6 +9545,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -8989,6 +9581,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9051,6 +9647,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9070,6 +9668,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9226,6 +9826,9 @@ entry: define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9257,6 +9860,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9289,6 +9896,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9351,6 +9962,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9370,6 +9983,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9526,6 +10141,9 @@ entry: define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9557,6 +10175,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9589,6 +10211,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -9651,6 +10277,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9670,6 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -9826,6 +10456,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX7-LABEL: flat_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9840,6 +10473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9854,6 +10491,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9882,6 +10523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9894,6 +10537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -9993,6 +10638,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10007,6 +10655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10021,6 +10673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10049,6 +10705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10061,6 +10719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10160,6 +10820,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX7-LABEL: flat_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10174,6 +10837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,6 +10857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10218,6 +10889,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10230,6 +10903,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,6 +11010,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +11027,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10367,6 +11049,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10395,6 +11081,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10407,6 +11095,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10522,6 +11212,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX7-LABEL: flat_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10533,6 +11226,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10544,6 +11241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10566,6 +11267,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10576,6 +11279,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10656,6 +11361,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10667,6 +11375,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10678,6 +11390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10700,6 +11416,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10710,6 +11428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10790,6 +11510,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX7-LABEL: flat_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10801,6 +11524,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10814,6 +11541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10836,6 +11567,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10846,6 +11579,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10934,6 +11669,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10945,6 +11683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10958,6 +11700,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10980,6 +11726,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -10990,6 +11738,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11078,6 +11828,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11089,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11100,6 +11857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11122,6 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11132,6 +11895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11212,6 +11977,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11223,6 +11991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11236,6 +12008,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11258,6 +12034,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11268,6 +12046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11356,6 +12136,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11367,6 +12150,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11380,6 +12167,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11402,6 +12193,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11412,6 +12205,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11500,6 +12295,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11511,6 +12309,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11526,6 +12328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11548,6 +12354,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11558,6 +12366,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11654,6 +12464,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11665,6 +12478,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11680,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11702,6 +12523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11712,6 +12535,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11808,6 +12633,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11823,6 +12651,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11840,6 +12672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11870,6 +12706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11883,6 +12721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -11995,6 +12835,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12010,6 +12853,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12029,6 +12876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12059,6 +12910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12072,6 +12925,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12194,6 +13049,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12209,6 +13067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12228,6 +13090,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12258,6 +13124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12271,6 +13139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -12393,6 +13263,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12418,6 +13291,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12443,6 +13320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12493,6 +13374,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12507,6 +13390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12616,6 +13501,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12641,6 +13529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12668,6 +13560,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12718,6 +13614,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12732,6 +13630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12849,6 +13749,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12874,6 +13777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12901,6 +13808,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -12951,6 +13862,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -12965,6 +13878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13082,6 +13997,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13107,6 +14025,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13136,6 +14058,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13186,6 +14112,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13200,6 +14128,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13325,6 +14255,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13350,6 +14283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13379,6 +14316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13429,6 +14370,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13443,6 +14386,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13568,6 +14513,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13593,6 +14541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13620,6 +14572,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13670,6 +14626,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13684,6 +14642,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13801,6 +14761,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13826,6 +14789,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13853,6 +14820,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -13903,6 +14874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -13917,6 +14890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14034,6 +15009,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14059,6 +15037,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14088,6 +15070,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14138,6 +15124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14152,6 +15140,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14277,6 +15267,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14302,6 +15295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14331,6 +15328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14381,6 +15382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14395,6 +15398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14520,6 +15525,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14545,6 +15553,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14574,6 +15586,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14624,6 +15640,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14638,6 +15656,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14763,6 +15783,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14788,6 +15811,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14817,6 +15844,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -14867,6 +15898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -14881,6 +15914,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15006,6 +16041,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15031,6 +16069,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15060,6 +16102,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15110,6 +16156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15124,6 +16172,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15249,6 +16299,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15274,6 +16327,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15303,6 +16360,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15353,6 +16414,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15367,6 +16430,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15492,6 +16557,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15517,6 +16585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15546,6 +16618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15596,6 +16672,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15610,6 +16688,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15735,6 +16815,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15760,6 +16843,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15789,6 +16876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8 @@ -15839,6 +16930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15853,6 +16946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -15978,6 +17073,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16007,6 +17105,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16036,6 +17138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16094,6 +17200,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16111,6 +17219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16247,6 +17357,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16276,6 +17389,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16307,6 +17424,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16365,6 +17486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16382,6 +17505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16524,6 +17649,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16553,6 +17681,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16584,6 +17716,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16642,6 +17778,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16659,6 +17797,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16803,6 +17943,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16832,6 +17975,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16865,6 +18012,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -16923,6 +18074,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -16940,6 +18093,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17092,6 +18247,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17121,6 +18279,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17154,6 +18316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17212,6 +18378,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17229,6 +18397,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17381,6 +18551,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17410,6 +18583,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17441,6 +18618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17499,6 +18680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17516,6 +18699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17660,6 +18845,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17689,6 +18877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17720,6 +18912,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17778,6 +18974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17795,6 +18993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -17937,6 +19137,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17966,6 +19169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -17999,6 +19206,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18057,6 +19268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18074,6 +19287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18226,6 +19441,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18255,6 +19473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18288,6 +19510,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18346,6 +19572,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18363,6 +19591,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18515,6 +19745,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18544,6 +19777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18577,6 +19814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18635,6 +19876,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18652,6 +19895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18804,6 +20049,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18833,6 +20081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18866,6 +20118,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -18924,6 +20180,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -18941,6 +20199,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19093,6 +20353,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19122,6 +20385,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19155,6 +20422,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19213,6 +20484,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19230,6 +20503,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19380,6 +20655,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19409,6 +20687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19442,6 +20724,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19500,6 +20786,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19517,6 +20805,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19669,6 +20959,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19698,6 +20991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19731,6 +21028,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -19789,6 +21090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19806,6 +21109,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -19958,6 +21263,9 @@ entry: define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19987,6 +21295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17 +; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20020,6 +21332,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_add_u32 s12, s12, s17 +; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8 @@ -20078,6 +21394,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc @@ -20095,6 +21413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index b9487f8e14c2b..8b600c835a160 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_agent_unordered_load( ; ; GFX7-LABEL: global_agent_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; ; GFX7-LABEL: global_agent_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -404,6 +410,9 @@ define amdgpu_kernel void @global_agent_acquire_load( ; ; GFX7-LABEL: global_agent_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -602,6 +611,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; ; GFX7-LABEL: global_agent_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -809,6 +821,9 @@ define amdgpu_kernel void @global_agent_unordered_store( ; ; GFX7-LABEL: global_agent_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -962,6 +977,9 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; ; GFX7-LABEL: global_agent_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1116,6 +1134,9 @@ define amdgpu_kernel void @global_agent_release_store( ; ; GFX7-LABEL: global_agent_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1294,6 +1315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; ; GFX7-LABEL: global_agent_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,6 +1494,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1623,6 +1650,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1802,6 +1832,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; ; GFX7-LABEL: global_agent_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1980,6 +2013,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2185,6 +2221,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2390,6 +2429,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2587,6 +2629,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2812,6 +2857,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3256,6 +3307,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3500,6 +3554,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3743,6 +3800,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4013,6 +4073,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4282,6 +4345,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4527,6 +4593,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4773,6 +4842,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5043,6 +5115,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5313,6 +5388,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5583,6 +5661,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5853,6 +5934,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6123,6 +6207,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6393,6 +6480,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6663,6 +6753,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6933,6 +7026,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7182,6 +7278,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7447,6 +7546,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7721,6 +7823,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8015,6 +8120,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8308,6 +8416,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8577,6 +8688,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8843,6 +8957,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9137,6 +9254,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9431,6 +9551,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9725,6 +9848,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10019,6 +10145,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10309,6 +10438,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10603,6 +10735,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10897,6 +11032,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -11189,6 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; ; GFX7-LABEL: global_agent_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11370,6 +11511,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; ; GFX7-LABEL: global_agent_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11552,6 +11696,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; ; GFX7-LABEL: global_agent_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11750,6 +11897,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11957,6 +12107,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; ; GFX7-LABEL: global_agent_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12110,6 +12263,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; ; GFX7-LABEL: global_agent_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12264,6 +12420,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; ; GFX7-LABEL: global_agent_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12442,6 +12601,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12618,6 +12780,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12771,6 +12936,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12950,6 +13118,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13128,6 +13299,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13333,6 +13507,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13538,6 +13715,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13735,6 +13915,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13960,6 +14143,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -14186,6 +14372,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14404,6 +14593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14648,6 +14840,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14891,6 +15086,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15161,6 +15359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15430,6 +15631,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15675,6 +15879,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15921,6 +16128,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16191,6 +16401,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16461,6 +16674,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16731,6 +16947,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17001,6 +17220,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17271,6 +17493,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17541,6 +17766,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -17811,6 +18039,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -18081,6 +18312,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18330,6 +18564,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18596,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18890,6 +19130,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19183,6 +19426,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19452,6 +19698,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19718,6 +19967,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20012,6 +20264,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20306,6 +20561,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20600,6 +20858,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20894,6 +21155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21184,6 +21448,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21478,6 +21745,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21772,6 +22042,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index a6bd1b678f95e..16e55058e4fc8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -36,6 +36,9 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; ; GFX7-LABEL: global_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -227,6 +230,9 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; ; GFX7-LABEL: global_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -474,6 +480,9 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; ; GFX7-LABEL: global_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -658,6 +667,9 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; ; GFX7-LABEL: global_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -891,6 +903,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; ; GFX7-LABEL: global_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index a5de6a92db1af..8042d38716107 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; ; GFX7-LABEL: global_singlethread_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; ; GFX7-LABEL: global_singlethread_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; ; GFX7-LABEL: global_singlethread_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -758,6 +770,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; ; GFX7-LABEL: global_singlethread_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -911,6 +926,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; ; GFX7-LABEL: global_singlethread_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_singlethread_release_store( ; ; GFX7-LABEL: global_singlethread_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; ; GFX7-LABEL: global_singlethread_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; ; GFX7-LABEL: global_singlethread_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 69404247ccd6e..9c11781da56f2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_system_unordered_load( ; ; GFX7-LABEL: global_system_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_system_monotonic_load( ; ; GFX7-LABEL: global_system_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -404,6 +410,9 @@ define amdgpu_kernel void @global_system_acquire_load( ; ; GFX7-LABEL: global_system_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -604,6 +613,9 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; ; GFX7-LABEL: global_system_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -813,6 +825,9 @@ define amdgpu_kernel void @global_system_unordered_store( ; ; GFX7-LABEL: global_system_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -966,6 +981,9 @@ define amdgpu_kernel void @global_system_monotonic_store( ; ; GFX7-LABEL: global_system_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1120,6 +1138,9 @@ define amdgpu_kernel void @global_system_release_store( ; ; GFX7-LABEL: global_system_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,6 +1323,9 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; ; GFX7-LABEL: global_system_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1482,6 +1506,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1635,6 +1662,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1816,6 +1846,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; ; GFX7-LABEL: global_system_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1998,6 +2031,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2209,6 +2245,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2420,6 +2459,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2619,6 +2661,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2850,6 +2895,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3082,6 +3130,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3300,6 +3351,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3546,6 +3600,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3793,6 +3850,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4069,6 +4129,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4344,6 +4407,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4591,6 +4657,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4839,6 +4908,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5115,6 +5187,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5391,6 +5466,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5667,6 +5745,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5943,6 +6024,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6192,6 +6276,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6460,6 +6547,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6760,6 +6850,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7059,6 +7152,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7330,6 +7426,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7598,6 +7697,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7898,6 +8000,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8198,6 +8303,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8498,6 +8606,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8798,6 +8909,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9094,6 +9208,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9394,6 +9511,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9694,6 +9814,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9992,6 +10115,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; ; GFX7-LABEL: global_system_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10173,6 +10299,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; ; GFX7-LABEL: global_system_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10355,6 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; ; GFX7-LABEL: global_system_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10555,6 +10687,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; ; GFX7-LABEL: global_system_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10764,6 +10899,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; ; GFX7-LABEL: global_system_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10917,6 +11055,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; ; GFX7-LABEL: global_system_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11071,6 +11212,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; ; GFX7-LABEL: global_system_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11253,6 +11397,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; ; GFX7-LABEL: global_system_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11433,6 +11580,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11586,6 +11736,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11767,6 +11920,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11949,6 +12105,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12160,6 +12319,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12371,6 +12533,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12570,6 +12735,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12801,6 +12969,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13033,6 +13204,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13251,6 +13425,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13497,6 +13674,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13744,6 +13924,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14020,6 +14203,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14295,6 +14481,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14542,6 +14731,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14790,6 +14982,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15066,6 +15261,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15342,6 +15540,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15618,6 +15819,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15894,6 +16098,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16170,6 +16377,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16446,6 +16656,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16722,6 +16935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16998,6 +17214,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17247,6 +17466,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17514,6 +17736,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17792,6 +18017,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18092,6 +18320,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18391,6 +18622,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18662,6 +18896,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18930,6 +19167,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19230,6 +19470,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19530,6 +19773,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19830,6 +20076,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20130,6 +20379,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20426,6 +20678,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20726,6 +20981,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -21026,6 +21284,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 7dfd5e60c24f8..8a5c5dda9f79c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -37,6 +37,9 @@ define amdgpu_kernel void @global_volatile_load_0( ; ; GFX7-LABEL: global_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -184,6 +187,9 @@ define amdgpu_kernel void @global_volatile_load_1( ; ; GFX7-LABEL: global_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 @@ -372,6 +378,9 @@ define amdgpu_kernel void @global_volatile_store_0( ; ; GFX7-LABEL: global_volatile_store_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -527,6 +536,9 @@ define amdgpu_kernel void @global_volatile_store_1( ; ; GFX7-LABEL: global_volatile_store_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -718,6 +730,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; ; GFX7-LABEL: global_volatile_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -852,6 +867,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; ; GFX7-LABEL: global_volatile_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 4b6c99282dc13..151ba07a0b531 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; ; GFX7-LABEL: global_wavefront_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; ; GFX7-LABEL: global_wavefront_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; ; GFX7-LABEL: global_wavefront_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -584,6 +593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -758,6 +770,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; ; GFX7-LABEL: global_wavefront_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -911,6 +926,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; ; GFX7-LABEL: global_wavefront_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_wavefront_release_store( ; ; GFX7-LABEL: global_wavefront_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; ; GFX7-LABEL: global_wavefront_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; ; GFX7-LABEL: global_wavefront_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 46d65187cb1b2..69b0c7f93ab0e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -41,6 +41,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; ; GFX7-LABEL: global_workgroup_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -222,6 +225,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; ; GFX7-LABEL: global_workgroup_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -403,6 +409,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; ; GFX7-LABEL: global_workgroup_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -590,6 +599,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -780,6 +792,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; ; GFX7-LABEL: global_workgroup_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -933,6 +948,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; ; GFX7-LABEL: global_workgroup_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1087,6 +1105,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; ; GFX7-LABEL: global_workgroup_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1258,6 +1279,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1427,6 +1451,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1578,6 +1605,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1740,6 +1770,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1909,6 +1942,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2088,6 +2124,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2268,6 +2307,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2454,6 +2496,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2659,6 +2704,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2866,6 +2914,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3082,6 +3133,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3309,6 +3363,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3543,6 +3600,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -3787,6 +3847,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4030,6 +4093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4256,6 +4322,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4483,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4727,6 +4799,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -4971,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5215,6 +5293,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5459,6 +5540,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5703,6 +5787,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -5947,6 +6034,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6191,6 +6281,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -6437,6 +6530,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6685,6 +6781,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -6939,6 +7038,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7205,6 +7307,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7478,6 +7583,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -7750,6 +7858,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8005,6 +8116,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8259,6 +8373,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8532,6 +8649,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -8805,6 +8925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9078,6 +9201,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9351,6 +9477,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9622,6 +9751,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -9895,6 +10027,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10168,6 +10303,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -10440,6 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10621,6 +10762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10802,6 +10946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -10988,6 +11135,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11175,6 +11325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; ; GFX7-LABEL: global_workgroup_one_as_unordered_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11328,6 +11481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11481,6 +11637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; ; GFX7-LABEL: global_workgroup_one_as_release_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11644,6 +11803,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11806,6 +11968,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -11957,6 +12122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12118,6 +12286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12279,6 +12450,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12450,6 +12624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12623,6 +12800,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -12808,6 +12988,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13005,6 +13188,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -13205,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13421,6 +13610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13647,6 +13839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -13873,6 +14068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14109,6 +14307,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14345,6 +14546,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14571,6 +14775,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -14797,6 +15004,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15033,6 +15243,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15269,6 +15482,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15505,6 +15721,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15741,6 +15960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -15977,6 +16199,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16213,6 +16438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16449,6 +16677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2 @@ -16688,6 +16919,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -16936,6 +17170,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17189,6 +17426,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17447,6 +17687,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17712,6 +17955,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -17977,6 +18223,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18232,6 +18481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18485,6 +18737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -18750,6 +19005,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19015,6 +19273,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19280,6 +19541,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19545,6 +19809,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -19808,6 +20075,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20073,6 +20343,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 @@ -20338,6 +20611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9] ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 04b0f00fe77b5..78209ee34cad4 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -38,6 +38,9 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; ; GFX7-LABEL: local_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -224,6 +227,9 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; ; GFX7-LABEL: local_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 @@ -830,6 +836,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; ; GFX7-LABEL: local_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 9e5f5fcffca9f..bc2508411ed6b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -34,6 +34,9 @@ define amdgpu_kernel void @local_volatile_load_0( ; ; GFX7-LABEL: local_volatile_load_0: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 m0, -1 @@ -172,6 +175,9 @@ define amdgpu_kernel void @local_volatile_load_1( ; ; GFX7-LABEL: local_volatile_load_1: ; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 ; GFX7-NEXT: s_mov_b32 s7, 2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index fceee413f3f97..2aa4f021c259c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -38,7 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX7-LABEL: private_nontemporal_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-WGP-LABEL: private_nontemporal_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -67,7 +70,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX10-CU-LABEL: private_nontemporal_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -107,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -121,7 +124,7 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -232,7 +235,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX7-LABEL: private_nontemporal_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -249,7 +255,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-WGP-LABEL: private_nontemporal_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -265,7 +271,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX10-CU-LABEL: private_nontemporal_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -309,7 +315,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -328,7 +334,7 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -470,7 +476,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX7-LABEL: private_nontemporal_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 @@ -484,7 +490,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-WGP-LABEL: private_nontemporal_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -498,7 +504,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX10-CU-LABEL: private_nontemporal_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -544,7 +550,7 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -647,7 +653,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX7-LABEL: private_nontemporal_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -663,7 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-WGP-LABEL: private_nontemporal_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -678,7 +684,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX10-CU-LABEL: private_nontemporal_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -713,7 +719,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -731,7 +737,7 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -874,7 +880,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX7-LABEL: private_nontemporal_volatile_load: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -889,7 +898,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-WGP-LABEL: private_nontemporal_volatile_load: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -903,7 +912,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX10-CU-LABEL: private_nontemporal_volatile_load: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -943,7 +952,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -957,7 +966,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17 ; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index f8fb7986938f2..df4193969f8a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -38,7 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX7-LABEL: private_volatile_load_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-WGP-LABEL: private_volatile_load_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -67,7 +70,7 @@ define amdgpu_kernel void @private_volatile_load_0( ; ; GFX10-CU-LABEL: private_volatile_load_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 @@ -190,7 +193,10 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX7-LABEL: private_volatile_load_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX7-NEXT: s_add_i32 s12, s12, s17 +; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 @@ -207,7 +213,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-WGP-LABEL: private_volatile_load_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -223,7 +229,7 @@ define amdgpu_kernel void @private_volatile_load_1( ; ; GFX10-CU-LABEL: private_volatile_load_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 @@ -365,7 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX7-LABEL: private_volatile_store_0: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2 @@ -380,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-WGP-LABEL: private_volatile_store_0: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -395,7 +401,7 @@ define amdgpu_kernel void @private_volatile_store_0( ; ; GFX10-CU-LABEL: private_volatile_store_0: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8 @@ -515,7 +521,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX7-LABEL: private_volatile_store_1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_add_u32 s0, s0, s15 +; GFX7-NEXT: s_add_u32 s0, s0, s17 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 @@ -532,7 +538,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-WGP-LABEL: private_volatile_store_1: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 +; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 @@ -548,7 +554,7 @@ define amdgpu_kernel void @private_volatile_store_1( ; ; GFX10-CU-LABEL: private_volatile_store_1: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 +; GFX10-CU-NEXT: s_add_u32 s0, s0, s17 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index aaf81e2fa4000..07072f6a36296 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -144,6 +150,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_sle_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -155,6 +164,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_sle_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -214,6 +226,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_imin_sle_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -225,6 +240,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_imin_sle_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -288,6 +306,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s3, s3, s7 ; CI-NEXT: s_min_i32 s2, s2, s6 @@ -306,6 +327,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s3, s3, s7 ; VI-NEXT: s_min_i32 s2, s2, s6 @@ -414,11 +438,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: s_sext_i32_i8 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_byte v[0:1], v2 @@ -429,11 +456,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: s_sext_i32_i8 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -549,6 +579,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 24 ; CI-NEXT: s_sext_i32_i8 s5, s2 @@ -572,6 +604,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_or_b32 s2, s3, s2 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -582,6 +615,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 24 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010 @@ -605,6 +640,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32] ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_or_b32 s2, s2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -757,6 +793,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; CI-LABEL: s_test_imin_sle_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s4, s2, 16 ; CI-NEXT: s_sext_i32_i16 s2, s2 @@ -776,6 +815,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; VI-LABEL: s_test_imin_sle_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s4, s2, 16 ; VI-NEXT: s_sext_i32_i16 s2, s2 @@ -857,6 +899,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_ashr_i32 s6, s0, 16 ; CI-NEXT: s_ashr_i32 s7, s1, 16 @@ -887,6 +932,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ashr_i32 s6, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 @@ -983,10 +1031,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1005,10 +1056,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1122,10 +1176,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1144,10 +1201,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1233,6 +1293,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_imin_slt_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1244,6 +1307,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_imin_slt_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1305,6 +1371,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s1, s1, s3 ; CI-NEXT: s_min_i32 s0, s0, s2 @@ -1319,6 +1388,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s1, s1, s3 ; VI-NEXT: s_min_i32 s0, s0, s2 @@ -1391,6 +1463,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1403,6 +1478,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1468,6 +1546,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_i32 s2, s2, 8 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1480,6 +1561,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_i32 s2, s2, 8 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1557,10 +1641,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1579,10 +1666,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1686,12 +1776,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v2, s5 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1710,12 +1803,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] @@ -1838,12 +1934,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -1874,12 +1973,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -1976,6 +2078,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ule_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -1987,6 +2092,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ule_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2059,10 +2167,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2081,10 +2192,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -2188,6 +2302,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s3 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 @@ -2209,6 +2326,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 @@ -2294,6 +2414,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; CI-LABEL: s_test_umin_ult_i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2305,6 +2428,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; VI-LABEL: s_test_umin_ult_i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2386,6 +2512,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i32_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2407,6 +2536,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i32_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0 @@ -2534,6 +2666,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; CI-LABEL: v_test_umin_ult_i16_multi_use: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 @@ -2556,6 +2691,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; VI-LABEL: v_test_umin_ult_i16_multi_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -2646,6 +2784,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; CI-LABEL: s_test_umin_ult_v1i32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 @@ -2657,6 +2798,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; VI-LABEL: s_test_umin_ult_v1i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2726,6 +2870,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; CI-LABEL: s_test_umin_ult_v8i32: ; CI: ; %bb.0: +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) @@ -2757,6 +2904,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 ; ; VI-LABEL: s_test_umin_ult_v8i32: ; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -2921,6 +3071,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s10, s0, 16 ; CI-NEXT: s_and_b32 s0, s0, 0xffff @@ -2967,6 +3120,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s10, s3, 16 ; VI-NEXT: s_and_b32 s3, s3, 0xffff @@ -3088,11 +3244,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s2, s2, 0xffff ; CI-NEXT: s_and_b32 s3, s3, 0xffff ; CI-NEXT: s_min_u32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3103,11 +3262,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: s_and_b32 s3, s3, 0xffff ; VI-NEXT: s_min_u32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3195,11 +3357,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; CI-NEXT: s_load_dword s2, s[8:9], 0xa ; CI-NEXT: s_load_dword s3, s[8:9], 0x13 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s2, s2 ; CI-NEXT: s_sext_i32_i16 s3, s3 ; CI-NEXT: s_min_i32 s2, s2, s3 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -3210,11 +3375,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace ; VI-NEXT: s_load_dword s2, s[8:9], 0x28 ; VI-NEXT: s_load_dword s3, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s2, s2 ; VI-NEXT: s_sext_i32_i16 s3, s3 ; VI-NEXT: s_min_i32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -3309,6 +3477,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i16 s3, s2 ; CI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3323,6 +3494,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i16 s3, s2 ; VI-NEXT: s_ashr_i32 s2, s2, 16 @@ -3403,6 +3577,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3421,6 +3598,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3510,6 +3690,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3528,6 +3711,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3617,6 +3803,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3635,6 +3824,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3724,6 +3916,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -3742,6 +3937,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s4 @@ -3855,9 +4053,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -3886,10 +4087,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -4005,9 +4209,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_load_dword v4, v[0:1] @@ -4035,10 +4242,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 337320b9eeea1..b1ce5a3423f20 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: s_add_i32 s12, s12, s17 +; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index bc1710686a087..5803821a1d2c0 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 { ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0 ; GFX803: ; %bb.0: ; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX803-NEXT: s_add_i32 s12, s12, s17 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_mov_b32_e32 v1, s1 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll index 2e9f09ad41813..7c9ecc892478c 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals ; Check that no attributes are added to graphics functions -; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s ; Check that it doesn't crash @@ -12,12 +11,6 @@ target datalayout = "A5" define amdgpu_cs void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() { -; AKF_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; AKF_GCN-NEXT: [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr -; AKF_GCN-NEXT: call amdgpu_gfx void [[FUN]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc() @@ -68,7 +61,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0 attributes #0 = { nounwind readnone speculatable willreturn } ;. -; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 1a34fa3bbbf4d..24677b60be6c2 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -10,32 +10,33 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908: bb.0 (%ir-block.0): ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} - ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %6 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7 - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def %7 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8 + + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; ; PEI-GFX908-LABEL: name: partial_copy ; PEI-GFX908: bb.0 (%ir-block.0): - ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX908-NEXT: {{ $}} - ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -44,7 +45,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -55,31 +56,31 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A: bb.0 (%ir-block.0): ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %6 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7 - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %7 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8 + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 ; ; PEI-GFX90A-LABEL: name: partial_copy ; PEI-GFX90A: bb.0 (%ir-block.0): - ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7 + ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9 ; PEI-GFX90A-NEXT: {{ $}} - ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 + ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 - ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) @@ -87,7 +88,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec - ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) + ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 00507c1eafd6e..c26f0926d86b2 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB0_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB0_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -54,17 +54,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: preload_unused_arg_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB1_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB1_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s12 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -90,7 +89,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: no_free_sgprs_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB2_0 ; GFX90a-NEXT: .p2align 8 @@ -100,7 +99,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] +; GFX90a-NEXT: global_store_dword v0, v1, s[14:15] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -181,7 +180,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: incorrect_type_i64_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB5_0 ; GFX90a-NEXT: .p2align 8 @@ -191,7 +190,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i64, ptr addrspace(4) %imp_arg_ptr @@ -217,7 +216,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; ; GFX90a-LABEL: incorrect_type_i16_block_count_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB6_0 ; GFX90a-NEXT: .p2align 8 @@ -227,7 +226,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i16, ptr addrspace(4) %imp_arg_ptr @@ -252,16 +251,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_y: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB7_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB7_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 @@ -289,7 +287,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: random_incorrect_offset: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB8_0 ; GFX90a-NEXT: .p2align 8 @@ -300,7 +298,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 @@ -327,17 +325,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preload_block_count_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB9_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB9_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s12 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 @@ -366,19 +363,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; ; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB10_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB10_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff -; GFX90a-NEXT: s_add_i32 s0, s10, s0 +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff +; GFX90a-NEXT: s_add_i32 s0, s12, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -408,19 +404,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_block_count_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB11_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB11_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 @@ -454,17 +449,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB12_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB12_0: -; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 @@ -492,17 +487,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_y: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB13_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 @@ -531,18 +526,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; ; GFX90a-LABEL: preload_workgroup_size_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB14_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 @@ -575,22 +570,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: preload_workgroup_size_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB15_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff -; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff +; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 @@ -628,18 +623,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preload_remainder_x: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB16_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB16_0: -; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s14, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 @@ -668,18 +663,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preloadremainder_y: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB17_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB17_0: -; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 @@ -708,18 +701,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { ; ; GFX90a-LABEL: preloadremainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB18_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB18_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 +; GFX90a-NEXT: s_lshr_b32 s0, s15, 16 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 @@ -752,22 +743,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; ; GFX90a-LABEL: preloadremainder_xyz: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB19_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB19_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 -; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s15, 16 +; GFX90a-NEXT: s_lshr_b32 s1, s14, 16 +; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s2 ; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 @@ -803,7 +792,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; ; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB20_0 ; GFX90a-NEXT: .p2align 8 @@ -814,7 +803,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] +; GFX90a-NEXT: global_store_dword v0, v1, s[14:15] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 @@ -844,10 +833,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % ; ; GFX90a-LABEL: preload_block_max_user_sgprs: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB21_0 ; GFX90a-NEXT: .p2align 8 @@ -857,7 +843,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg % ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr @@ -887,21 +873,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; ; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB22_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c +; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 ; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index fe6378435a42e..7ae0c11dca279 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) ; ; GFX90a-LABEL: ptr1_i8: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB0_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB0_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out @@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero ; ; GFX90a-LABEL: ptr1_i8_zext_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB1_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB1_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 ; ; GFX90a-LABEL: ptr1_i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB2_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB2_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out, align 4 @@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 ; ; GFX90a-LABEL: ptr1_i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB3_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB3_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store i32 %arg0, ptr addrspace(1) %out ret void @@ -160,18 +160,17 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa ; ; GFX90a-LABEL: i32_ptr1_i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB4_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB4_0: -; GFX90a-NEXT: s_add_i32 s0, s6, s10 +; GFX90a-NEXT: s_add_i32 s0, s8, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] +; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] ; GFX90a-NEXT: s_endpgm %add = add i32 %arg0, %arg1 store i32 %add, ptr addrspace(1) %out @@ -198,19 +197,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: ptr1_i16_i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB5_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB5_0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 -; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-NEXT: s_lshr_b32 s0, s10, 16 +; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff ; GFX90a-NEXT: s_add_i32 s0, s1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -236,16 +235,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 ; ; GFX90a-LABEL: ptr1_v2i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB6_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB6_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <2 x i8> %in, ptr addrspace(1) %out ret void @@ -274,7 +273,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; ; GFX90a-LABEL: byref_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB7_0 ; GFX90a-NEXT: .p2align 8 @@ -285,9 +284,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) -; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v2, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -320,7 +319,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: byref_staggered_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB8_0 ; GFX90a-NEXT: .p2align 8 @@ -331,9 +330,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, s1 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) -; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v2, s[8:9] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_endpgm %in = load i32, ptr addrspace(4) %in.byref @@ -370,26 +369,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x ; ; GFX90a-LABEL: v8i32_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB9_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB9_0: -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: v_mov_b32_e32 v2, s18 +; GFX90a-NEXT: v_mov_b32_e32 v3, s19 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 +; GFX90a-NEXT: s_nop 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 -; GFX90a-NEXT: s_nop 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90a-NEXT: s_endpgm store <8 x i32> %in, ptr addrspace(1) %out, align 4 ret void @@ -414,18 +413,17 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3i16_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB10_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB10_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x i16> %in, ptr addrspace(1) %out, align 4 ret void @@ -451,19 +449,17 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3i32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB11_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB11_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x i32> %in, ptr addrspace(1) %out, align 4 ret void @@ -489,19 +485,17 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o ; ; GFX90a-LABEL: v3f32_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB12_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB12_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x float> %in, ptr addrspace(1) %out, align 4 ret void @@ -533,25 +527,24 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou ; ; GFX90a-LABEL: v5i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB13_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <5 x i8> %in, ptr addrspace(1) %out, align 4 ret void @@ -587,29 +580,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x ; ; GFX90a-LABEL: v5f64_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB14_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB14_0: ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 +; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40 ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] +; GFX90a-NEXT: v_mov_b32_e32 v0, s16 +; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32 +; GFX90a-NEXT: v_mov_b32_e32 v1, s17 +; GFX90a-NEXT: v_mov_b32_e32 v2, s18 +; GFX90a-NEXT: v_mov_b32_e32 v3, s19 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 +; GFX90a-NEXT: s_nop 0 ; GFX90a-NEXT: v_mov_b32_e32 v0, s12 -; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 ; GFX90a-NEXT: v_mov_b32_e32 v1, s13 ; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, s15 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 -; GFX90a-NEXT: s_nop 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: v_mov_b32_e32 v3, s11 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90a-NEXT: s_endpgm store <5 x double> %in, ptr addrspace(1) %out, align 8 ret void @@ -647,31 +640,30 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8 ; ; GFX90a-LABEL: v8i8_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB15_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s1, s9, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s11, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_lshr_b32 s2, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s2, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s2, s2, 8 -; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010 -; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff +; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010 +; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s2, s3, s2 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 -; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s2, s2, 16 ; GFX90a-NEXT: s_or_b32 s1, s1, s2 ; GFX90a-NEXT: v_mov_b32_e32 v0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store <8 x i8> %in, ptr addrspace(1) %out ret void @@ -694,16 +686,15 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i ; ; GFX90a-LABEL: i64_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB16_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB16_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store i64 %a, ptr addrspace(1) %out, align 8 ret void @@ -726,16 +717,15 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d ; ; GFX90a-LABEL: f64_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB17_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB17_0: ; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1] +; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90a-NEXT: s_endpgm store double %in, ptr addrspace(1) %out ret void @@ -758,16 +748,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: half_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB18_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB18_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out ret void @@ -790,16 +780,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX90a-LABEL: bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB19_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB19_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store bfloat %in, ptr addrspace(1) %out ret void @@ -822,16 +812,16 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v2bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB20_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB20_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <2 x bfloat> %in, ptr addrspace(1) %out ret void @@ -856,18 +846,17 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v3bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB21_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB21_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <3 x bfloat> %in, ptr addrspace(1) %out ret void @@ -893,19 +882,17 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: v6bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB22_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <6 x bfloat> %in, ptr addrspace(1) %out ret void @@ -934,24 +921,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr ; ; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB23_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB23_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: global_store_short v3, v0, s[6:7] -; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 ; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: global_store_short v3, v0, s[8:9] +; GFX90a-NEXT: s_waitcnt lgkmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v0, s3 +; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12 +; GFX90a-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] ; GFX90a-NEXT: s_endpgm store half %in, ptr addrspace(1) %out store <7 x bfloat> %in2, ptr addrspace(1) %out2 @@ -976,17 +963,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 ; ; GFX90a-LABEL: i1_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB24_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB24_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 1 +; GFX90a-NEXT: s_and_b32 s0, s10, 1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] +; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store i1 %in, ptr addrspace(1) %out ret void @@ -1013,20 +1000,18 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: fp128_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB25_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB25_0: ; GFX90a-NEXT: v_mov_b32_e32 v4, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: v_mov_b32_e32 v3, s13 -; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: v_mov_b32_e32 v3, s15 +; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] ; GFX90a-NEXT: s_endpgm store fp128 %in, ptr addrspace(1) %out ret void @@ -1059,26 +1044,25 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: v7i8_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB26_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB26_0: -; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s1, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 -; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s1, s2, s1 -; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff ; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 ; GFX90a-NEXT: s_or_b32 s0, s0, s1 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm store <7 x i8> %in, ptr addrspace(1) %out ret void @@ -1106,21 +1090,19 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out ; ; GFX90a-LABEL: v7half_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB27_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB27_0: ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s13 -; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v0, s15 +; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12 +; GFX90a-NEXT: v_mov_b32_e32 v2, s14 +; GFX90a-NEXT: v_mov_b32_e32 v0, s12 +; GFX90a-NEXT: v_mov_b32_e32 v1, s13 +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9] ; GFX90a-NEXT: s_endpgm store <7 x half> %in, ptr addrspace(1) %out ret void @@ -1145,18 +1127,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: i16_i32_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB28_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB28_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[10:11] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] +; GFX90a-NEXT: v_mov_b32_e32 v1, s11 +; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i32 %in2, ptr addrspace(1) %out2 @@ -1184,22 +1166,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg % ; ; GFX90a-LABEL: i16_v3i32_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB29_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB29_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20 +; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v4, s8 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s11 -; GFX90a-NEXT: v_mov_b32_e32 v2, s12 -; GFX90a-NEXT: global_store_short v3, v4, s[6:7] +; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20 +; GFX90a-NEXT: v_mov_b32_e32 v4, s10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX90a-NEXT: v_mov_b32_e32 v0, s0 +; GFX90a-NEXT: v_mov_b32_e32 v1, s1 +; GFX90a-NEXT: v_mov_b32_e32 v2, s2 +; GFX90a-NEXT: global_store_short v3, v4, s[8:9] +; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <3 x i32> %in2, ptr addrspace(1) %out2 @@ -1224,17 +1206,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou ; ; GFX90a-LABEL: i16_i16_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB30_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB30_0: ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] +; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store i16 %in2, ptr addrspace(1) %out2 @@ -1264,22 +1246,22 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o ; ; GFX90a-LABEL: i16_v2i8_kernel_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB31_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB31_0: -; GFX90a-NEXT: s_lshr_b32 s0, s8, 24 +; GFX90a-NEXT: s_lshr_b32 s0, s10, 24 ; GFX90a-NEXT: s_lshl_b32 s0, s0, 8 -; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010 +; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010 ; GFX90a-NEXT: s_or_b32 s0, s1, s0 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] +; GFX90a-NEXT: v_mov_b32_e32 v1, s10 +; GFX90a-NEXT: global_store_short v0, v1, s[8:9] ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[10:11] +; GFX90a-NEXT: global_store_short v0, v1, s[12:13] ; GFX90a-NEXT: s_endpgm store i16 %in, ptr addrspace(1) %out store <2 x i8> %in2, ptr addrspace(1) %out2 @@ -1308,7 +1290,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; ; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB32_0 ; GFX90a-NEXT: .p2align 8 @@ -1318,7 +1300,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p ; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_add_i32 s2, s6, s2 +; GFX90a-NEXT: s_add_i32 s2, s8, s2 ; GFX90a-NEXT: v_mov_b32_e32 v1, s2 ; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] ; GFX90a-NEXT: s_endpgm @@ -1345,17 +1327,16 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, ; ; GFX90a-LABEL: ptr1_i8_trailing_unused: ; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 +; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: s_branch .LBB33_0 ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB33_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff +; GFX90a-NEXT: s_and_b32 s0, s10, 0xff ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] +; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] ; GFX90a-NEXT: s_endpgm %ext = zext i8 %arg0 to i32 store i32 %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 5474338514522..8f25e6519588b 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0x5a +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_sad_u32 v2, s2, v0, 20 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 @@ -79,12 +88,14 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: s_max_u32 s0, s0, s1 @@ -92,8 +103,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -115,19 +127,22 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_add_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NEXT: v_mov_b32_e32 v3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_sad_u32 v2, s0, v2, v3 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -147,21 +162,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_max_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -182,21 +200,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_min_pat1: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -218,21 +239,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_sub_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s1 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v3 ; GCN-NEXT: s_endpgm @@ -251,12 +275,14 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: v_sad_u32_multi_use_select_pat2: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b64 s[18:19], s[2:3] -; GCN-NEXT: s_mov_b64 s[16:17], s[0:1] +; GCN-NEXT: s_mov_b64 s[22:23], s[2:3] +; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 -; GCN-NEXT: s_add_u32 s16, s16, s15 -; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s20, s20, s17 +; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_min_u32 s3, s0, s1 ; GCN-NEXT: s_max_u32 s0, s0, s1 @@ -264,8 +290,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat1: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc ; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 @@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; GCN-LABEL: v_sad_u32_vector_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc ; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 @@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NEXT: s_lshr_b32 s0, s0, 16 @@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: v_sad_u32 v2, s4, v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: flat_store_short v[0:1], v2 ; GCN-NEXT: s_endpgm @@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i16_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: flat_load_ushort v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) { ; GCN-LABEL: v_sad_u32_i8_pat2: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 @@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s2, 0xff ; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008 @@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_max_u32 s6, s0, s1 ; GCN-NEXT: s_cmp_le_u32 s0, s1 @@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) % ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s3 ; GCN-NEXT: s_sub_i32 s6, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll index 884ba3fc34dff..29448ab2d822e 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 { ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v5, s3 ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v5, s3 ; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v5, s3 ; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 @@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll index 0ad10437299f4..90dfd5a21d107 100644 --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -20,179 +20,183 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: ; def s[2:3] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s2, 0 ; CHECK-NEXT: v_writelane_b32 v22, s3, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[48:51] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[4:11] +; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v22, s4, 2 ; CHECK-NEXT: v_writelane_b32 v22, s5, 3 ; CHECK-NEXT: v_writelane_b32 v22, s6, 4 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8 ; CHECK-NEXT: v_writelane_b32 v22, s7, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 6 -; CHECK-NEXT: v_writelane_b32 v22, s5, 7 -; CHECK-NEXT: v_writelane_b32 v22, s6, 8 -; CHECK-NEXT: v_writelane_b32 v22, s7, 9 -; CHECK-NEXT: v_writelane_b32 v22, s8, 10 -; CHECK-NEXT: v_writelane_b32 v22, s9, 11 -; CHECK-NEXT: v_writelane_b32 v22, s10, 12 -; CHECK-NEXT: v_writelane_b32 v22, s11, 13 +; CHECK-NEXT: v_writelane_b32 v22, s8, 6 +; CHECK-NEXT: v_writelane_b32 v22, s9, 7 +; CHECK-NEXT: v_writelane_b32 v22, s10, 8 +; CHECK-NEXT: v_writelane_b32 v22, s11, 9 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:19] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 14 -; CHECK-NEXT: v_writelane_b32 v22, s5, 15 -; CHECK-NEXT: v_writelane_b32 v22, s6, 16 -; CHECK-NEXT: v_writelane_b32 v22, s7, 17 -; CHECK-NEXT: v_writelane_b32 v22, s8, 18 -; CHECK-NEXT: v_writelane_b32 v22, s9, 19 -; CHECK-NEXT: v_writelane_b32 v22, s10, 20 -; CHECK-NEXT: v_writelane_b32 v22, s11, 21 -; CHECK-NEXT: v_writelane_b32 v22, s12, 22 -; CHECK-NEXT: v_writelane_b32 v22, s13, 23 -; CHECK-NEXT: v_writelane_b32 v22, s14, 24 -; CHECK-NEXT: v_writelane_b32 v22, s15, 25 -; CHECK-NEXT: v_writelane_b32 v22, s16, 26 -; CHECK-NEXT: v_writelane_b32 v22, s17, 27 -; CHECK-NEXT: v_writelane_b32 v22, s18, 28 -; CHECK-NEXT: v_writelane_b32 v22, s19, 29 +; CHECK-NEXT: v_writelane_b32 v22, s4, 10 +; CHECK-NEXT: v_writelane_b32 v22, s5, 11 +; CHECK-NEXT: v_writelane_b32 v22, s6, 12 +; CHECK-NEXT: v_writelane_b32 v22, s7, 13 +; CHECK-NEXT: v_writelane_b32 v22, s8, 14 +; CHECK-NEXT: v_writelane_b32 v22, s9, 15 +; CHECK-NEXT: v_writelane_b32 v22, s10, 16 +; CHECK-NEXT: v_writelane_b32 v22, s11, 17 +; CHECK-NEXT: v_writelane_b32 v22, s12, 18 +; CHECK-NEXT: v_writelane_b32 v22, s13, 19 +; CHECK-NEXT: v_writelane_b32 v22, s14, 20 +; CHECK-NEXT: v_writelane_b32 v22, s15, 21 +; CHECK-NEXT: v_writelane_b32 v22, s16, 22 +; CHECK-NEXT: v_writelane_b32 v22, s17, 23 +; CHECK-NEXT: v_writelane_b32 v22, s18, 24 +; CHECK-NEXT: v_writelane_b32 v22, s19, 25 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[42:43] +; CHECK-NEXT: ; def s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:55] +; CHECK-NEXT: ; def s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[4:11] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s4, 30 -; CHECK-NEXT: v_writelane_b32 v22, s5, 31 -; CHECK-NEXT: v_writelane_b32 v22, s6, 32 -; CHECK-NEXT: v_writelane_b32 v22, s7, 33 -; CHECK-NEXT: v_writelane_b32 v22, s8, 34 -; CHECK-NEXT: v_writelane_b32 v22, s9, 35 -; CHECK-NEXT: v_writelane_b32 v22, s10, 36 -; CHECK-NEXT: v_writelane_b32 v22, s11, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 26 +; CHECK-NEXT: v_writelane_b32 v22, s5, 27 +; CHECK-NEXT: v_writelane_b32 v22, s6, 28 +; CHECK-NEXT: v_writelane_b32 v22, s7, 29 +; CHECK-NEXT: v_writelane_b32 v22, s8, 30 +; CHECK-NEXT: v_writelane_b32 v22, s9, 31 +; CHECK-NEXT: v_writelane_b32 v22, s10, 32 +; CHECK-NEXT: v_writelane_b32 v22, s11, 33 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[40:41] +; CHECK-NEXT: ; def s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:39] +; CHECK-NEXT: ; def s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:51] +; CHECK-NEXT: ; def s[0:7] ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v22, s0, 34 +; CHECK-NEXT: v_writelane_b32 v22, s1, 35 +; CHECK-NEXT: v_writelane_b32 v22, s2, 36 +; CHECK-NEXT: v_writelane_b32 v22, s3, 37 +; CHECK-NEXT: v_writelane_b32 v22, s4, 38 +; CHECK-NEXT: v_writelane_b32 v22, s5, 39 +; CHECK-NEXT: v_writelane_b32 v22, s6, 40 +; CHECK-NEXT: v_writelane_b32 v22, s7, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 38 -; CHECK-NEXT: v_writelane_b32 v22, s1, 39 -; CHECK-NEXT: v_writelane_b32 v22, s2, 40 -; CHECK-NEXT: v_writelane_b32 v22, s3, 41 -; CHECK-NEXT: v_writelane_b32 v22, s4, 42 -; CHECK-NEXT: v_writelane_b32 v22, s5, 43 -; CHECK-NEXT: v_writelane_b32 v22, s6, 44 -; CHECK-NEXT: v_writelane_b32 v22, s7, 45 -; CHECK-NEXT: v_writelane_b32 v22, s8, 46 -; CHECK-NEXT: v_writelane_b32 v22, s9, 47 -; CHECK-NEXT: v_writelane_b32 v22, s10, 48 -; CHECK-NEXT: v_writelane_b32 v22, s11, 49 -; CHECK-NEXT: v_writelane_b32 v22, s12, 50 -; CHECK-NEXT: v_writelane_b32 v22, s13, 51 -; CHECK-NEXT: v_writelane_b32 v22, s14, 52 -; CHECK-NEXT: v_writelane_b32 v22, s15, 53 +; CHECK-NEXT: v_writelane_b32 v22, s0, 42 +; CHECK-NEXT: v_writelane_b32 v22, s1, 43 +; CHECK-NEXT: v_writelane_b32 v22, s2, 44 +; CHECK-NEXT: v_writelane_b32 v22, s3, 45 +; CHECK-NEXT: v_writelane_b32 v22, s4, 46 +; CHECK-NEXT: v_writelane_b32 v22, s5, 47 +; CHECK-NEXT: v_writelane_b32 v22, s6, 48 +; CHECK-NEXT: v_writelane_b32 v22, s7, 49 +; CHECK-NEXT: v_writelane_b32 v22, s8, 50 +; CHECK-NEXT: v_writelane_b32 v22, s9, 51 +; CHECK-NEXT: v_writelane_b32 v22, s10, 52 +; CHECK-NEXT: v_writelane_b32 v22, s11, 53 +; CHECK-NEXT: v_writelane_b32 v22, s12, 54 +; CHECK-NEXT: v_writelane_b32 v22, s13, 55 +; CHECK-NEXT: v_writelane_b32 v22, s14, 56 +; CHECK-NEXT: v_writelane_b32 v22, s15, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v22, s0, 54 -; CHECK-NEXT: v_writelane_b32 v22, s1, 55 -; CHECK-NEXT: v_writelane_b32 v22, s2, 56 -; CHECK-NEXT: v_writelane_b32 v22, s3, 57 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v22, s0, 58 ; CHECK-NEXT: v_writelane_b32 v22, s1, 59 ; CHECK-NEXT: v_writelane_b32 v22, s2, 60 -; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane ; CHECK-NEXT: v_writelane_b32 v22, s3, 61 -; CHECK-NEXT: v_writelane_b32 v22, s4, 62 -; CHECK-NEXT: v_writelane_b32 v23, s6, 0 -; CHECK-NEXT: v_writelane_b32 v22, s5, 63 -; CHECK-NEXT: v_writelane_b32 v23, s7, 1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v22, s0, 62 +; CHECK-NEXT: v_writelane_b32 v23, s2, 0 +; CHECK-NEXT: v_writelane_b32 v23, s3, 1 +; CHECK-NEXT: v_writelane_b32 v23, s4, 2 +; CHECK-NEXT: v_writelane_b32 v23, s5, 3 +; CHECK-NEXT: v_writelane_b32 v23, s6, 4 +; CHECK-NEXT: v_writelane_b32 v22, s1, 63 +; CHECK-NEXT: v_writelane_b32 v23, s7, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 2 -; CHECK-NEXT: v_writelane_b32 v23, s1, 3 -; CHECK-NEXT: v_writelane_b32 v23, s2, 4 -; CHECK-NEXT: v_writelane_b32 v23, s3, 5 -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 -; CHECK-NEXT: v_writelane_b32 v23, s12, 14 -; CHECK-NEXT: v_writelane_b32 v23, s13, 15 -; CHECK-NEXT: v_writelane_b32 v23, s14, 16 -; CHECK-NEXT: v_writelane_b32 v23, s15, 17 +; CHECK-NEXT: v_writelane_b32 v23, s0, 6 +; CHECK-NEXT: v_writelane_b32 v23, s1, 7 +; CHECK-NEXT: v_writelane_b32 v23, s2, 8 +; CHECK-NEXT: v_writelane_b32 v23, s3, 9 +; CHECK-NEXT: v_writelane_b32 v23, s4, 10 +; CHECK-NEXT: v_writelane_b32 v23, s5, 11 +; CHECK-NEXT: v_writelane_b32 v23, s6, 12 +; CHECK-NEXT: v_writelane_b32 v23, s7, 13 +; CHECK-NEXT: v_writelane_b32 v23, s8, 14 +; CHECK-NEXT: v_writelane_b32 v23, s9, 15 +; CHECK-NEXT: v_writelane_b32 v23, s10, 16 +; CHECK-NEXT: v_writelane_b32 v23, s11, 17 +; CHECK-NEXT: v_writelane_b32 v23, s12, 18 +; CHECK-NEXT: v_writelane_b32 v23, s13, 19 +; CHECK-NEXT: v_writelane_b32 v23, s14, 20 +; CHECK-NEXT: v_writelane_b32 v23, s15, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 18 -; CHECK-NEXT: v_writelane_b32 v23, s1, 19 +; CHECK-NEXT: v_writelane_b32 v23, s0, 22 +; CHECK-NEXT: v_writelane_b32 v23, s1, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 20 -; CHECK-NEXT: v_writelane_b32 v23, s1, 21 -; CHECK-NEXT: v_writelane_b32 v23, s2, 22 -; CHECK-NEXT: v_writelane_b32 v23, s3, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_writelane_b32 v23, s0, 24 ; CHECK-NEXT: v_writelane_b32 v23, s1, 25 ; CHECK-NEXT: v_writelane_b32 v23, s2, 26 ; CHECK-NEXT: v_writelane_b32 v23, s3, 27 -; CHECK-NEXT: v_writelane_b32 v23, s4, 28 -; CHECK-NEXT: v_writelane_b32 v23, s5, 29 -; CHECK-NEXT: v_writelane_b32 v23, s6, 30 -; CHECK-NEXT: v_writelane_b32 v23, s7, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; def s[0:7] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_writelane_b32 v23, s0, 28 +; CHECK-NEXT: v_writelane_b32 v23, s1, 29 +; CHECK-NEXT: v_writelane_b32 v23, s2, 30 +; CHECK-NEXT: v_writelane_b32 v23, s3, 31 +; CHECK-NEXT: v_writelane_b32 v23, s4, 32 +; CHECK-NEXT: v_writelane_b32 v23, s5, 33 +; CHECK-NEXT: v_writelane_b32 v23, s6, 34 +; CHECK-NEXT: v_writelane_b32 v23, s7, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 32 -; CHECK-NEXT: v_writelane_b32 v23, s1, 33 -; CHECK-NEXT: v_writelane_b32 v23, s2, 34 -; CHECK-NEXT: v_writelane_b32 v23, s3, 35 -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 -; CHECK-NEXT: v_writelane_b32 v23, s12, 44 -; CHECK-NEXT: v_writelane_b32 v23, s13, 45 -; CHECK-NEXT: v_writelane_b32 v23, s14, 46 -; CHECK-NEXT: v_writelane_b32 v23, s15, 47 +; CHECK-NEXT: v_writelane_b32 v23, s0, 36 +; CHECK-NEXT: v_writelane_b32 v23, s1, 37 +; CHECK-NEXT: v_writelane_b32 v23, s2, 38 +; CHECK-NEXT: v_writelane_b32 v23, s3, 39 +; CHECK-NEXT: v_writelane_b32 v23, s4, 40 +; CHECK-NEXT: v_writelane_b32 v23, s5, 41 +; CHECK-NEXT: v_writelane_b32 v23, s6, 42 +; CHECK-NEXT: v_writelane_b32 v23, s7, 43 +; CHECK-NEXT: v_writelane_b32 v23, s8, 44 +; CHECK-NEXT: v_writelane_b32 v23, s9, 45 +; CHECK-NEXT: v_writelane_b32 v23, s10, 46 +; CHECK-NEXT: v_writelane_b32 v23, s11, 47 +; CHECK-NEXT: v_writelane_b32 v23, s12, 48 +; CHECK-NEXT: v_writelane_b32 v23, s13, 49 +; CHECK-NEXT: v_writelane_b32 v23, s14, 50 +; CHECK-NEXT: v_writelane_b32 v23, s15, 51 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %ret ; CHECK-NEXT: s_endpgm @@ -206,166 +210,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { ; CHECK-NEXT: v_readlane_b32 s1, v22, 3 ; CHECK-NEXT: v_readlane_b32 s2, v22, 4 ; CHECK-NEXT: v_readlane_b32 s3, v22, 5 +; CHECK-NEXT: v_readlane_b32 s4, v22, 6 +; CHECK-NEXT: v_readlane_b32 s5, v22, 7 +; CHECK-NEXT: v_readlane_b32 s6, v22, 8 +; CHECK-NEXT: v_readlane_b32 s7, v22, 9 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ; use s[48:51] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 6 -; CHECK-NEXT: v_readlane_b32 s1, v22, 7 -; CHECK-NEXT: v_readlane_b32 s2, v22, 8 -; CHECK-NEXT: v_readlane_b32 s3, v22, 9 -; CHECK-NEXT: v_readlane_b32 s4, v22, 10 -; CHECK-NEXT: v_readlane_b32 s5, v22, 11 -; CHECK-NEXT: v_readlane_b32 s6, v22, 12 -; CHECK-NEXT: v_readlane_b32 s7, v22, 13 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 14 -; CHECK-NEXT: v_readlane_b32 s1, v22, 15 -; CHECK-NEXT: v_readlane_b32 s2, v22, 16 -; CHECK-NEXT: v_readlane_b32 s3, v22, 17 -; CHECK-NEXT: v_readlane_b32 s4, v22, 18 -; CHECK-NEXT: v_readlane_b32 s5, v22, 19 -; CHECK-NEXT: v_readlane_b32 s6, v22, 20 -; CHECK-NEXT: v_readlane_b32 s7, v22, 21 -; CHECK-NEXT: v_readlane_b32 s8, v22, 22 -; CHECK-NEXT: v_readlane_b32 s9, v22, 23 -; CHECK-NEXT: v_readlane_b32 s10, v22, 24 -; CHECK-NEXT: v_readlane_b32 s11, v22, 25 -; CHECK-NEXT: v_readlane_b32 s12, v22, 26 -; CHECK-NEXT: v_readlane_b32 s13, v22, 27 -; CHECK-NEXT: v_readlane_b32 s14, v22, 28 -; CHECK-NEXT: v_readlane_b32 s15, v22, 29 +; CHECK-NEXT: v_readlane_b32 s0, v22, 10 +; CHECK-NEXT: v_readlane_b32 s1, v22, 11 +; CHECK-NEXT: v_readlane_b32 s2, v22, 12 +; CHECK-NEXT: v_readlane_b32 s3, v22, 13 +; CHECK-NEXT: v_readlane_b32 s4, v22, 14 +; CHECK-NEXT: v_readlane_b32 s5, v22, 15 +; CHECK-NEXT: v_readlane_b32 s6, v22, 16 +; CHECK-NEXT: v_readlane_b32 s7, v22, 17 +; CHECK-NEXT: v_readlane_b32 s8, v22, 18 +; CHECK-NEXT: v_readlane_b32 s9, v22, 19 +; CHECK-NEXT: v_readlane_b32 s10, v22, 20 +; CHECK-NEXT: v_readlane_b32 s11, v22, 21 +; CHECK-NEXT: v_readlane_b32 s12, v22, 22 +; CHECK-NEXT: v_readlane_b32 s13, v22, 23 +; CHECK-NEXT: v_readlane_b32 s14, v22, 24 +; CHECK-NEXT: v_readlane_b32 s15, v22, 25 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 30 -; CHECK-NEXT: v_readlane_b32 s1, v22, 31 -; CHECK-NEXT: v_readlane_b32 s2, v22, 32 -; CHECK-NEXT: v_readlane_b32 s3, v22, 33 -; CHECK-NEXT: v_readlane_b32 s4, v22, 34 -; CHECK-NEXT: v_readlane_b32 s5, v22, 35 -; CHECK-NEXT: v_readlane_b32 s6, v22, 36 -; CHECK-NEXT: v_readlane_b32 s7, v22, 37 +; CHECK-NEXT: v_readlane_b32 s0, v22, 26 +; CHECK-NEXT: v_readlane_b32 s1, v22, 27 +; CHECK-NEXT: v_readlane_b32 s2, v22, 28 +; CHECK-NEXT: v_readlane_b32 s3, v22, 29 +; CHECK-NEXT: v_readlane_b32 s4, v22, 30 +; CHECK-NEXT: v_readlane_b32 s5, v22, 31 +; CHECK-NEXT: v_readlane_b32 s6, v22, 32 +; CHECK-NEXT: v_readlane_b32 s7, v22, 33 ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[42:43] +; CHECK-NEXT: ; use s[38:39] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:55] +; CHECK-NEXT: ; use s[44:47] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 38 -; CHECK-NEXT: v_readlane_b32 s1, v22, 39 -; CHECK-NEXT: v_readlane_b32 s2, v22, 40 -; CHECK-NEXT: v_readlane_b32 s3, v22, 41 +; CHECK-NEXT: v_readlane_b32 s0, v22, 34 +; CHECK-NEXT: v_readlane_b32 s1, v22, 35 +; CHECK-NEXT: v_readlane_b32 s2, v22, 36 +; CHECK-NEXT: v_readlane_b32 s3, v22, 37 +; CHECK-NEXT: v_readlane_b32 s4, v22, 38 +; CHECK-NEXT: v_readlane_b32 s5, v22, 39 +; CHECK-NEXT: v_readlane_b32 s6, v22, 40 +; CHECK-NEXT: v_readlane_b32 s7, v22, 41 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[16:31] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[40:41] +; CHECK-NEXT: ; use s[36:37] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:39] +; CHECK-NEXT: ; use s[40:43] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:51] +; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s4, v22, 42 -; CHECK-NEXT: v_readlane_b32 s5, v22, 43 -; CHECK-NEXT: v_readlane_b32 s6, v22, 44 -; CHECK-NEXT: v_readlane_b32 s7, v22, 45 -; CHECK-NEXT: v_readlane_b32 s8, v22, 46 -; CHECK-NEXT: v_readlane_b32 s9, v22, 47 -; CHECK-NEXT: v_readlane_b32 s10, v22, 48 -; CHECK-NEXT: v_readlane_b32 s11, v22, 49 -; CHECK-NEXT: v_readlane_b32 s12, v22, 50 -; CHECK-NEXT: v_readlane_b32 s13, v22, 51 -; CHECK-NEXT: v_readlane_b32 s14, v22, 52 -; CHECK-NEXT: v_readlane_b32 s15, v22, 53 +; CHECK-NEXT: v_readlane_b32 s0, v22, 42 +; CHECK-NEXT: v_readlane_b32 s1, v22, 43 +; CHECK-NEXT: v_readlane_b32 s2, v22, 44 +; CHECK-NEXT: v_readlane_b32 s3, v22, 45 +; CHECK-NEXT: v_readlane_b32 s4, v22, 46 +; CHECK-NEXT: v_readlane_b32 s5, v22, 47 +; CHECK-NEXT: v_readlane_b32 s6, v22, 48 +; CHECK-NEXT: v_readlane_b32 s7, v22, 49 +; CHECK-NEXT: v_readlane_b32 s8, v22, 50 +; CHECK-NEXT: v_readlane_b32 s9, v22, 51 +; CHECK-NEXT: v_readlane_b32 s10, v22, 52 +; CHECK-NEXT: v_readlane_b32 s11, v22, 53 +; CHECK-NEXT: v_readlane_b32 s12, v22, 54 +; CHECK-NEXT: v_readlane_b32 s13, v22, 55 +; CHECK-NEXT: v_readlane_b32 s14, v22, 56 +; CHECK-NEXT: v_readlane_b32 s15, v22, 57 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 54 -; CHECK-NEXT: v_readlane_b32 s1, v22, 55 -; CHECK-NEXT: v_readlane_b32 s2, v22, 56 -; CHECK-NEXT: v_readlane_b32 s3, v22, 57 +; CHECK-NEXT: v_readlane_b32 s0, v22, 58 +; CHECK-NEXT: v_readlane_b32 s1, v22, 59 +; CHECK-NEXT: v_readlane_b32 s2, v22, 60 +; CHECK-NEXT: v_readlane_b32 s3, v22, 61 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[34:35] ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:3] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v22, 58 -; CHECK-NEXT: v_readlane_b32 s1, v22, 59 -; CHECK-NEXT: v_readlane_b32 s2, v22, 60 -; CHECK-NEXT: v_readlane_b32 s3, v22, 61 -; CHECK-NEXT: v_readlane_b32 s4, v22, 62 -; CHECK-NEXT: v_readlane_b32 s5, v22, 63 -; CHECK-NEXT: v_readlane_b32 s6, v23, 0 -; CHECK-NEXT: v_readlane_b32 s7, v23, 1 +; CHECK-NEXT: v_readlane_b32 s0, v22, 62 +; CHECK-NEXT: v_readlane_b32 s1, v22, 63 +; CHECK-NEXT: v_readlane_b32 s2, v23, 0 +; CHECK-NEXT: v_readlane_b32 s3, v23, 1 +; CHECK-NEXT: v_readlane_b32 s4, v23, 2 +; CHECK-NEXT: v_readlane_b32 s5, v23, 3 +; CHECK-NEXT: v_readlane_b32 s6, v23, 4 +; CHECK-NEXT: v_readlane_b32 s7, v23, 5 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 -; CHECK-NEXT: v_readlane_b32 s4, v23, 6 -; CHECK-NEXT: v_readlane_b32 s5, v23, 7 -; CHECK-NEXT: v_readlane_b32 s6, v23, 8 -; CHECK-NEXT: v_readlane_b32 s7, v23, 9 -; CHECK-NEXT: v_readlane_b32 s8, v23, 10 -; CHECK-NEXT: v_readlane_b32 s9, v23, 11 -; CHECK-NEXT: v_readlane_b32 s10, v23, 12 -; CHECK-NEXT: v_readlane_b32 s11, v23, 13 -; CHECK-NEXT: v_readlane_b32 s12, v23, 14 -; CHECK-NEXT: v_readlane_b32 s13, v23, 15 -; CHECK-NEXT: v_readlane_b32 s14, v23, 16 -; CHECK-NEXT: v_readlane_b32 s15, v23, 17 +; CHECK-NEXT: v_readlane_b32 s0, v23, 6 +; CHECK-NEXT: v_readlane_b32 s1, v23, 7 +; CHECK-NEXT: v_readlane_b32 s2, v23, 8 +; CHECK-NEXT: v_readlane_b32 s3, v23, 9 +; CHECK-NEXT: v_readlane_b32 s4, v23, 10 +; CHECK-NEXT: v_readlane_b32 s5, v23, 11 +; CHECK-NEXT: v_readlane_b32 s6, v23, 12 +; CHECK-NEXT: v_readlane_b32 s7, v23, 13 +; CHECK-NEXT: v_readlane_b32 s8, v23, 14 +; CHECK-NEXT: v_readlane_b32 s9, v23, 15 +; CHECK-NEXT: v_readlane_b32 s10, v23, 16 +; CHECK-NEXT: v_readlane_b32 s11, v23, 17 +; CHECK-NEXT: v_readlane_b32 s12, v23, 18 +; CHECK-NEXT: v_readlane_b32 s13, v23, 19 +; CHECK-NEXT: v_readlane_b32 s14, v23, 20 +; CHECK-NEXT: v_readlane_b32 s15, v23, 21 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 18 -; CHECK-NEXT: v_readlane_b32 s1, v23, 19 +; CHECK-NEXT: v_readlane_b32 s0, v23, 22 +; CHECK-NEXT: v_readlane_b32 s1, v23, 23 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:1] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 20 -; CHECK-NEXT: v_readlane_b32 s1, v23, 21 -; CHECK-NEXT: v_readlane_b32 s2, v23, 22 -; CHECK-NEXT: v_readlane_b32 s3, v23, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_readlane_b32 s0, v23, 24 ; CHECK-NEXT: v_readlane_b32 s1, v23, 25 ; CHECK-NEXT: v_readlane_b32 s2, v23, 26 ; CHECK-NEXT: v_readlane_b32 s3, v23, 27 -; CHECK-NEXT: v_readlane_b32 s4, v23, 28 -; CHECK-NEXT: v_readlane_b32 s5, v23, 29 -; CHECK-NEXT: v_readlane_b32 s6, v23, 30 -; CHECK-NEXT: v_readlane_b32 s7, v23, 31 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:3] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s0, v23, 28 +; CHECK-NEXT: v_readlane_b32 s1, v23, 29 +; CHECK-NEXT: v_readlane_b32 s2, v23, 30 +; CHECK-NEXT: v_readlane_b32 s3, v23, 31 +; CHECK-NEXT: v_readlane_b32 s4, v23, 32 +; CHECK-NEXT: v_readlane_b32 s5, v23, 33 +; CHECK-NEXT: v_readlane_b32 s6, v23, 34 +; CHECK-NEXT: v_readlane_b32 s7, v23, 35 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:7] ; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 -; CHECK-NEXT: v_readlane_b32 s4, v23, 36 -; CHECK-NEXT: v_readlane_b32 s5, v23, 37 -; CHECK-NEXT: v_readlane_b32 s6, v23, 38 -; CHECK-NEXT: v_readlane_b32 s7, v23, 39 -; CHECK-NEXT: v_readlane_b32 s8, v23, 40 -; CHECK-NEXT: v_readlane_b32 s9, v23, 41 -; CHECK-NEXT: v_readlane_b32 s10, v23, 42 -; CHECK-NEXT: v_readlane_b32 s11, v23, 43 -; CHECK-NEXT: v_readlane_b32 s12, v23, 44 -; CHECK-NEXT: v_readlane_b32 s13, v23, 45 -; CHECK-NEXT: v_readlane_b32 s14, v23, 46 -; CHECK-NEXT: v_readlane_b32 s15, v23, 47 +; CHECK-NEXT: v_readlane_b32 s0, v23, 36 +; CHECK-NEXT: v_readlane_b32 s1, v23, 37 +; CHECK-NEXT: v_readlane_b32 s2, v23, 38 +; CHECK-NEXT: v_readlane_b32 s3, v23, 39 +; CHECK-NEXT: v_readlane_b32 s4, v23, 40 +; CHECK-NEXT: v_readlane_b32 s5, v23, 41 +; CHECK-NEXT: v_readlane_b32 s6, v23, 42 +; CHECK-NEXT: v_readlane_b32 s7, v23, 43 +; CHECK-NEXT: v_readlane_b32 s8, v23, 44 +; CHECK-NEXT: v_readlane_b32 s9, v23, 45 +; CHECK-NEXT: v_readlane_b32 s10, v23, 46 +; CHECK-NEXT: v_readlane_b32 s11, v23, 47 +; CHECK-NEXT: v_readlane_b32 s12, v23, 48 +; CHECK-NEXT: v_readlane_b32 s13, v23, 49 +; CHECK-NEXT: v_readlane_b32 s14, v23, 50 +; CHECK-NEXT: v_readlane_b32 s15, v23, 51 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; use s[0:15] ; CHECK-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll index 455d22f2aa29c..cdfba3cf0db7f 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -7,7 +7,7 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s0, s15 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index a423b6f831a9d..65a17ed67481c 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, s4, 64 ; GCN-NEXT: s_sub_i32 s12, 64, s4 @@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s5, 64, s4 ; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 @@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 @@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index 8531b2ad4e405..3c47e2504747d 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s @@ -7,9 +6,6 @@ target datalayout = "A5" define internal void @indirect() { -; AKF_GCN-LABEL: define {{[^@]+}}@indirect() { -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect ; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: ret void @@ -22,15 +18,6 @@ define internal void @indirect() { } define amdgpu_kernel void @test_simple_indirect_call() { -; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call -; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] { -; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) -; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr -; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8 -; AKF_GCN-NEXT: call void [[FP]]() -; AKF_GCN-NEXT: ret void -; ; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call ; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] { ; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -79,12 +66,10 @@ define amdgpu_kernel void @test_simple_indirect_call() { !llvm.module.flags = !{!0} !0 = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. -; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. -; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. ; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 8129a7ac51df9..d71d0f78fe1c3 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; CI-NEXT: v_mov_b32_e32 v3, s1 @@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) { ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_bitcmp1_b32 s2, 0 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; CI-LABEL: s_sint_to_fp_i64_to_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_sint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_sext_i32_i8 s2, s2 ; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_sext_i32_i8 s2, s2 ; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2 @@ -231,11 +267,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -245,11 +284,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -282,11 +324,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -296,11 +341,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -352,11 +400,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; CI: ; %bb.0: ; CI-NEXT: s_load_dword s2, s[8:9], 0x2 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_add_i32 s12, s12, s17 +; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_cmp_eq_u32 s2, 0 ; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -366,11 +417,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index 5ae339454a0ba..bd255e88b9512 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0 + ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) - ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13 + ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) + ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %14 ; GCN-NEXT: S_ENDPGM 0 %v0 = call i32 asm sideeffect "; def $0", "=v"() %tmp = insertelement <2 x i32> poison, i32 %v0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f791135d45e9a..ef92cf3214e7f 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: +; HAWAII-NEXT: s_add_i32 s12, s12, s17 ; HAWAII-NEXT: s_or_b32 s0, s8, 14 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s9 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] @@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: +; FIJI-NEXT: s_add_i32 s12, s12, s17 ; FIJI-NEXT: s_or_b32 s0, s8, 14 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s9 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll index 19d633651fdd0..30accc846d2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll index 2097579e0c995..4f84b31f1877b 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................ +; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 5 diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll index 775c62e73261a..644f434923368 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 { ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................ -; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @............... +; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!....... ; ELF: AMDGPU Metadata ; ELF: .sgpr_count: 9 diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll index b8f0d7617167e..69cc63eba6243 100644 --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) { ; HSA-TRAP-GFX803-LABEL: trap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3 +; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) ; HSA-TRAP-GFX803-NEXT: s_trap 2 @@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a ; HSA-TRAP-GFX803-LABEL: non_entry_trap: ; HSA-TRAP-GFX803: ; %bb.0: ; %entry ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 @@ -280,6 +286,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7] ; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5 @@ -411,10 +420,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) ; HSA-TRAP-GFX803-LABEL: debugtrap: ; HSA-TRAP-GFX803: ; %bb.0: ; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17 +; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1 -; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0 +; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 6e29536feb51b..660ff4677547a 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-LABEL: udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 @@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: udiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 16 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac ; GCN-LABEL: udiv_i32_div_pow2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: udiv_i32_div_k_even: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa ; GCN-LABEL: udiv_i32_div_k_odd: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-LABEL: v_udiv_i8: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i23: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: v_udiv_i24: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_add_u32 s4, s2, 4 ; GCN-NEXT: s_addc_u32 s5, s3, 0 @@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read ; GCN-LABEL: scalarize_mulhu_4xi32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; GCN-LABEL: test_udiv2: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, s0, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 @@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 @@ -2378,6 +2420,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GCN-LABEL: fdiv_test_denormals: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index 55cbc14a46706..97738a7944741 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; SI-LABEL: s_uint_to_fp_i64_to_f64: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i ; VI-LABEL: s_uint_to_fp_i64_to_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2 @@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1 @@ -128,6 +146,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3 ; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1 @@ -160,6 +181,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7 ; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5 @@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; SI-NEXT: v_mov_b32_e32 v3, s1 @@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; VI-NEXT: v_mov_b32_e32 v3, s1 @@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3 ; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 ; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3 @@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bitcmp1_b32 s2, 0 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 % ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bitcmp1_b32 s2, 0 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_and_b32 s2, s2, 0xff ; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s2, s2, 0xff ; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 @@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_load_dword s2, s[8:9], 0x2 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_add_i32 s12, s12, s17 +; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; SI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s2, 0 ; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v2, s0 ; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[8:9], 0x8 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_add_i32 s12, s12, s17 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s2, 0 ; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll index 45ea6b62761cc..ab7e85fdff516 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -11,7 +11,7 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() { ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 s0, s0, s15 +; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll index e99a06f497016..1bc25a1386074 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll @@ -95,7 +95,7 @@ ; Function Attrs: convergent nocallback nofree nounwind willreturn declare void @llvm.amdgcn.end.cf.i64(i64) #2 - attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } + attributes #0 = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #2 = { convergent nocallback nofree nounwind willreturn } attributes #3 = { convergent nocallback nofree nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll index 076d7c9cd8842..0515ffa094329 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll @@ -69,5 +69,5 @@ bb4: ret void } -attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone }