Skip to content

Commit 91cb8f5

Browse files
authored
[NVPTX] Add tcgen05 alloc/dealloc intrinsics (#124961)
This patch adds intrinsics for the tcgen05 alloc/dealloc family of PTX instructions. This patch also adds an addrspace 6 for tensor memory which is used by these intrinsics. lit tests are added and verified with a ptxas-12.8 executable. Documentation for these additions is also added in NVPTXUsage.rst. Signed-off-by: Durgadoss R <[email protected]>
1 parent b53da77 commit 91cb8f5

File tree

10 files changed

+327
-6
lines changed

10 files changed

+327
-6
lines changed

clang/lib/Basic/Targets/NVPTX.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,13 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
6262
HasFloat16 = true;
6363

6464
if (TargetPointerWidth == 32)
65-
resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
66-
else if (Opts.NVPTXUseShortPointers)
6765
resetDataLayout(
68-
"e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
66+
"e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
67+
else if (Opts.NVPTXUseShortPointers)
68+
resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
69+
"16-v32:32-n16:32:64");
6970
else
70-
resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
71+
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
7172

7273
// If possible, get a TargetInfo for our host triple, so we can match its
7374
// types.

clang/test/CodeGen/target-data.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,11 @@
160160

161161
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
162162
// RUN: FileCheck %s -check-prefix=NVPTX
163-
// NVPTX: target datalayout = "e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
163+
// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
164164

165165
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
166166
// RUN: FileCheck %s -check-prefix=NVPTX64
167-
// NVPTX64: target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
167+
// NVPTX64: target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
168168

169169
// RUN: %clang_cc1 -triple r600-unknown -o - -emit-llvm %s | \
170170
// RUN: FileCheck %s -check-prefix=R600

llvm/docs/NVPTXUsage.rst

+98
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,104 @@ The ``griddepcontrol`` intrinsics allows the dependent grids and prerequisite gr
962962
For more information, refer
963963
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol>`__.
964964

965+
TCGEN05 family of Intrinsics
966+
----------------------------
967+
968+
The llvm.nvvm.tcgen05.* intrinsics model the TCGEN05 family of instructions
969+
exposed by PTX. These intrinsics use 'Tensor Memory' (henceforth ``tmem``).
970+
NVPTX represents this memory using ``addrspace(6)`` and is always 32-bits.
971+
972+
For more information, refer to the PTX ISA
973+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory>`_.
974+
975+
The tensor-memory pointers may only be used with the tcgen05 intrinsics.
976+
There are specialized load/store instructions provided (tcgen05.ld/st) to
977+
work with tensor-memory.
978+
979+
See the PTX ISA for more information on tensor-memory load/store instructions
980+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-and-register-load-store-instructions>`_.
981+
982+
'``llvm.nvvm.tcgen05.alloc``'
983+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
984+
985+
Syntax:
986+
"""""""
987+
988+
.. code-block:: llvm
989+
990+
declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %dst, i32 %ncols)
991+
declare void @llvm.nvvm.tcgen05.alloc.cg2(ptr %dst, i32 %ncols)
992+
declare void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %dst, i32 %ncols)
993+
declare void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %dst, i32 %ncols)
994+
995+
Overview:
996+
"""""""""
997+
998+
The '``@llvm.nvvm.tcgen05.alloc.*``' intrinsics correspond to the
999+
``tcgen05.alloc.cta_group*.sync.aligned.b32`` family of PTX instructions.
1000+
The ``tcgen05.alloc`` is a potentially blocking instruction which dynamically
1001+
allocates the specified number of columns in the Tensor Memory and writes
1002+
the address of the allocated Tensor Memory into shared memory at the
1003+
location specified by ``%dst``. The 32-bit operand ``%ncols`` specifies
1004+
the number of columns to be allocated and it must be a power-of-two.
1005+
The ``.shared`` variant explicitly uses shared memory address space for
1006+
the ``%dst`` operand. The ``.cg1`` and ``.cg2`` variants generate
1007+
``cta_group::1`` and ``cta_group::2`` variants of the instruction respectively.
1008+
1009+
For more information, refer to the PTX ISA
1010+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_.
1011+
1012+
'``llvm.nvvm.tcgen05.dealloc``'
1013+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1014+
1015+
Syntax:
1016+
"""""""
1017+
1018+
.. code-block:: llvm
1019+
1020+
declare void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %tmem_addr, i32 %ncols)
1021+
declare void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %tmem_addr, i32 %ncols)
1022+
1023+
Overview:
1024+
"""""""""
1025+
1026+
The '``@llvm.nvvm.tcgen05.dealloc.*``' intrinsics correspond to the
1027+
``tcgen05.dealloc.*`` set of PTX instructions. The ``tcgen05.dealloc``
1028+
instructions deallocates the Tensor Memory specified by the Tensor Memory
1029+
address ``%tmem_addr``. The operand ``%tmem_addr`` must point to a previous
1030+
Tensor Memory allocation. The 32-bit operand ``%ncols`` specifies the number
1031+
of columns to be de-allocated. The ``.cg1`` and ``.cg2`` variants generate
1032+
``cta_group::1`` and ``cta_group::2`` variants of the instruction respectively.
1033+
1034+
For more information, refer to the PTX ISA
1035+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_.
1036+
1037+
'``llvm.nvvm.tcgen05.relinq.alloc.permit``'
1038+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1039+
1040+
Syntax:
1041+
"""""""
1042+
1043+
.. code-block:: llvm
1044+
1045+
declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
1046+
declare void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
1047+
1048+
Overview:
1049+
"""""""""
1050+
1051+
The '``@llvm.nvvm.tcgen05.relinq.alloc.permit.*``' intrinsics correspond
1052+
to the ``tcgen05.relinquish_alloc_permit.*`` set of PTX instructions.
1053+
This instruction specifies that the CTA of the executing thread is
1054+
relinquishing the right to allocate Tensor Memory. So, it is illegal
1055+
for a CTA to perform ``tcgen05.alloc`` after any of its constituent
1056+
threads execute ``tcgen05.relinquish_alloc_permit``. The ``.cg1``
1057+
and ``.cg2`` variants generate ``cta_group::1`` and ``cta_group::2``
1058+
flavors of the instruction respectively.
1059+
1060+
For more information, refer to the PTX ISA
1061+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_.
1062+
9651063
Other Intrinsics
9661064
----------------
9671065

llvm/include/llvm/IR/IntrinsicsNVVM.td

+30
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
5050
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
51+
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
5152

5253
//
5354
// MISC
@@ -5055,4 +5056,33 @@ def int_nvvm_cp_async_bulk_prefetch_L2
50555056
def int_nvvm_griddepcontrol_launch_dependents: Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
50565057
def int_nvvm_griddepcontrol_wait: Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
50575058

5059+
//
5060+
// Tcgen05 family of Intrinsics
5061+
//
5062+
5063+
// Tcgen05 alloc/dealloc related intrinsics
5064+
5065+
foreach cta_group = ["cg1", "cg2"] in {
5066+
def int_nvvm_tcgen05_alloc_ # cta_group : Intrinsic<[],
5067+
[llvm_ptr_ty, // dst_ptr
5068+
llvm_i32_ty] , // num_columns
5069+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5070+
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
5071+
5072+
def int_nvvm_tcgen05_alloc_shared_ # cta_group : Intrinsic<[],
5073+
[llvm_shared_ptr_ty, // dst_ptr
5074+
llvm_i32_ty], // num_columns
5075+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5076+
WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
5077+
5078+
def int_nvvm_tcgen05_dealloc_ # cta_group : Intrinsic<[],
5079+
[llvm_tmem_ptr_ty, // tmem_addr
5080+
llvm_i32_ty], // num_columns
5081+
[IntrConvergent, IntrArgMemOnly,
5082+
NoCapture<ArgIndex<0>>]>;
5083+
5084+
def int_nvvm_tcgen05_relinq_alloc_permit_ # cta_group : Intrinsic<[], [],
5085+
[IntrConvergent, IntrInaccessibleMemOnly]>;
5086+
}
5087+
50585088
} // let TargetPrefix = "nvvm"

llvm/include/llvm/Support/NVPTXAddrSpace.h

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ enum AddressSpace : unsigned {
2323
ADDRESS_SPACE_SHARED = 3,
2424
ADDRESS_SPACE_CONST = 4,
2525
ADDRESS_SPACE_LOCAL = 5,
26+
ADDRESS_SPACE_TENSOR = 6,
2627

2728
ADDRESS_SPACE_PARAM = 101,
2829
};

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

+1
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
163163
def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
164164
def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
165165
def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
166+
def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
166167

167168
def True : Predicate<"true">;
168169
def False : Predicate<"false">;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

+41
Original file line numberDiff line numberDiff line change
@@ -7582,3 +7582,44 @@ def GRIDDEPCONTROL_WAIT :
75827582
Requires<[hasSM<90>, hasPTX<78>]>;
75837583

75847584
def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
7585+
7586+
// Tcgen05 intrinsics
7587+
let isConvergent = true in {
7588+
7589+
multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> {
7590+
def NAME : NVPTXInst<(outs),
7591+
(ins rc:$dst, Int32Regs:$ncols),
7592+
!strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"),
7593+
[(Intr rc:$dst, Int32Regs:$ncols)]>,
7594+
Requires<[hasTcgen05Instructions]>;
7595+
}
7596+
7597+
defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>;
7598+
defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>;
7599+
7600+
defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
7601+
defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
7602+
7603+
defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
7604+
defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
7605+
7606+
multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
7607+
def NAME : NVPTXInst<(outs),
7608+
(ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
7609+
!strconcat("tcgen05.dealloc.cta_group::", num, ".sync.aligned.b32 $tmem_addr, $ncols;"),
7610+
[(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
7611+
Requires<[hasTcgen05Instructions]>;
7612+
}
7613+
defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1>;
7614+
defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
7615+
7616+
multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
7617+
def NAME : NVPTXInst<(outs), (ins),
7618+
!strconcat("tcgen05.relinquish_alloc_permit.cta_group::", num, ".sync.aligned;"),
7619+
[(Intr)]>,
7620+
Requires<[hasTcgen05Instructions]>;
7621+
}
7622+
defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
7623+
defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
7624+
7625+
} // isConvergent

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

+15
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,21 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
9393
bool hasDotInstructions() const {
9494
return SmVersion >= 61 && PTXVersion >= 50;
9595
}
96+
// Tcgen05 instructions in Blackwell family
97+
bool hasTcgen05Instructions() const {
98+
bool HasTcgen05 = false;
99+
switch (FullSmVersion) {
100+
default:
101+
break;
102+
case 1001: // sm_100a
103+
case 1011: // sm_101a
104+
HasTcgen05 = true;
105+
break;
106+
}
107+
108+
return HasTcgen05 && PTXVersion >= 86;
109+
}
110+
96111
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
97112
// terminates a basic block. Instead, it would assume that control flow
98113
// continued to the next instruction. The next instruction could be in the

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
140140
else if (UseShortPointers)
141141
Ret += "-p3:32:32-p4:32:32-p5:32:32";
142142

143+
// Tensor Memory (addrspace:6) is always 32-bits.
144+
Ret += "-p6:32:32";
145+
143146
Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
144147

145148
return Ret;

0 commit comments

Comments
 (0)