@@ -577,6 +577,133 @@ __nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
 }
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
 
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+__device__ inline unsigned __isCtaShared(const void *ptr) {
+  return __isShared(ptr);
+}
+
+__device__ inline unsigned __isClusterShared(const void *__ptr) {
+  return __nvvm_isspacep_shared_cluster(__ptr);
+}
+
+__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
+                                                   unsigned __rank) {
+  return __nvvm_mapa((void *)__ptr, __rank);
+}
+
+__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
+  return __nvvm_getctarank((void *)__ptr);
+}
+
+__device__ inline uint2
+__cluster_map_shared_multicast(const void *__ptr,
+                               unsigned int __cluster_cta_mask) {
+  return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
+                    __cluster_cta_mask);
+}
+
+__device__ inline unsigned __clusterDimIsSpecified() {
+  return __nvvm_is_explicit_cluster();
+}
+
+__device__ inline dim3 __clusterDim() {
+  return {__nvvm_read_ptx_sreg_cluster_nctaid_x(),
+          __nvvm_read_ptx_sreg_cluster_nctaid_y(),
+          __nvvm_read_ptx_sreg_cluster_nctaid_z()};
+}
+
+__device__ inline dim3 __clusterRelativeBlockIdx() {
+  return {__nvvm_read_ptx_sreg_cluster_ctaid_x(),
+          __nvvm_read_ptx_sreg_cluster_ctaid_y(),
+          __nvvm_read_ptx_sreg_cluster_ctaid_z()};
+}
+
+__device__ inline dim3 __clusterGridDimInClusters() {
+  return {__nvvm_read_ptx_sreg_nclusterid_x(),
+          __nvvm_read_ptx_sreg_nclusterid_y(),
+          __nvvm_read_ptx_sreg_nclusterid_z()};
+}
+
+__device__ inline dim3 __clusterIdx() {
+  return {__nvvm_read_ptx_sreg_clusterid_x(),
+          __nvvm_read_ptx_sreg_clusterid_y(),
+          __nvvm_read_ptx_sreg_clusterid_z()};
+}
+
+__device__ inline unsigned __clusterRelativeBlockRank() {
+  return __nvvm_read_ptx_sreg_cluster_ctarank();
+}
+
+__device__ inline unsigned __clusterSizeInBlocks() {
+  return __nvvm_read_ptx_sreg_cluster_nctarank();
+}
+
+__device__ inline void __cluster_barrier_arrive() {
+  __nvvm_barrier_cluster_arrive();
+}
+
+__device__ inline void __cluster_barrier_arrive_relaxed() {
+  __nvvm_barrier_cluster_arrive_relaxed();
+}
+
+__device__ inline void __cluster_barrier_wait() {
+  __nvvm_barrier_cluster_wait();
+}
+
+__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
+
+__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
+  float2 __ret;
+  __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
+          : "=f"(__ret.x), "=f"(__ret.y)
+          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
+  return __ret;
+}
+
+__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
+  float2 __ret;
+  __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
+          : "=f"(__ret.x), "=f"(__ret.y)
+          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
+  return __ret;
+}
+
+__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
+  float2 __ret;
+  __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
+          : "=f"(__ret.x), "=f"(__ret.y)
+          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
+  return __ret;
+}
+
+__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
+  float4 __ret;
+  __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
+          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
+          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
+  return __ret;
+}
+
+__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
+  float4 __ret;
+  __asm__(
+      "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
+      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
+      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
+  return __ret;
+}
+
+__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
+  float4 __ret;
+  __asm__(
+      "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
+      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
+      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
+      :);
+  return __ret;
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
 #endif // CUDA_VERSION >= 11000
 
 #endif // defined(__CLANG_CUDA_INTRINSICS_H__)
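A minimal usage sketch of the helpers this hunk adds (not part of the patch): it assumes clang targeting sm_90 and a CUDA 11.8+ runtime, and the kernel name cluster_sum and its parameters are hypothetical.

// Usage sketch only; assumes sm_90 (Hopper) so the new helpers are declared.
// Kernel and parameter names are illustrative, not from the patch.
#include <cuda_runtime.h>

__global__ void cluster_sum(float4 *acc, const float4 *data, unsigned n) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomicAdd(acc, data[i]); // new vector overload: a single atom.add.v4.f32

  // New cluster queries; with no explicit cluster launch the cluster is 1x1x1.
  dim3 cluster = __clusterDim();
  unsigned rank = __clusterRelativeBlockRank();
  (void)cluster;
  (void)rank;

  // Cluster-wide barrier across all blocks of the cluster; only meaningful
  // when the kernel was launched with an explicit cluster dimension (e.g. via
  // cudaLaunchKernelEx with cudaLaunchAttributeClusterDimension). The guard
  // is uniform across the grid, so the barrier stays non-divergent.
  if (__clusterDimIsSpecified()) {
    __cluster_barrier_arrive();
    __cluster_barrier_wait();
  }
}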