
Commit d20e023

Feat: Add attention forward pass and bump cust to 0.3.2 (#53)
* Feat: Simplify RNN traits
* Feat: Add attention forward pass
1 parent 0d6cd23 commit d20e023

File tree

12 files changed: +584 -71 lines changed


crates/cudnn/Cargo.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,4 +6,4 @@ version = "0.1.0"
 
 [dependencies]
 bitflags = "1.3.2"
-cust = {version = "0.3.0", path = "../cust"}
+cust = {version = "0.3.2", path = "../cust"}
```

crates/cudnn/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,2 +1,2 @@
-# cudnn-rs
+# cudnn
 Type safe cuDNN wrapper for the Rust programming language.
```

crates/cudnn/src/attention/attention_descriptor.rs

Lines changed: 192 additions & 0 deletions
```rust
use crate::{sys, CudnnError, DataType, DropoutDescriptor, IntoResult, MathType, SeqDataType};
use cust::memory::GpuBuffer;
use std::{marker::PhantomData, mem::MaybeUninit};

bitflags::bitflags! {
    /// Miscellaneous switches for configuring auxiliary multi-head attention features.
    pub struct AttnModeFlags: u32 {
        /// Forward declaration of the mapping between Q, K and V vectors when the beam size is
        /// greater than one in the Q input. Multiple Q vectors from the same beam bundle map to
        /// the **same** K, V vectors. This means that the beam size in the K, V sets is equal
        /// to 1.
        const CUDNN_ATTN_QUERYMAP_ALL_TO_ONE = 0;
        /// Forward declaration of the mapping between Q, K and V vectors when the beam size is
        /// greater than one in the Q input. Multiple Q vectors from the same beam bundle map to
        /// **different** K, V vectors. This requires the beam size in the K, V sets to be the
        /// same as that in the Q input.
        const CUDNN_ATTN_QUERYMAP_ONE_TO_ONE = 1;
        /// Use no biases in the attention input and output projections.
        const CUDNN_ATTN_DISABLE_PROJ_BIASES = 0;
        /// Use extra biases in the attention input and output projections.
        const CUDNN_ATTN_ENABLE_PROJ_BIASES = 2;
    }
}

/// A multi-head attention descriptor.
pub struct AttentionDescriptor<T, U, D1, D2>
where
    T: SeqDataType,
    U: SupportedAttn<T>,
    D1: GpuBuffer<u8>,
    D2: GpuBuffer<u8>,
{
    pub(crate) raw: sys::cudnnAttnDescriptor_t,
    data_type: PhantomData<T>,
    math_prec: PhantomData<U>,
    attn_dropout_desc: DropoutDescriptor<D1>,
    post_dropout_desc: DropoutDescriptor<D2>,
}

impl<T, U, D1, D2> AttentionDescriptor<T, U, D1, D2>
where
    T: SeqDataType,
    U: SupportedAttn<T>,
    D1: GpuBuffer<u8>,
    D2: GpuBuffer<u8>,
{
    /// Creates a new multi-head attention descriptor.
    ///
    /// # Arguments
    ///
    /// * `mode` - bit flags enabling various attention options that do not require additional
    /// numerical values.
    ///
    /// * `n_heads` - number of attention heads.
    ///
    /// * `sm_scaler` - softmax sharpening/smoothing coefficient. Must be positive.
    ///
    /// * `math_type` - NVIDIA Tensor Cores setting.
    ///
    /// * `attn_dropout_desc` - descriptor of the dropout operation applied to the softmax output.
    ///
    /// * `post_dropout_desc` - descriptor of the dropout operation applied to the multi-head
    /// attention output, just before the point where residual connections are added.
    ///
    /// * `q_size` - Q vectors length.
    ///
    /// * `k_size` - K vectors length.
    ///
    /// * `v_size` - V vectors length.
    ///
    /// * `q_proj_size` - Q vectors length after the input projection.
    ///
    /// * `k_proj_size` - K vectors length after the input projection.
    ///
    /// * `v_proj_size` - V vectors length after the input projection.
    ///
    /// * `o_proj_size` - H vectors length after the output projection.
    ///
    /// * `qo_max_seq_length` - largest sequence length expected in sequence data descriptors
    /// related to Q, O, dQ and dO inputs and outputs.
    ///
    /// * `kv_max_seq_length` - largest sequence length expected in sequence data descriptors
    /// related to K, V, dK and dV inputs and outputs.
    ///
    /// * `max_batch_size` - largest batch size expected in any sequence data descriptor.
    ///
    /// * `max_beam_size` - largest beam size expected in any sequence data descriptor.
    ///
    /// # Errors
    ///
    /// Returns errors if an unsupported combination of arguments is detected. Some examples
    /// include:
    ///
    /// * the post-projection sizes of Q and K are not equal.
    ///
    /// * the math type is not supported.
    ///
    /// * one or more of the following arguments were either negative or zero: `n_heads`,
    /// `q_size`, `k_size`, `v_size`, `qo_max_seq_length`, `kv_max_seq_length`, `max_batch_size`
    /// and `max_beam_size`.
    ///
    /// * one or more of the following arguments were negative: `q_proj_size`, `k_proj_size`,
    /// `v_proj_size`, `sm_scaler`.
    pub fn new(
        mode: AttnModeFlags,
        n_heads: i32,
        sm_scaler: f64,
        math_type: MathType,
        attn_dropout_desc: DropoutDescriptor<D1>,
        post_dropout_desc: DropoutDescriptor<D2>,
        q_size: i32,
        k_size: i32,
        v_size: i32,
        q_proj_size: impl Into<Option<i32>>,
        k_proj_size: impl Into<Option<i32>>,
        v_proj_size: impl Into<Option<i32>>,
        o_proj_size: impl Into<Option<i32>>,
        qo_max_seq_length: i32,
        kv_max_seq_length: i32,
        max_batch_size: i32,
        max_beam_size: i32,
    ) -> Result<Self, CudnnError> {
        let mut raw = MaybeUninit::uninit();

        unsafe {
            sys::cudnnCreateAttnDescriptor(raw.as_mut_ptr()).into_result()?;

            let raw = raw.assume_init();

            sys::cudnnSetAttnDescriptor(
                raw,
                mode.bits(),
                n_heads,
                sm_scaler,
                T::into_raw(),
                U::into_raw(),
                math_type.into(),
                attn_dropout_desc.raw,
                post_dropout_desc.raw,
                q_size,
                k_size,
                v_size,
                // A projection size of 0 tells cuDNN to disable that projection.
                q_proj_size.into().unwrap_or(0),
                k_proj_size.into().unwrap_or(0),
                v_proj_size.into().unwrap_or(0),
                o_proj_size.into().unwrap_or(0),
                qo_max_seq_length,
                kv_max_seq_length,
                max_batch_size,
                max_beam_size,
            )
            .into_result()?;

            Ok(Self {
                raw,
                data_type: PhantomData,
                math_prec: PhantomData,
                attn_dropout_desc,
                post_dropout_desc,
            })
        }
    }
}

impl<T, U, D1, D2> Drop for AttentionDescriptor<T, U, D1, D2>
where
    T: SeqDataType,
    U: SupportedAttn<T>,
    D1: GpuBuffer<u8>,
    D2: GpuBuffer<u8>,
{
    fn drop(&mut self) {
        unsafe {
            sys::cudnnDestroyAttnDescriptor(self.raw);
        }
    }
}

/// Controls the compute math precision in the multi-head attention. The following applies:
///
/// * For input and output in `f32`, the math precision of the layer can only be `f32`.
///
/// * For input and output in `f64`, the math precision of the layer can only be `f64`.
pub trait SupportedAttn<T>
where
    Self: DataType,
    T: SeqDataType,
{
}

impl SupportedAttn<f32> for f32 {}
impl SupportedAttn<f64> for f64 {}
```

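To make the shape of this constructor concrete, here is a minimal, hypothetical sketch of building a descriptor. It assumes the crate is used as `cudnn`, that the `MathType` value and the two `DropoutDescriptor`s are created elsewhere (their constructors are not part of this diff), and every size below is an arbitrary example value:

```rust
use cudnn::{AttentionDescriptor, AttnModeFlags, CudnnError, DropoutDescriptor, MathType};
use cust::memory::DeviceBuffer;

/// Illustrative only: a 4-head f32 attention descriptor with no input/output
/// projections and no projection biases.
fn build_attn_desc(
    math_type: MathType,
    attn_dropout_desc: DropoutDescriptor<DeviceBuffer<u8>>,
    post_dropout_desc: DropoutDescriptor<DeviceBuffer<u8>>,
) -> Result<AttentionDescriptor<f32, f32, DeviceBuffer<u8>, DeviceBuffer<u8>>, CudnnError> {
    AttentionDescriptor::new(
        // Both flags are zero-valued, so ORing them just documents the defaults.
        AttnModeFlags::CUDNN_ATTN_QUERYMAP_ALL_TO_ONE
            | AttnModeFlags::CUDNN_ATTN_DISABLE_PROJ_BIASES,
        4,   // n_heads
        1.0, // sm_scaler: neither sharpen nor smooth the softmax
        math_type,
        attn_dropout_desc,
        post_dropout_desc,
        64,   // q_size
        64,   // k_size: matches q_size here, since the input projections are disabled
        64,   // v_size
        None, // q_proj_size: 0, i.e. no Q input projection
        None, // k_proj_size
        None, // v_proj_size
        None, // o_proj_size: no output projection
        32,   // qo_max_seq_length
        32,   // kv_max_seq_length
        16,   // max_batch_size
        1,    // max_beam_size
    )
}
```
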
crates/cudnn/src/attention/mod.rs

Lines changed: 188 additions & 0 deletions
```rust
mod attention_descriptor;
mod seq_data_axis;
mod seq_data_descriptor;

pub use attention_descriptor::*;
pub use seq_data_axis::*;
pub use seq_data_descriptor::*;

use crate::{sys, CudnnContext, CudnnError, DataType, IntoResult};
use cust::memory::GpuBuffer;
use std::mem::MaybeUninit;

impl CudnnContext {
    /// Computes the weight, work, and reserve space buffer sizes used by the following
    /// functions:
    ///
    /// * `multi_head_attn_forward()`
    ///
    /// * `multi_head_attn_backward_data()`
    ///
    /// * `multi_head_attn_backward_weights()`
    ///
    /// # Arguments
    ///
    /// * `desc` - multi-head attention descriptor.
    ///
    /// # Errors
    ///
    /// Returns errors if invalid arguments are detected.
    pub fn get_attn_buffers_size<T, U, D1, D2>(
        &self,
        desc: &AttentionDescriptor<T, U, D1, D2>,
    ) -> Result<(usize, usize, usize), CudnnError>
    where
        T: SeqDataType,
        U: SupportedAttn<T>,
        D1: GpuBuffer<u8>,
        D2: GpuBuffer<u8>,
    {
        let mut weight_space_size = MaybeUninit::uninit();
        let mut work_space_size = MaybeUninit::uninit();
        let mut reserve_space_size = MaybeUninit::uninit();

        unsafe {
            sys::cudnnGetMultiHeadAttnBuffers(
                self.raw,
                desc.raw,
                weight_space_size.as_mut_ptr(),
                work_space_size.as_mut_ptr(),
                reserve_space_size.as_mut_ptr(),
            )
            .into_result()?;

            Ok((
                weight_space_size.assume_init(),
                work_space_size.assume_init(),
                reserve_space_size.assume_init(),
            ))
        }
    }
    // (impl continues below)
```
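Before the `impl` block continues with the forward pass itself, a hedged sketch of how these sizes might be consumed. It assumes a `CudnnContext` named `ctx`, the hypothetical `attn_desc` from the earlier example, `f32` data, and an enclosing function returning `Result<(), Box<dyn std::error::Error>>` so `?` can mix cust and cudnn errors:

```rust
use cust::memory::DeviceBuffer;

// The three sizes are reported in bytes.
let (weight_size, work_size, reserve_size) = ctx.get_attn_buffers_size(&attn_desc)?;

// `multi_head_attn_forward` below takes buffers typed like the data (`f32` here),
// so convert byte counts into element counts before allocating.
let elems = |bytes: usize| bytes / std::mem::size_of::<f32>();
let weights = unsafe { DeviceBuffer::<f32>::zeroed(elems(weight_size))? };
let mut work_space = unsafe { DeviceBuffer::<f32>::zeroed(elems(work_size))? };
// The reserve space is only needed for training; inference passes `None` instead.
let mut reserve_space = unsafe { DeviceBuffer::<f32>::zeroed(elems(reserve_size))? };
```

The `impl` block then continues with the forward pass: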
```rust
    /// Computes the forward response of a multi-head attention layer.
    ///
    /// When `reserve_space` is `None` the function operates in inference mode, in which the
    /// backward functions are not invoked, otherwise training mode is assumed.
    ///
    /// # Arguments
    ///
    /// * `attn_desc` - multi-head attention descriptor.
    ///
    /// * `current_idx` - time-step in the queries to process. When this argument is negative,
    /// all Q time-steps are processed. When `current_idx` is zero or positive, the forward
    /// response is computed for the selected time-step only.
    ///
    /// * `lo_win_idx` - integer array specifying the start indices of the attention window for
    /// each Q time-step. The start index in the K, V sets is inclusive.
    ///
    /// * `hi_win_idx` - integer array specifying the end indices of the attention window for
    /// each Q time-step. The end index is exclusive.
    ///
    /// * `device_seq_lengths_qo` - device array specifying the sequence lengths of the query,
    /// residual, and output sequence data.
    ///
    /// * `device_seq_lengths_kv` - device array specifying the sequence lengths of the key and
    /// value input data.
    ///
    /// * `q_desc` - descriptor for the query and residual sequence data.
    ///
    /// * `queries` - queries data in device memory.
    ///
    /// * `residuals` - residuals data in device memory. Set this argument to `None` if no
    /// residual connections are required.
    ///
    /// * `k_desc` - descriptor for the keys sequence data.
    ///
    /// * `keys` - keys data in device memory.
    ///
    /// * `v_desc` - descriptor for the values sequence data.
    ///
    /// * `values` - values data in device memory.
    ///
    /// * `o_desc` - descriptor for the output sequence data.
    ///
    /// * `out` - output data in device memory.
    ///
    /// * `weights` - weight buffer in device memory.
    ///
    /// * `work_space` - work space buffer in device memory.
    ///
    /// * `reserve_space` - reserve space buffer in device memory. This argument should be
    /// `None` in inference mode.
    pub fn multi_head_attn_forward<T, U, D1, D2>(
        &self,
        attn_desc: &AttentionDescriptor<T, U, D1, D2>,
        current_idx: i32,
        lo_win_idx: &[i32],
        hi_win_idx: &[i32],
        device_seq_lengths_qo: &impl GpuBuffer<i32>,
        device_seq_lengths_kv: &impl GpuBuffer<i32>,
        q_desc: &SeqDataDescriptor<T>,
        queries: &impl GpuBuffer<T>,
        residuals: Option<&impl GpuBuffer<T>>,
        k_desc: &SeqDataDescriptor<T>,
        keys: &impl GpuBuffer<T>,
        v_desc: &SeqDataDescriptor<T>,
        values: &impl GpuBuffer<T>,
        o_desc: &SeqDataDescriptor<T>,
        out: &mut impl GpuBuffer<T>,
        weights: &impl GpuBuffer<T>,
        work_space: &mut impl GpuBuffer<T>,
        reserve_space: Option<&mut impl GpuBuffer<T>>,
    ) -> Result<(), CudnnError>
    where
        T: SeqDataType,
        U: SupportedAttn<T>,
        D1: GpuBuffer<u8>,
        D2: GpuBuffer<u8>,
    {
        let device_seq_lengths_qo_ptr = device_seq_lengths_qo.as_device_ptr().as_ptr() as *const _;
        let device_seq_lengths_kv_ptr = device_seq_lengths_kv.as_device_ptr().as_ptr() as *const _;

        let queries_ptr = queries.as_device_ptr().as_ptr() as *const _;
        // A null pointer tells cuDNN to skip the residual connections.
        let residuals_ptr = residuals.map_or(std::ptr::null(), |buff| {
            buff.as_device_ptr().as_ptr() as *const _
        });
        let keys_ptr = keys.as_device_ptr().as_ptr() as *const _;
        let values_ptr = values.as_device_ptr().as_ptr() as *const _;
        let out_ptr = out.as_device_ptr().as_mut_ptr() as *mut _;

        let weights_ptr = weights.as_device_ptr().as_ptr() as *const _;
        let work_space_ptr = work_space.as_device_ptr().as_mut_ptr() as *mut _;

        // Sizes are passed to cuDNN in bytes. Inference passes a null reserve space
        // pointer with zero size; training passes the whole reserve buffer.
        let (reserve_space_ptr, reserve_space_size) =
            reserve_space.map_or((std::ptr::null_mut(), 0), |buff| {
                (
                    buff.as_device_ptr().as_mut_ptr() as *mut _,
                    buff.len() * std::mem::size_of::<T>(),
                )
            });

        unsafe {
            sys::cudnnMultiHeadAttnForward(
                self.raw,
                attn_desc.raw,
                current_idx,
                lo_win_idx.as_ptr(),
                hi_win_idx.as_ptr(),
                device_seq_lengths_qo_ptr,
                device_seq_lengths_kv_ptr,
                q_desc.raw,
                queries_ptr,
                residuals_ptr,
                k_desc.raw,
                keys_ptr,
                v_desc.raw,
                values_ptr,
                o_desc.raw,
                out_ptr,
                weights.len() * std::mem::size_of::<T>(),
                weights_ptr,
                work_space.len() * std::mem::size_of::<T>(),
                work_space_ptr,
                reserve_space_size,
                reserve_space_ptr,
            )
            .into_result()?;

            Ok(())
        }
    }
}
```

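Continuing the same hypothetical sketch, a forward pass in training mode. It assumes `queries`, `keys`, `values` and `out` are `DeviceBuffer<f32>`s shaped to match their sequence data descriptors (`q_desc`, `k_desc`, `v_desc`, `o_desc`, built via the `SeqDataDescriptor` API from `seq_data_descriptor.rs`, which this commit also adds but is not shown here), with batch size 16 and sequence length 32 as in the example descriptor:

```rust
// One attention window per Q time-step: each window spans all 32 K, V time-steps.
let lo_win_idx = vec![0i32; 32];
let hi_win_idx = vec![32i32; 32];

// Per-sequence lengths for the whole batch, uploaded to the device as i32 arrays.
let seq_lengths_qo = DeviceBuffer::from_slice(&[32i32; 16])?;
let seq_lengths_kv = DeviceBuffer::from_slice(&[32i32; 16])?;

ctx.multi_head_attn_forward(
    &attn_desc,
    -1, // negative index: process every Q time-step
    &lo_win_idx,
    &hi_win_idx,
    &seq_lengths_qo,
    &seq_lengths_kv,
    &q_desc,
    &queries,
    None::<&DeviceBuffer<f32>>, // no residual connections
    &k_desc,
    &keys,
    &v_desc,
    &values,
    &o_desc,
    &mut out,
    &weights,
    &mut work_space,
    Some(&mut reserve_space), // training mode; use `None` for inference
)?;
```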