Skip to content

Commit 9c30bdf

Browse files
authored
Add pitched malloc and 2d memcpy to cust (#64)
* Update cust to 2021 edition * Add support for malloc pitch and 2d copy between host and device in cust
1 parent c263664 commit 9c30bdf

File tree

4 files changed

+226
-1
lines changed

4 files changed

+226
-1
lines changed

crates/cust/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Notable changes to this project will be documented in this file.
55
## Unreleased
66

77
- Add `memory::memcpy_dtoh` to allow copying from device to host.
8+
- Add support in `memory` for pitched malloc and 2D memcpy between device and host.
89

910
## 0.3.2 - 2/16/22
1011

crates/cust/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ authors = [
66
"Riccardo D'Ambrosio <[email protected]>",
77
"Brook Heisler <[email protected]>",
88
]
9-
edition = "2018"
9+
edition = "2021"
1010
license = "MIT OR Apache-2.0"
1111
description = "High level bindings to the CUDA Driver API"
1212
repository = "https://github.com/Rust-GPU/Rust-CUDA"

crates/cust/src/memory/malloc.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,60 @@ pub unsafe fn cuda_malloc_unified<T: DeviceCopy>(count: usize) -> CudaResult<Uni
148148
Ok(UnifiedPointer::wrap(ptr as *mut T))
149149
}
150150

151+
/// Unsafe wrapper around the `cuMemAllocPitch` function, which allocates device memory in two dimensions
152+
/// where rows are memory aligned to the containing datatype.
153+
///
154+
/// Returns a [`DevicePointer`](struct.DevicePointer.html), pointing to the allocated memory and
155+
/// a `usize` containing the row pitch in *bytes*. The memory is not cleared.
156+
///
157+
/// Note that `width` is in units of T; thus a `width` of 3 requests rows of `3 * size_of::<T>()` bytes.
158+
/// `height` is the number of rows to allocate.
159+
///
160+
/// Memory buffers allocated using `cuda_malloc_pitched` must be freed using [`cuda_free`](fn.cuda_free.html).
161+
///
162+
/// # Errors
163+
///
164+
/// If allocating memory fails, returns the CUDA error value.
165+
/// If the row width in bytes is zero (either because `width` is zero or because T is a
166+
/// zero-sized type), if `height` is zero, if the row width in bytes would overflow a usize, or if `size_of::<T>()` does not fit in a C `unsigned int`, returns `InvalidMemoryAllocation`.
167+
///
168+
/// # Safety
169+
///
170+
/// Since the allocated memory is not initialized, the caller must ensure that it is initialized
171+
/// before copying it to the host in any way. Additionally, the caller must ensure that the memory
172+
/// allocated is freed using cuda_free, or the memory will be leaked.
173+
///
174+
/// # Examples
175+
///
176+
/// ```
177+
/// # let _context = cust::quick_init().unwrap();
178+
/// # fn foo() -> Result<(), cust::error::CudaError> {
179+
/// use cust::memory::*;
180+
/// unsafe {
181+
/// // Allocate space for a 3x3 matrix of f32s
182+
/// let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
183+
/// cuda_free(device_buffer)?;
184+
/// }
185+
/// # Ok(())
186+
/// # }
187+
/// # foo().unwrap();
188+
/// ```
189+
pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(width: usize, height: usize) -> CudaResult<(DevicePointer<T>, usize)> {
190+
let element_size: std::os::raw::c_uint = std::mem::size_of::<T>()
191+
.try_into()
192+
.map_err(|_| CudaError::InvalidMemoryAllocation)?; // the element size is passed to the driver as a c_uint
193+
194+
let width_bytes = width.checked_mul(std::mem::size_of::<T>()).unwrap_or(0); // overflow collapses to 0 and is rejected below
195+
if width_bytes == 0 || height == 0 {
196+
return Err(CudaError::InvalidMemoryAllocation);
197+
}
198+
199+
let mut ptr = 0;
200+
let mut pitch = 0;
201+
cuda::cuMemAllocPitch_v2(&mut ptr, &mut pitch, width_bytes, height, element_size).to_result()?;
202+
Ok((DevicePointer::from_raw(ptr), pitch))
203+
}
204+
151205
/// Free memory allocated with [`cuda_malloc`](fn.cuda_malloc.html).
152206
///
153207
/// # Errors

crates/cust/src/memory/mod.rs

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,176 @@ pub unsafe fn memcpy_dtoh(
225225
Ok(())
226226
}
227227

228+
/// Copies a 2D region from host memory to device memory, similar to `cudaMemcpy2D` with the `HostToDevice` copy type.
229+
///
230+
/// `dpitch`/`spitch` are the number of bytes between the starts of two consecutive rows of the destination/source.
231+
/// `width` is the number of *elements* (not bytes) in a row.
232+
/// `height` is the total number of rows (not bytes).
233+
///
234+
/// # Examples
235+
///
236+
/// ```
237+
/// # let _context = cust::quick_init().unwrap();
238+
/// # fn foo() -> Result<(), cust::error::CudaError> {
239+
/// use cust::memory::*;
240+
/// unsafe {
241+
/// // Allocate space for a 3x3 matrix of f32s
242+
/// let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
243+
///
244+
/// let src_array: [f32; 9] = [
245+
/// 1.0, 2.0, 3.0,
246+
/// 4.0, 5.0, 6.0,
247+
/// 7.0, 8.0, 9.0];
248+
///
249+
/// memcpy_2d_htod(
250+
/// device_buffer,
251+
/// pitch,
252+
/// src_array.as_slice().as_ptr(),
253+
/// 3*std::mem::size_of::<f32>(),
254+
/// 3,
255+
/// 3
256+
/// )?;
257+
///
258+
/// let mut dst_array = [0.0f32; 9];
259+
///
260+
/// memcpy_2d_dtoh(
261+
/// dst_array.as_mut_slice().as_mut_ptr(),
262+
/// 3*std::mem::size_of::<f32>(),
263+
/// device_buffer,
264+
/// pitch,
265+
/// 3,
266+
/// 3
267+
/// )?;
268+
///
269+
/// assert_eq!(dst_array, src_array);
270+
/// cuda_free(device_buffer)?;
271+
/// }
272+
/// # Ok(())
273+
/// # }
274+
/// # foo().unwrap();
275+
/// ```
276+
#[allow(clippy::missing_safety_doc)]
277+
pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
278+
dst: DevicePointer<T>,
279+
dpitch: usize,
280+
src: *const T,
281+
spitch: usize,
282+
width: usize,
283+
height: usize,
284+
) -> CudaResult<()> {
285+
use cust_raw::CUmemorytype;
286+
287+
let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
288+
.ok_or(CudaError::InvalidMemoryAllocation)?; // row width in bytes overflowed usize
289+
290+
let pcopy = cust_raw::CUDA_MEMCPY2D_st {
291+
srcXInBytes: 0,
292+
srcY: 0,
293+
srcMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
294+
srcHost: src as *const c_void,
295+
srcDevice: 0, // Ignored
296+
srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
297+
srcPitch: spitch,
298+
dstXInBytes: 0,
299+
dstY: 0,
300+
dstMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
301+
dstHost: std::ptr::null_mut::<c_void>(), // Ignored
302+
dstDevice: dst.as_raw(),
303+
dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
304+
dstPitch: dpitch,
305+
WidthInBytes: width_in_bytes,
306+
Height: height,
307+
};
308+
309+
crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
310+
Ok(())
311+
}
312+
313+
/// Copies a 2D region from device memory to host memory, similar to `cudaMemcpy2D` with the `DeviceToHost` copy type.
314+
///
315+
/// `dpitch`/`spitch` are the number of bytes between the starts of two consecutive rows of the destination/source.
316+
/// `width` is the number of *elements* (not bytes) in a row.
317+
/// `height` is the total number of rows (not bytes).
318+
///
319+
/// # Examples
320+
///
321+
/// ```
322+
/// # let _context = cust::quick_init().unwrap();
323+
/// # fn foo() -> Result<(), cust::error::CudaError> {
324+
/// use cust::memory::*;
325+
/// unsafe {
326+
/// // Allocate space for a 3x3 matrix of f32s
327+
/// let (device_buffer, pitch) = cuda_malloc_pitched::<f32>(3, 3)?;
328+
///
329+
/// let src_array: [f32; 9] = [
330+
/// 1.0, 2.0, 3.0,
331+
/// 4.0, 5.0, 6.0,
332+
/// 7.0, 8.0, 9.0];
333+
///
334+
/// memcpy_2d_htod(
335+
/// device_buffer,
336+
/// pitch,
337+
/// src_array.as_slice().as_ptr(),
338+
/// 3*std::mem::size_of::<f32>(),
339+
/// 3,
340+
/// 3
341+
/// )?;
342+
///
343+
/// let mut dst_array = [0.0f32; 9];
344+
///
345+
/// memcpy_2d_dtoh(
346+
/// dst_array.as_mut_slice().as_mut_ptr(),
347+
/// 3*std::mem::size_of::<f32>(),
348+
/// device_buffer,
349+
/// pitch,
350+
/// 3,
351+
/// 3
352+
/// )?;
353+
///
354+
/// assert_eq!(dst_array, src_array);
355+
/// cuda_free(device_buffer)?;
356+
/// }
357+
/// # Ok(())
358+
/// # }
359+
/// # foo().unwrap();
360+
/// ```
361+
#[allow(clippy::missing_safety_doc)]
362+
pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
363+
dst: *mut T,
364+
dpitch: usize,
365+
src: DevicePointer<T>,
366+
spitch: usize,
367+
width: usize,
368+
height: usize,
369+
) -> CudaResult<()> {
370+
use cust_raw::CUmemorytype;
371+
372+
let width_in_bytes = width.checked_mul(std::mem::size_of::<T>())
373+
.ok_or(CudaError::InvalidMemoryAllocation)?; // row width in bytes overflowed usize
374+
375+
let pcopy = cust_raw::CUDA_MEMCPY2D_st {
376+
srcXInBytes: 0,
377+
srcY: 0,
378+
srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
379+
srcHost: std::ptr::null_mut::<c_void>(), // Ignored
380+
srcDevice: src.as_raw(),
381+
srcArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
382+
srcPitch: spitch,
383+
dstXInBytes: 0,
384+
dstY: 0,
385+
dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
386+
dstHost: dst as *mut c_void,
387+
dstDevice: 0, // Ignored
388+
dstArray: std::ptr::null_mut::<cust_raw::CUarray_st>(), // Ignored
389+
dstPitch: dpitch,
390+
WidthInBytes: width_in_bytes,
391+
Height: height,
392+
};
393+
394+
crate::sys::cuMemcpy2D_v2(&pcopy).to_result()?;
395+
Ok(())
396+
}
397+
228398
/// Get the current free and total memory.
229399
///
230400
/// Returns in `.1` the total amount of memory available to the current context.

0 commit comments

Comments
 (0)