13 #ifndef SFCSORTBENCH_CUDAUTILS_H 14 #define SFCSORTBENCH_CUDAUTILS_H 16 #include "cuda_runtime.h" 21 #define CUDA_CHECK_ERROR() { \ 22 cudaError_t e=cudaGetLastError(); \ 23 if(e!=cudaSuccess) { \ 24 printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ 114 template <
typename T>
116 const cudaStream_t stream);
120 void copyArrayToHostAsync(T* host_ptr,
const T*__deviceptr,
unsigned int numElems,
const cudaStream_t stream);
128 template <
typename T>
137 void computeDendroBlockToGPUMap(
const ot::Block* blkList,
unsigned int numBlocks,
unsigned int*& blockMap,dim3 & gridDim);
147 void copyArrayToHost(T* host_ptr,
const T* __device_ptr,
unsigned int numElems);
158 void copy2DArrayToHost(T** host_ptr,
const T** __device_ptr,
unsigned int sz1,
unsigned int sz2);
178 cudaMalloc(&__devicePtr,
sizeof(T)*numElems);
181 cudaMemcpy(__devicePtr,in,
sizeof(T)*numElems,cudaMemcpyHostToDevice);
194 cudaMalloc(&__devicePtr,
sizeof(T));
197 cudaMemcpy(__devicePtr,in,
sizeof(T),cudaMemcpyHostToDevice);
205 template <
typename T>
209 cudaMalloc(&__tmp1d,
sizeof(T)*sz1);
216 template <
typename T>
220 cudaMalloc(&__tmp2d,
sizeof(T*)*sz1);
225 for(
unsigned int i=0;i<sz1;i++)
227 cudaMalloc(&hostPtr[i],
sizeof(T)*sz2);
231 cudaMemcpy(__tmp2d,hostPtr,
sizeof(T*)*sz1,cudaMemcpyHostToDevice);
238 template <
typename T>
243 cudaMalloc(&__tmp2d,
sizeof(T*)*sz1);
246 T** tmp2D=
new T*[sz1];
248 for(
unsigned int i=0;i<sz1;i++)
250 cudaMalloc(&tmp2D[i],
sizeof(T)*sz2);
254 cudaMemcpy(__tmp2d,tmp2D,
sizeof(T*)*sz1,cudaMemcpyHostToDevice);
262 template <
typename T>
266 cudaMalloc(&__tmp2d,
sizeof(T*)*sz1);
269 T** tmp2D=
new T*[sz1];
271 for(
unsigned int i=0;i<sz1;i++)
273 cudaMalloc(&tmp2D[i],
sizeof(T)*sz2);
275 cudaMemcpy(tmp2D[i],in[i],
sizeof(T)*sz2 ,cudaMemcpyHostToDevice);
279 cudaMemcpy(__tmp2d,tmp2D,
sizeof(T*)*sz1,cudaMemcpyHostToDevice);
287 template <
typename T>
290 T** tmp2D=
new T*[sz1];
292 cudaMemcpy(tmp2D,__array2D,
sizeof(T*)*sz1,cudaMemcpyDeviceToHost);
295 for(
unsigned int i=0;i<sz1;i++)
311 cudaMemcpyAsync(__deviceptr,in,
sizeof(T)*numElems,cudaMemcpyHostToDevice,stream);
318 void copyArrayToHostAsync(T* host_ptr,
const T*__deviceptr,
unsigned int numElems,
const cudaStream_t stream)
320 cudaMemcpyAsync(host_ptr,__deviceptr,
sizeof(T)*numElems,cudaMemcpyDeviceToHost,stream);
328 cudaMemcpy(host_ptr,__device_ptr,
sizeof(T)*numElems,cudaMemcpyDeviceToHost);
338 T** tmp2D=
new T*[sz1];
339 cudaMemcpy(tmp2D,__device_ptr,
sizeof(T*)*sz1,cudaMemcpyDeviceToHost);
342 for(
unsigned int i=0;i<sz1;i++)
344 cudaMemcpy(host_ptr[i],tmp2D[i],
sizeof(T)*sz2,cudaMemcpyDeviceToHost);
357 #endif //SFCSORTBENCH_CUDAUTILS_H T * copyArrayToDevice(const T *in, unsigned int numElems)
Definition: cudaUtils.h:174
void copy2DCudaArrayToDeviceAsync(const T **in, T **__devicePtr, unsigned int sz1, unsigned int sz2, const cudaStream_t stream)
allocates a 2D CUDA array on the device and copies the data to it.
void dealloc2DCudaArray(T **&__array2D, unsigned int sz1)
deallocates the 2D cuda array.
Definition: cudaUtils.h:288
Contains host-side utility functions related to GPUs.
Definition: block_cu.h:22
void copyArrayToDeviceAsync(const T *in, T *__deviceptr, unsigned int numElems, const cudaStream_t stream)
Definition: cudaUtils.h:309
T ** alloc2DCudaArray(unsigned int sz1, unsigned int sz2)
allocates a 2D cuda array on the device.
Definition: cudaUtils.h:239
cudaDeviceProp * getGPUDeviceInfo(unsigned int device)
get the device information (cudaDeviceProp) of the specified GPU device
Definition: cudaUtils.cpp:15
T * alloc1DCudaArray(unsigned int sz1)
Definition: cudaUtils.h:206
T * copyValueToDevice(const T *in)
Definition: cudaUtils.h:190
void copy2DArrayToHost(T **host_ptr, const T **__device_ptr, unsigned int sz1, unsigned int sz2)
Definition: cudaUtils.h:336
void copyArrayToHost(T *host_ptr, const T *__device_ptr, unsigned int numElems)
Definition: cudaUtils.h:326