Dendro 5.01
Dendro in Greek means tree. The Dendro library is a large-scale, distributed-memory adaptive octree framework that has scaled to 262K cores on ORNL's Titan. The main goal of Dendro is to perform large-scale multiphysics simulations efficiently on modern supercomputers. Dendro provides efficient parallel data structures and algorithms for variational (finite element) and finite difference methods on 2:1-balanced, arbitrarily adaptive octrees, enabling simulations ranging from binary black hole mergers to blood flow in the human body, with applications spanning relativity, astrophysics, and biomedical engineering.
cudaUtils.h
//
// Created by milinda on 8/9/18.
//

#ifndef SFCSORTBENCH_CUDAUTILS_H
#define SFCSORTBENCH_CUDAUTILS_H

#include "cuda_runtime.h"
#include "block.h"

#include <cstdio>
#include <cstdlib>


// Macro for checking CUDA errors following a kernel launch or API call.
#define CUDA_CHECK_ERROR() { \
    cudaError_t e = cudaGetLastError(); \
    if (e != cudaSuccess) { \
        printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
        exit(1); \
    } \
}
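
// Hypothetical usage sketch (kernel name and launch configuration are placeholders):
//   someKernel<<<grid, block>>>(...);
//   CUDA_CHECK_ERROR();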


/** @brief Contains utility functions for the host related to GPUs. */
namespace cuda
{

    /** @brief Sends the device information for the given GPU to the device. */
    cudaDeviceProp* getGPUDeviceInfo(unsigned int device);

    /**
     * @brief Allocates a device array and copies the given host array into it.
     * @param[in] in: host array
     * @param[in] numElems: number of elements
     * @return pointer to the allocated device array
     */
    template<typename T>
    T* copyArrayToDevice(const T* in, unsigned int numElems);

    /**
     * @brief Allocates a single value on the device and copies the given host value into it.
     * @param[in] in: pointer to the host value
     * @return pointer to the allocated device value
     */
    template<typename T>
    inline T* copyValueToDevice(const T* in);

    /**
     * @brief Allocates a 1D array on the device.
     * @param[in] sz1: number of elements
     */
    template<typename T>
    T* alloc1DCudaArray(unsigned int sz1);

    /**
     * @brief Allocates a 2D cuda array on the device.
     * @param[in] sz1: size of dimension 1
     * @param[in] sz2: size of dimension 2
     */
    template<typename T>
    T** alloc2DCudaArray(unsigned int sz1, unsigned int sz2);

    /**
     * @brief Allocates a 2D cuda array on the device and returns the host copy of the row pointers.
     * @param[out] hostPtr: host copy of the device row pointers
     * @param[in] sz1: size of dimension 1
     * @param[in] sz2: size of dimension 2
     */
    template<typename T>
    T** alloc2DCudaArray(T**& hostPtr, unsigned int sz1, unsigned int sz2);

    /**
     * @brief Allocates a 2D cuda array on the device and copies the data.
     * @param[in] in: host 2D array to copy
     * @param[in] sz1: size of dimension 1
     * @param[in] sz2: size of dimension 2
     */
    template<typename T>
    T** alloc2DCudaArray(const T** in, unsigned int sz1, unsigned int sz2);

    /** @brief Asynchronously copies a host array to a previously allocated device array. */
    template<typename T>
    void copyArrayToDeviceAsync(const T* in, T* __deviceptr, unsigned int numElems, const cudaStream_t stream);

    /** @brief Asynchronously copies a host 2D array to a previously allocated device 2D array. */
    template<typename T>
    void copy2DCudaArrayToDeviceAsync(const T** in, T** __devicePtr, unsigned int sz1, unsigned int sz2,
                                      const cudaStream_t stream);

    /** @brief Asynchronously copies a device array back to the host. */
    template<typename T>
    void copyArrayToHostAsync(T* host_ptr, const T* __deviceptr, unsigned int numElems, const cudaStream_t stream);

    /** @brief Deallocates the 2D cuda array. */
    template<typename T>
    void dealloc2DCudaArray(T**& __array2D, unsigned int sz1);

    /**
     * @brief Computes the mapping of Dendro data blocks (octree blocks resulting from unzip) to GPU blocks.
     * @param[in] blkList: list of dendro data blocks
     * @param[in] numBlocks: number of data blocks
     * @param[out] blockMap: (blockMap[2*blockIdx.x], blockMap[2*blockIdx.x+1]) give the begin and end of the data blocks processed by each GPU block
     * @param[out] gridDim: computed CUDA grid dimensions
     */
    void computeDendroBlockToGPUMap(const ot::Block* blkList, unsigned int numBlocks, unsigned int*& blockMap, dim3& gridDim);

    /** @brief Copies a device array back to the host. */
    template<typename T>
    void copyArrayToHost(T* host_ptr, const T* __device_ptr, unsigned int numElems);

    /** @brief Copies a device 2D array back to the host. */
    template<typename T>
    void copy2DArrayToHost(T** host_ptr, const T** __device_ptr, unsigned int sz1, unsigned int sz2);

}


// templated functions

namespace cuda
{

    template<typename T>
    T* copyArrayToDevice(const T* in, unsigned int numElems)
    {
        T* __devicePtr;
        cudaMalloc(&__devicePtr, sizeof(T)*numElems);
        CUDA_CHECK_ERROR();

        cudaMemcpy(__devicePtr, in, sizeof(T)*numElems, cudaMemcpyHostToDevice);
        CUDA_CHECK_ERROR();

        return __devicePtr;
    }


    template<typename T>
    inline T* copyValueToDevice(const T* in)
    {
        T* __devicePtr;
        cudaMalloc(&__devicePtr, sizeof(T));
        CUDA_CHECK_ERROR();

        cudaMemcpy(__devicePtr, in, sizeof(T), cudaMemcpyHostToDevice);
        CUDA_CHECK_ERROR();

        return __devicePtr;
    }


    template<typename T>
    T* alloc1DCudaArray(unsigned int sz1)
    {
        T* __tmp1d;
        cudaMalloc(&__tmp1d, sizeof(T)*sz1);
        CUDA_CHECK_ERROR();

        return __tmp1d;
    }


    template<typename T>
    T** alloc2DCudaArray(T**& hostPtr, unsigned int sz1, unsigned int sz2)
    {
        // Device array of row pointers.
        T** __tmp2d;
        cudaMalloc(&__tmp2d, sizeof(T*)*sz1);
        CUDA_CHECK_ERROR();

        // Allocate each row on the device; keep the row pointers on the host.
        hostPtr = new T*[sz1];
        for (unsigned int i = 0; i < sz1; i++)
        {
            cudaMalloc(&hostPtr[i], sizeof(T)*sz2);
            CUDA_CHECK_ERROR();
        }

        cudaMemcpy(__tmp2d, hostPtr, sizeof(T*)*sz1, cudaMemcpyHostToDevice);
        CUDA_CHECK_ERROR();

        return __tmp2d;
    }


    template<typename T>
    T** alloc2DCudaArray(unsigned int sz1, unsigned int sz2)
    {
        T** __tmp2d;
        cudaMalloc(&__tmp2d, sizeof(T*)*sz1);
        CUDA_CHECK_ERROR();

        T** tmp2D = new T*[sz1];
        for (unsigned int i = 0; i < sz1; i++)
        {
            cudaMalloc(&tmp2D[i], sizeof(T)*sz2);
            CUDA_CHECK_ERROR();
        }

        cudaMemcpy(__tmp2d, tmp2D, sizeof(T*)*sz1, cudaMemcpyHostToDevice);
        CUDA_CHECK_ERROR();
        delete [] tmp2D;

        return __tmp2d;
    }


    template<typename T>
    T** alloc2DCudaArray(const T** in, unsigned int sz1, unsigned int sz2)
    {
        T** __tmp2d;
        cudaMalloc(&__tmp2d, sizeof(T*)*sz1);
        CUDA_CHECK_ERROR();

        T** tmp2D = new T*[sz1];
        for (unsigned int i = 0; i < sz1; i++)
        {
            cudaMalloc(&tmp2D[i], sizeof(T)*sz2);
            CUDA_CHECK_ERROR();
            cudaMemcpy(tmp2D[i], in[i], sizeof(T)*sz2, cudaMemcpyHostToDevice);
            CUDA_CHECK_ERROR();
        }

        cudaMemcpy(__tmp2d, tmp2D, sizeof(T*)*sz1, cudaMemcpyHostToDevice);
        CUDA_CHECK_ERROR();
        delete [] tmp2D;

        return __tmp2d;
    }


    template<typename T>
    void dealloc2DCudaArray(T**& __array2D, unsigned int sz1)
    {
        // Fetch the device row pointers back to the host so each row can be freed.
        T** tmp2D = new T*[sz1];
        cudaMemcpy(tmp2D, __array2D, sizeof(T*)*sz1, cudaMemcpyDeviceToHost);
        CUDA_CHECK_ERROR();

        for (unsigned int i = 0; i < sz1; i++)
        {
            cudaFree(tmp2D[i]);
            CUDA_CHECK_ERROR();
        }

        delete [] tmp2D;

        cudaFree(__array2D);
        CUDA_CHECK_ERROR();
    }


    template<typename T>
    void copyArrayToDeviceAsync(const T* in, T* __deviceptr, unsigned int numElems, const cudaStream_t stream)
    {
        cudaMemcpyAsync(__deviceptr, in, sizeof(T)*numElems, cudaMemcpyHostToDevice, stream);
        CUDA_CHECK_ERROR();
    }


    template<typename T>
    void copyArrayToHostAsync(T* host_ptr, const T* __deviceptr, unsigned int numElems, const cudaStream_t stream)
    {
        cudaMemcpyAsync(host_ptr, __deviceptr, sizeof(T)*numElems, cudaMemcpyDeviceToHost, stream);
        CUDA_CHECK_ERROR();
    }


    template<typename T>
    void copyArrayToHost(T* host_ptr, const T* __device_ptr, unsigned int numElems)
    {
        cudaMemcpy(host_ptr, __device_ptr, sizeof(T)*numElems, cudaMemcpyDeviceToHost);
        CUDA_CHECK_ERROR();
    }


    template<typename T>
    void copy2DArrayToHost(T** host_ptr, const T** __device_ptr, unsigned int sz1, unsigned int sz2)
    {
        // Fetch the device row pointers, then copy each row into the host buffer.
        T** tmp2D = new T*[sz1];
        cudaMemcpy(tmp2D, __device_ptr, sizeof(T*)*sz1, cudaMemcpyDeviceToHost);
        CUDA_CHECK_ERROR();

        for (unsigned int i = 0; i < sz1; i++)
        {
            cudaMemcpy(host_ptr[i], tmp2D[i], sizeof(T)*sz2, cudaMemcpyDeviceToHost);
            CUDA_CHECK_ERROR();
        }

        delete [] tmp2D;
    }

}


#endif //SFCSORTBENCH_CUDAUTILS_H
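
As a usage illustration, here is a minimal host-side sketch of the 2D allocation, copy, and deallocation helpers above. This is a sketch only: the sizes and the `data` buffer are hypothetical, and it assumes a Dendro build where cudaUtils.h and its dependencies are on the include path.

#include "cudaUtils.h"

int main()
{
    const unsigned int sz1 = 4, sz2 = 8;

    // Hypothetical host data: sz1 rows of sz2 doubles each.
    double** data = new double*[sz1];
    for (unsigned int i = 0; i < sz1; i++)
    {
        data[i] = new double[sz2];
        for (unsigned int j = 0; j < sz2; j++)
            data[i][j] = i + 0.1 * j;
    }

    // Allocate a 2D device array and copy the host data into it.
    double** d_data = cuda::alloc2DCudaArray((const double**)data, sz1, sz2);

    // ... launch kernels operating on d_data ...

    // Copy the (possibly modified) data back and free the device memory.
    cuda::copy2DArrayToHost(data, (const double**)d_data, sz1, sz2);
    cuda::dealloc2DCudaArray(d_data, sz1);

    for (unsigned int i = 0; i < sz1; i++)
        delete[] data[i];
    delete[] data;

    return 0;
}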
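
The asynchronous helpers follow the same pattern with an explicit CUDA stream. A hedged sketch (the buffer size is a placeholder; pinned host memory is used so the copies can actually overlap with compute):

#include "cudaUtils.h"

int main()
{
    const unsigned int n = 1024;

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Pinned host buffer: required for truly asynchronous transfers.
    double* h_buf;
    cudaMallocHost(&h_buf, sizeof(double) * n);
    for (unsigned int i = 0; i < n; i++)
        h_buf[i] = (double) i;

    // Device buffer allocated through the 1D helper.
    double* d_buf = cuda::alloc1DCudaArray<double>(n);

    cuda::copyArrayToDeviceAsync(h_buf, d_buf, n, stream);
    // ... enqueue kernels on `stream` that read/write d_buf ...
    cuda::copyArrayToHostAsync(h_buf, d_buf, n, stream);
    cudaStreamSynchronize(stream);

    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    cudaStreamDestroy(stream);
    return 0;
}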