milupHPC documentation
  • src
  • cuda_utils
cuda_utilities.cu
Go to the documentation of this file.
1#include "../../include/cuda_utils/cuda_utilities.cuh"
2#include "../../include/cuda_utils/cuda_launcher.cuh"
3
4// see:
5// * [CUDA atomicAdd for doubles definition error](https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error)
6// * [Why does atomicAdd not work with doubles as input?](https://forums.developer.nvidia.com/t/why-does-atomicadd-not-work-with-doubles-as-input/56429)
7#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
8#else
9__device__ double atomicAdd(double* address, double val) {
10 unsigned long long int* address_as_ull = (unsigned long long int*)address;
11 unsigned long long int old = *address_as_ull, assumed;
12 do {
13 assumed = old;
14 old = atomicCAS(address_as_ull, assumed,
15 __double_as_longlong(val + __longlong_as_double(assumed)));
16 } while (assumed != old);
17 return __longlong_as_double(old);
18}
19#endif
20
21void gpuAssert(cudaError_t code, const char *file, int line, bool abort)
22{
23 if (code != cudaSuccess)
24 {
25 fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
26 if (abort) getchar();
27 }
28}
29
30void checkCudaCall(cudaError_t command, const char * commandName, const char * fileName, int line)
31{
32 if (command != cudaSuccess)
33 {
34 fprintf(stderr, "Error: CUDA result \"%s\" for call \"%s\" in file \"%s\" at line %d. Terminating...\n",
35 cudaGetErrorString(command), commandName, fileName, line);
36 exit(0);
37 }
38}
39
40namespace CudaUtils {
41
42 namespace Kernel {
43
44 __global__ void collectValues(integer *indices, real *entries, real *collector, integer count) {
45
46 integer index = threadIdx.x + blockIdx.x * blockDim.x;
47 integer stride = blockDim.x * gridDim.x;
48 integer offset = 0;
49
50 while ((index + offset) < count) {
51 collector[index + offset] = entries[indices[index + offset]];
52 offset += stride;
53 }
54 }
55
56 __global__ void checkValues(integer *indices, real *entry1, real *entry2, real *entry3,
57 integer count) {
58
59 integer index = threadIdx.x + blockIdx.x * blockDim.x;
60 integer stride = blockDim.x * gridDim.x;
61 integer offset = 0;
62
63 while ((index + offset) < count) {
64 for (int i=0; i<count; i++) {
65 if (i != index + offset) {
66 if (entry1[indices[i]] == entry1[indices[index + offset]] && entry2[indices[i]] == entry2[indices[index + offset]]) {
67 printf("Same entry for %i vs %i | %i vs %i!\n", i, index + offset, indices[i], indices[index + offset]);
68 }
69 }
70 }
71 offset += stride;
72 }
73 }
74
75 template<typename T>
76 __global__ void findDuplicates(T *array, integer *duplicateCounter, int length) {
77
78 integer index = threadIdx.x + blockIdx.x * blockDim.x;
79 integer stride = blockDim.x * gridDim.x;
80 integer offset = 0;
81
82 while ((index + offset) < length) {
83 for (int i=0; i<length; i++) {
84 if (i != (index + offset)) {
85 if (array[index + offset] == array[i]) {
86 atomicAdd(duplicateCounter, 1);
87 }
88 }
89 }
90 offset += stride;
91 }
92 }
93
94 template<typename T>
95 __global__ void findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length) {
96 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
97 integer stride = blockDim.x * gridDim.x;
98 integer offset = 0;
99
100 while ((bodyIndex + offset) < length) {
101
102 for (int i=0; i<length; i++) {
103 if (i != (bodyIndex + offset)) {
104 if (array1[bodyIndex + offset] == array1[i] && array2[bodyIndex + offset] == array2[i]) {
105 atomicAdd(duplicateCounter, 1);
106 //printf("duplicate! (%i vs. %i) (x = %f, y = %f)\n", i, bodyIndex + offset, array1[i], array2[i]);
107 }
108 }
109 }
110
111 offset += stride;
112 }
113 }
114
115 template<typename T>
116 __global__ void findDuplicateEntries(T *array1, T *array2, T *array3, integer *duplicateCounter, int length) {
117 integer bodyIndex = threadIdx.x + blockIdx.x * blockDim.x;
118 integer stride = blockDim.x * gridDim.x;
119 integer offset = 0;
120
121 while ((bodyIndex + offset) < length) {
122
123 for (int i=0; i<length; i++) {
124 if (i != (bodyIndex + offset)) {
125 if (array1[bodyIndex + offset] == array1[i] && array2[bodyIndex + offset] == array2[i] &&
126 array3[bodyIndex + offset] == array3[i]) {
127
128 atomicAdd(duplicateCounter, 1);
129
130 //printf("duplicate! (%i vs. %i) (x = %f, y = %f)\n", i, bodyIndex + offset, array1[i], array2[i]);
131 }
132 }
133 }
134
135 offset += stride;
136 }
137 }
138
139 template<typename T, typename U>
140 __global__ void findDuplicates(T *array, U *entry1, U *entry2, integer *duplicateCounter, int length) {
141 integer index = threadIdx.x + blockIdx.x * blockDim.x;
142 integer stride = blockDim.x * gridDim.x;
143 integer offset = 0;
144
145 while ((index + offset) < length) {
146 if ((index + offset) % 100 == 0) {
147 //printf("array[%i] = %i\n", index+offset, array[index + offset]);
148 }
149 for (int i=0; i<length; i++) {
150 if (i != (index + offset)) {
151 if (array[index + offset] == array[i] || (entry1[array[index + offset]] == entry1[array[i]] && entry2[array[index + offset]] == entry2[array[i]])) {
152 //if (array[index + offset] == array[i]) { //|| (entry1[index + offset] = entry1[i] && entry2[index + offset] == entry2[i])) {
153 printf("Found duplicate!\n");
154 atomicAdd(duplicateCounter, 1);
155 }
156 }
157 }
158 offset += stride;
159 }
160 }
161
162 template<typename T>
163 __global__ void markDuplicates(T *array, integer *duplicateCounter, int length) {
164
165 integer index = threadIdx.x + blockIdx.x * blockDim.x;
166 integer stride = blockDim.x * gridDim.x;
167 integer offset = 0;
168 integer maxIndex;
169
170 //remark: check only x, but in principle check all
171 while ((index + offset) < length) {
172 if (array[index + offset] != -1) {
173 for (integer i = 0; i < length; i++) {
174 if (i != (index + offset)) {
175 if (array[i] != -1 && array[index + offset] == array[i]) {
176
177 printf("DUPLICATE: %i vs %i (%i vs. %i)\n",
178 array[index + offset], array[i], index + offset, i);
179
180 maxIndex = max(index + offset, i);
181 // mark larger index with -1 (thus a duplicate)
182 array[maxIndex] = -1;
183 atomicAdd(duplicateCounter, 1);
184 }
185 }
186
187 }
188 }
189 offset += stride;
190 }
191 }
192
193 /*template<typename T, typename U>
194 __global__ void markDuplicates(T *array, U *entry1, U *entry2, integer *duplicateCounter, int length) {
195
196 integer index = threadIdx.x + blockIdx.x * blockDim.x;
197 integer stride = blockDim.x * gridDim.x;
198 integer offset = 0;
199 integer maxIndex;
200
201 //remark: check only x, but in principle check all
202 while ((index + offset) < length) {
203 if (array[index + offset] != -1) {
204 for (integer i = 0; i < length; i++) {
205 if (i != (index + offset)) {
206 if (array[i] != -1 && (array[index + offset] == array[i] || (entry1[array[i]] == entry1[array[index + offset]] &&
207 entry2[array[i]] == entry2[array[index + offset]]))) {
208
209 if (true) { //array[index + offset] == array[i]) {
210 printf("DUPLICATE: %i vs %i | (%f, %f) vs (%f, %f):\n",
211 array[index + offset], array[i],
212 entry1[array[index + offset]], entry2[array[index + offset]],
213 entry1[array[i]], entry2[array[i]]);
214
215 }
216
217 maxIndex = max(index + offset, i);
218 // mark larger index with -1 (thus a duplicate)
219 array[maxIndex] = -1;
220 atomicAdd(duplicateCounter, 1);
221 }
222 }
223
224 }
225 }
226 offset += stride;
227 }
228 }*/
229
230 template<typename T, typename U>
231 __global__ void markDuplicates(T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length) {
232
233 integer index = threadIdx.x + blockIdx.x * blockDim.x;
234 integer stride = blockDim.x * gridDim.x;
235 integer offset = 0;
236 integer maxIndex;
237
238 bool isChild;
239 //remark: check only x, but in principle check all
240 while ((index + offset) < length) {
241 if (array[index + offset] != -1) {
242 for (integer i = 0; i < length; i++) {
243 if (i != (index + offset)) {
244 if (array[i] != -1 && (array[index + offset] == array[i] || (entry1[array[i]] == entry1[array[index + offset]] &&
245 entry2[array[i]] == entry2[array[index + offset]] &&
246 entry3[array[i]] == entry3[array[index + offset]]))) {
247 isChild = false;
248
249 if (false/*array[index + offset] == array[i]*/) {
250 printf("DUPLICATE: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
251 array[index + offset], array[i],
252 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
253 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
254
255 for (int k=0; k<POW_DIM; k++) {
256 if (child[POW_DIM*array[index + offset] + k] == array[i]) {
257 printf("isChild: index = %i: child %i == i = %i\n", array[index + offset],
258 k, array[i]);
259 isChild = true;
260 }
261
262 if (child[8*array[i] + k] == array[index + offset]) {
263 printf("isChild: index = %i: child %i == index = %i\n", array[i],
264 k, array[index + offset]);
265 isChild = true;
266 }
267 }
268
269 if (!isChild) {
270 printf("isChild: Duplicate NOT a child: %i vs %i | (%f, %f, %f) vs (%f, %f, %f):\n",
271 array[index + offset], array[i],
272 entry1[array[index + offset]], entry2[array[index + offset]], entry3[array[index + offset]],
273 entry1[array[i]], entry2[array[i]], entry3[array[i]]);
274 //for (int k=0; k<POW_DIM; k++) {
275 // printf("isChild: Duplicate NOT a child: children index = %i: child %i == i = %i\n", array[index + offset],
276 // k, array[i]);
277 //
278 // printf("isChild: Duplicate NOT a child: children index = %i: child %i == index = %i\n", array[i],
279 // k, array[index + offset]);
280 //
281 //}
282 }
283
284 }
285
286 maxIndex = max(index + offset, i);
287 // mark larger index with -1 (thus a duplicate)
288 array[maxIndex] = -1;
289 atomicAdd(duplicateCounter, 1);
290 }
291 }
292
293 }
294 }
295 offset += stride;
296 }
297 }
298
299 template<typename T>
300 __global__ void removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length) {
301
302 int index = threadIdx.x + blockIdx.x * blockDim.x;
303 int stride = blockDim.x * gridDim.x;
304 int offset = 0;
305
306 int indexToInsert;
307
308 while ((index + offset) < length) {
309
310 if (array[index + offset] != -1) {
311 indexToInsert = atomicAdd(duplicateCounter, 1);
312 removedArray[indexToInsert] = array[index + offset];
313 }
314
315 offset += stride;
316 }
317 }
318
319
320 real Launch::collectValues(integer *indices, real *entries, real *collector, integer count) {
321 ExecutionPolicy executionPolicy;
322 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::collectValues, indices, entries,
323 collector, count);
324 }
325
326 real Launch::checkValues(integer *indices, real *entry1, real *entry2, real *entry3, integer count) {
327 ExecutionPolicy executionPolicy;
328 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::checkValues, indices, entry1, entry2, entry3,
329 count);
330 }
331
332 template<typename T>
333 real Launch::findDuplicates(T *array, integer *duplicateCounter, int length) {
334 ExecutionPolicy executionPolicy;
335 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::findDuplicates<T>, array,
336 duplicateCounter, length);
337 }
338 // explicit instantiation for type "integer"
339 template real Launch::findDuplicates<integer>(integer *array, integer *duplicateCounter, int length);
340 // explicit instantiation for type "real"
341 template real Launch::findDuplicates<real>(real *array, integer *duplicateCounter, int length);
342
343 template<typename T>
344 real Launch::findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length) {
345 ExecutionPolicy executionPolicy;
346 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::findDuplicateEntries<T>, array1,
347 array2, duplicateCounter, length);
348 }
349 template real Launch::findDuplicateEntries<real>(real *array1, real *array2, integer *duplicateCounter, int length);
350
351 template<typename T>
352 real Launch::findDuplicateEntries(T *array1, T *array2, T *array3, integer *duplicateCounter, int length) {
353 ExecutionPolicy executionPolicy;
354 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::findDuplicateEntries<T>, array1,
355 array2, array3, duplicateCounter, length);
356 }
357 template real Launch::findDuplicateEntries<real>(real *array1, real *array2, real *array3, integer *duplicateCounter, int length);
358
359 template<typename T, typename U>
360 real Launch::findDuplicates(T *array, U *entry1, U *entry2, integer *duplicateCounter, int length) {
361 ExecutionPolicy executionPolicy;
362 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::findDuplicates<T, U>, array, entry1, entry2,
363 duplicateCounter, length);
364 }
365 template real Launch::findDuplicates<integer, real>(integer *array, real *entry1, real *entry2,
366 integer *duplicateCounter, int length);
367
368 template<typename T>
369 real Launch::markDuplicates(T *array, integer *duplicateCounter, int length) {
370 ExecutionPolicy executionPolicy;
371 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::markDuplicates<T>, array,
372 duplicateCounter, length);
373 }
374 // explicit instantiation for type "integer"
375 template real Launch::markDuplicates<integer>(integer *array, integer *duplicateCounter, int length);
376 // explicit instantiation for type "real"
377 template real Launch::markDuplicates<real>(real *array, integer *duplicateCounter, int length);
378
379 /*template<typename T, typename U>
380 real Launch::markDuplicates(T *array, U *entry1, U *entry2, integer *duplicateCounter, int length) {
381 ExecutionPolicy executionPolicy;
382 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::markDuplicates<T, U>, array, entry1, entry2,
383 duplicateCounter, length);
384 }
385 template real Launch::markDuplicates<integer, real>(integer *array, real *entry1, real *entry2, integer *duplicateCounter, int length);
386 */
387 template<typename T, typename U>
388 real Launch::markDuplicates(T *array, U *entry1, U *entry2, U *entry3, integer *duplicateCounter, integer *child, int length) {
389 ExecutionPolicy executionPolicy;
390 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::markDuplicates<T, U>, array, entry1, entry2,
391 entry3, duplicateCounter, child, length);
392 }
393 template real Launch::markDuplicates<integer, real>(integer *array, real *entry1, real *entry2, real *entry3, integer *duplicateCounter, integer *child, int length);
394
395
396 template<typename T>
397 real Launch::removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length) {
398 ExecutionPolicy executionPolicy;
399 return cuda::launch(true, executionPolicy, ::CudaUtils::Kernel::removeDuplicates<T>, array, removedArray,
400 duplicateCounter, length);
401 }
402 // explicit instantiation for type "integer"
403 template real Launch::removeDuplicates<integer>(integer *array, integer *removedArray,
404 integer *duplicateCounter, int length);
405 // explicit instantiation for type "real"
406 template real Launch::removeDuplicates<real>(real *array, real *removedArray,
407 integer *duplicateCounter, int length);
408 }
409
410}
411
412namespace cuda {
413 namespace math {
414 __device__ real min(real a, real b) {
415#if SINGLE_PRECISION
416 return fminf(a, b);
417#else
418 return fmin(a, b);
419#endif
420 }
421
422 __device__ real min(real a, real b, real c) {
423 real temp = min(a, b);
424#if SINGLE_PRECISION
425 return fminf(temp, c);
426#else
427 return fmin(temp, c);
428#endif
429 }
430
431 __device__ real max(real a, real b) {
432#if SINGLE_PRECISION
433 return fmaxf(a, b);
434#else
435 return fmax(a, b);
436#endif
437 }
438
439 __device__ real max(real a, real b, real c) {
440 real temp = max(a, b);
441#if SINGLE_PRECISION
442 return fmaxf(temp, c);
443#else
444 return fmax(temp, c);
445#endif
446 }
447
448 __device__ real abs(real a) {
449#if SINGLE_PRECISION
450 return fabsf(a);
451#else
452 return fabs(a);
453#endif
454 }
455
456 __device__ real sqrt(real a) {
457#if SINGLE_PRECISION
458 return sqrtf(a);
459#else
460 return ::sqrt(a);
461#endif
462 }
463
464 __device__ real rsqrt(real a) {
465#if SINGLE_PRECISION
466 return rsqrtf(a);
467#else
468 return ::rsqrt(a);
469#endif
470 }
471 }
472}
ExecutionPolicy
Execution policy/instruction for CUDA kernel execution.
Definition: cuda_launcher.cuh:33
gpuAssert
void gpuAssert(cudaError_t code, const char *file, int line, bool abort)
Check CUDA error codes.
Definition: cuda_utilities.cu:21
checkCudaCall
void checkCudaCall(cudaError_t command, const char *commandName, const char *fileName, int line)
Check CUDA call.
Definition: cuda_utilities.cu:30
CudaUtils::Kernel::Launch::findDuplicateEntries
real findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:344
CudaUtils::Kernel::Launch::markDuplicates
real markDuplicates(T *array, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:369
CudaUtils::Kernel::Launch::findDuplicates
real findDuplicates(T *array, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:333
CudaUtils::Kernel::Launch::collectValues
real collectValues(integer *indices, real *entries, real *collector, integer count)
Definition: cuda_utilities.cu:320
CudaUtils::Kernel::Launch::removeDuplicates
real removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:397
CudaUtils::Kernel::Launch::checkValues
real checkValues(integer *indices, real *entry1, real *entry2, real *entry3, integer count)
Definition: cuda_utilities.cu:326
CudaUtils::Kernel::removeDuplicates
__global__ void removeDuplicates(T *array, T *removedArray, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:300
CudaUtils::Kernel::collectValues
__global__ void collectValues(integer *indices, real *entries, real *collector, integer count)
Definition: cuda_utilities.cu:44
CudaUtils::Kernel::checkValues
__global__ void checkValues(integer *indices, real *entry1, real *entry2, real *entry3, integer count)
Definition: cuda_utilities.cu:56
CudaUtils::Kernel::findDuplicateEntries
__global__ void findDuplicateEntries(T *array1, T *array2, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:95
CudaUtils::Kernel::findDuplicates
__global__ void findDuplicates(T *array, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:76
CudaUtils::Kernel::markDuplicates
__global__ void markDuplicates(T *array, integer *duplicateCounter, int length)
Definition: cuda_utilities.cu:163
CudaUtils
Definition: cuda_utilities.cuh:100
Kernel
Definition: device_rhs.cuh:7
cuda::math::min
__device__ real min(real a, real b)
Minimum value out of two floating point values.
Definition: cuda_utilities.cu:414
cuda::math::rsqrt
__device__ real rsqrt(real a)
Inverse square root of a floating point value.
Definition: cuda_utilities.cu:464
cuda::math::sqrt
__device__ real sqrt(real a)
Square root of a floating point value.
Definition: cuda_utilities.cu:456
cuda::math::abs
__device__ real abs(real a)
Absolute value of a floating point value.
Definition: cuda_utilities.cu:448
cuda::math::max
__device__ real max(real a, real b)
Maximum value out of two floating point values.
Definition: cuda_utilities.cu:431
cuda
Definition: cuda_launcher.cuh:101
cuda::launch
real launch(bool timeKernel, const ExecutionPolicy &policy, void(*f)(Arguments...), Arguments... args)
CUDA execution wrapper function.
Definition: cuda_launcher.cuh:114
real
double real
Definition: parameter.h:15
integer
int integer
Definition: parameter.h:17
POW_DIM
#define POW_DIM
Definition: parameter.h:40

milupHPC - src/cuda_utils/cuda_utilities.cu Source File
Generated on Wed Aug 31 2022 12:16:52 by Doxygen 1.9.3